In [1]:
import pandas as pd
from sqlalchemy import create_engine

# Step 1: Extract


In [2]:
# Location of the Excel workbook that every read in this notebook uses.
file_path = 'data/all_data_M_2023.xlsx'

# Read the 'All May 2023 data' sheet into the main DataFrame.
df = pd.read_excel(io=file_path, sheet_name='All May 2023 data')

In [3]:
# Read the 'Field Descriptions' sheet as-is (default header handling, no rows skipped).
# NOTE(review): the next cell reads the same sheet again with skiprows=9 and
# reassigns descriptions_df, so this read looks redundant - confirm before removing.
descriptions_df = pd.read_excel(io=file_path, sheet_name='Field Descriptions')

In [4]:
# Read the 'Field Descriptions' sheet again, skipping its first 9 rows, then
# label the three resulting columns and discard the last one.
# NOTE(review): skiprows=9 presumably steps over introductory text above the
# field table - confirm against the workbook.
descriptions_df = (
    pd.read_excel(file_path, sheet_name='Field Descriptions', skiprows=9)
    .set_axis(['Field', 'Field Description', 'Unused'], axis=1)
    .drop(columns='Unused')
)

# Preview the first rows to check the resulting structure
descriptions_df.head()

Unnamed: 0,Field,Field Description
0,area,"U.S. (99), state FIPS code, Metropolitan Stati..."
1,area_title,Area name
2,area_type,Area type: 1= U.S.; 2= State; 3= U.S. Territor...
3,prim_state,"The primary state for the given area. ""US"" is ..."
4,naics,North American Industry Classification System ...


In [5]:
# Build a lookup of field name -> field description from the two columns.
# If a field name appears more than once, the last description wins.
descriptions_dict = dict(
    descriptions_df[['Field', 'Field Description']].itertuples(index=False, name=None)
)


In [13]:
# Source columns retained for the cleaned dataset.
columns_to_keep = [
    'PRIM_STATE', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'O_GROUP',
    'TOT_EMP', 'A_MEDIAN', 'H_MEAN', 'A_MEAN',
    'H_PCT10', 'H_PCT25', 'H_MEDIAN',
    'A_PCT10', 'A_PCT25', 'A_PCT75', 'A_PCT90',
]

# Pair each retained column with its description. The lookup uses the
# lower-cased column name because the description keys are lower case
# (e.g. 'area_title'); a column with no matching description keeps its name.
renamed_columns = dict(
    (name, descriptions_dict.get(name.lower(), name))
    for name in columns_to_keep
)


# Step 2: Transform the data

In [14]:
# Select the retained columns, drop every row that has a missing value in any
# of them, and relabel the columns using the renamed_columns mapping.
df_cleaned = (
    df.loc[:, columns_to_keep]
    .dropna(how='any')
    .rename(columns=renamed_columns)
)
df_cleaned.head()

Unnamed: 0,"The primary state for the given area. ""US"" is used for the national estimates.",North American Industry Classification System (NAICS) title for the given industry,The 6-digit Standard Occupational Classification (SOC) code or OEWS-specific code for the occupation,SOC title or OEWS-specific title for the occupation,"SOC occupation level. For most occupations, this field indicates the standard SOC major, minor, broad, and detailed levels, in addition to all-occupations totals. For occupations that OEWS no longer publishes at the SOC detailed level, the “detailed” designation indicates the most detailed data available: either a standard SOC broad occupation or an OEWS-specific combination of detailed occupations. Occupations that OEWS has aggregated to the SOC broad occupation level will appear in the file twice, once with the “broad” and once with the “detailed” designation.",Estimated total employment rounded to the nearest 10 (excludes self-employed).,Annual median wage (or the 50th percentile),Mean hourly wage,Mean annual wage,Hourly 10th percentile wage,Hourly 25th percentile wage,Hourly median wage (or the 50th percentile),Annual 10th percentile wage,Annual 25th percentile wage,Annual 75th percentile wage,Annual 90th percentile wage
0,US,Cross-industry,00-0000,All Occupations,total,151853870,48060,31.48,65470,13.97,17.14,23.11,29050,35660,76980,121470
1,US,Cross-industry,11-0000,Management Occupations,major,10495770,116880,66.23,137750,26.23,37.66,56.19,54550,78330,169090,231620
2,US,Cross-industry,11-1000,Top Executives,minor,3751510,103460,65.43,136100,22.31,31.81,49.74,46400,66170,165500,#
3,US,Cross-industry,11-1010,Chief Executives,broad,211230,206680,124.47,258900,38.46,62.9,99.37,80000,130840,#,#
4,US,Cross-industry,11-1011,Chief Executives,detailed,211230,206680,124.47,258900,38.46,62.9,99.37,80000,130840,#,#


In [17]:
# Single source of truth: source column name -> simplified display name.
# Writing each pair on one line keeps the two names from drifting apart; two
# parallel lists joined with zip() would silently truncate to the shorter list
# if an entry were added to only one of them.
renamed_columns = {
    'PRIM_STATE': 'Primary State',
    'NAICS_TITLE': 'Industry Title',
    'OCC_CODE': 'Occupation Code',
    'OCC_TITLE': 'Occupation Title',
    'O_GROUP': 'Occupation Group',
    'TOT_EMP': 'Employment Estimate',
    'A_MEDIAN': 'Annual Median Wage',
    'H_MEAN': 'Mean Hourly Wage',
    'A_MEAN': 'Mean Annual Wage',
    'H_PCT10': 'Hourly 10th Percentile Wage',
    'H_PCT25': 'Hourly 25th Percentile Wage',
    'H_MEDIAN': 'Hourly Median Wage',
    'A_PCT10': 'Annual 10th Percentile Wage',
    'A_PCT25': 'Annual 25th Percentile Wage',
    'A_PCT75': 'Annual 75th Percentile Wage',
    'A_PCT90': 'Annual 90th Percentile Wage',
}

# Both lists are derived from the mapping (dicts preserve insertion order), so
# code that uses these names keeps seeing the same values in the same order.
columns_to_keep = list(renamed_columns)
simplified_column_names = list(renamed_columns.values())

# Select the retained columns, drop every row that has a missing value in any
# of them, and relabel the columns with the simplified names.
# NOTE(review): the preview below shows '#' in some wage columns, so such
# placeholder strings are not treated as missing by dropna() - confirm whether
# they need separate handling before the data is used numerically.
df_cleaned = (
    df.loc[:, columns_to_keep]
    .dropna(how='any')
    .rename(columns=renamed_columns)
)

# Preview the first rows to check the new column names
df_cleaned.head()

Unnamed: 0,Primary State,Industry Title,Occupation Code,Occupation Title,Occupation Group,Employment Estimate,Annual Median Wage,Mean Hourly Wage,Mean Annual Wage,Hourly 10th Percentile Wage,Hourly 25th Percentile Wage,Hourly Median Wage,Annual 10th Percentile Wage,Annual 25th Percentile Wage,Annual 75th Percentile Wage,Annual 90th Percentile Wage
0,US,Cross-industry,00-0000,All Occupations,total,151853870,48060,31.48,65470,13.97,17.14,23.11,29050,35660,76980,121470
1,US,Cross-industry,11-0000,Management Occupations,major,10495770,116880,66.23,137750,26.23,37.66,56.19,54550,78330,169090,231620
2,US,Cross-industry,11-1000,Top Executives,minor,3751510,103460,65.43,136100,22.31,31.81,49.74,46400,66170,165500,#
3,US,Cross-industry,11-1010,Chief Executives,broad,211230,206680,124.47,258900,38.46,62.9,99.37,80000,130840,#,#
4,US,Cross-industry,11-1011,Chief Executives,detailed,211230,206680,124.47,258900,38.46,62.9,99.37,80000,130840,#,#


# Step 3: Load the data

In [18]:
# Build a SQLAlchemy engine for the SQLite database file EmploymentWageData.db
# (swap the URL to target a different database type).
# NOTE(review): no visible cell uses `engine` afterwards, so nothing is written
# to this database here - confirm whether a load step is missing.
db_url = 'sqlite:///EmploymentWageData.db'
engine = create_engine(db_url)

In [19]:
# Write the cleaned DataFrame to a CSV file, leaving out the row index.
df_cleaned.to_csv(path_or_buf='data/EmploymentWageData.csv', index=False)