## BLS Job OEWS Data

https://www.bls.gov/oes/tables.htm

In [85]:
import pandas as pd
import numpy as np
import janitor
import plotly.express as px

## Import Raw Data

In [65]:
raw_df = pd.read_excel('state_data.xlsx').clean_names()
raw_df.head()

Unnamed: 0,area,area_title,area_type,prim_state,naics,naics_title,i_group,own_code,occ_code,occ_title,...,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly
0,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,...,21.07,30.82,47.51,23520,30660,43830,64110,98810,,
1,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,...,48.39,68.5,98.03,51100,72870,100640,142480,203900,,
2,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,...,79.04,106.69,#,104950,130950,164400,221910,#,,
3,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1021,General and Operations Managers,...,51.12,78.26,#,50410,74720,106330,162780,#,,
4,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1031,Legislators,...,*,*,*,18270,20950,26990,41760,63900,True,


## Filter For Job Roles

In [66]:
data_roles = raw_df[raw_df['occ_title'].str.contains('data|analyst', case=False, na=False)]['occ_title'].unique()
data_roles

array(['Management Analysts',
       'Market Research Analysts and Marketing Specialists',
       'Budget Analysts', 'Credit Analysts',
       'Financial and Investment Analysts', 'Computer Systems Analysts',
       'Information Security Analysts', 'Database Administrators',
       'Database Architects',
       'Software Quality Assurance Analysts and Testers',
       'Operations Research Analysts', 'Data Scientists',
       'News Analysts, Reporters, and Journalists', 'Data Entry Keyers'],
      dtype=object)

In [67]:
data_roles = data_roles[data_roles != 'News Analysts, Reporters, and Journalists']
data_roles

array(['Management Analysts',
       'Market Research Analysts and Marketing Specialists',
       'Budget Analysts', 'Credit Analysts',
       'Financial and Investment Analysts', 'Computer Systems Analysts',
       'Information Security Analysts', 'Database Administrators',
       'Database Architects',
       'Software Quality Assurance Analysts and Testers',
       'Operations Research Analysts', 'Data Scientists',
       'Data Entry Keyers'], dtype=object)

In [None]:
data_roles_df = raw_df[raw_df['occ_title'].isin(data_roles)]
data_roles_df = data_roles_df[['area_title', 'prim_state', 'occ_title', 'tot_emp', 'h_mean', 'a_mean', 'h_median', 'a_median']].reset_index()
data_roles_df.head()

Unnamed: 0,area_title,prim_state,occ_title,tot_emp,h_mean,a_mean,h_median,a_median
46,Alabama,AL,Management Analysts,4980,53.5,111280,49.27,102480
51,Alabama,AL,Market Research Analysts and Marketing Special...,8780,32.13,66840,26.63,55390
55,Alabama,AL,Budget Analysts,940,47.93,99700,47.16,98080
56,Alabama,AL,Credit Analysts,310,49.66,103280,48.58,101050
57,Alabama,AL,Financial and Investment Analysts,2780,51.86,107880,45.51,94660


## Clean Data

In [69]:
mask = data_roles_df.astype(str).apply(lambda col: col.str.fullmatch(r'\*{2,}')).any(axis=1)

In [70]:
data_roles_df[mask]

Unnamed: 0,area_title,prim_state,occ_title,tot_emp,h_mean,a_mean,h_median,a_median
809,Alaska,AK,Software Quality Assurance Analysts and Testers,**,49.17,102280,49.6,103160
5119,Delaware,DE,Database Architects,**,72.14,150060,70.4,146430
13548,Maine,ME,Financial and Investment Analysts,**,45.57,94780,44.16,91840
18777,Montana,MT,Information Security Analysts,**,47.86,99560,41.87,87100
29926,South Dakota,SD,Data Entry Keyers,**,17.98,37390,18.34,38140
36123,Wyoming,WY,Information Security Analysts,**,58.93,122570,58.31,121290
36134,Wyoming,WY,Data Scientists,**,41.1,85480,46.08,95840


In [71]:
data_roles_cleaned_df = data_roles_df.replace(r'^\*+$', np.nan, regex=True)

  data_roles_cleaned_df = data_roles_df.replace(r'^\*+$', np.nan, regex=True)


In [72]:
mask = data_roles_cleaned_df.astype(str).apply(lambda col: col.str.fullmatch(r'\*{2,}')).any(axis=1)
data_roles_cleaned_df[mask]

Unnamed: 0,area_title,prim_state,occ_title,tot_emp,h_mean,a_mean,h_median,a_median


In [73]:
## Transform Data Types

In [75]:
data_roles_cleaned_df.dtypes

area_title     object
prim_state     object
occ_title      object
tot_emp       float64
h_mean        float64
a_mean        float64
h_median      float64
a_median      float64
dtype: object

## Descriptive Stats

In [83]:
data_roles_cleaned_df

Unnamed: 0,area_title,prim_state,occ_title,tot_emp,h_mean,a_mean,h_median,a_median
46,Alabama,AL,Management Analysts,4980.0,53.50,111280.0,49.27,102480.0
51,Alabama,AL,Market Research Analysts and Marketing Special...,8780.0,32.13,66840.0,26.63,55390.0
55,Alabama,AL,Budget Analysts,940.0,47.93,99700.0,47.16,98080.0
56,Alabama,AL,Credit Analysts,310.0,49.66,103280.0,48.58,101050.0
57,Alabama,AL,Financial and Investment Analysts,2780.0,51.86,107880.0,45.51,94660.0
...,...,...,...,...,...,...,...,...
37444,Virgin Islands,VI,Market Research Analysts and Marketing Special...,120.0,23.94,49790.0,24.10,50130.0
37447,Virgin Islands,VI,Budget Analysts,70.0,31.11,64720.0,29.83,62050.0
37448,Virgin Islands,VI,Financial and Investment Analysts,40.0,40.22,83660.0,33.65,69990.0
37450,Virgin Islands,VI,Computer Systems Analysts,50.0,31.50,65530.0,27.24,56650.0


In [82]:
total_emp_df = data_roles_cleaned_df.groupby('occ_title', as_index=False).agg({'tot_emp':'sum'})    # 'area_title', 'prim_state',
total_emp_df

Unnamed: 0,occ_title,tot_emp
0,Budget Analysts,47600.0
1,Computer Systems Analysts,499030.0
2,Credit Analysts,68100.0
3,Data Entry Keyers,137980.0
4,Data Scientists,234350.0
5,Database Administrators,73330.0
6,Database Architects,64430.0
7,Financial and Investment Analysts,342060.0
8,Information Security Analysts,179590.0
9,Management Analysts,897610.0


In [86]:
# --- Choropleth with role dropdown ---
# Build one frame per role for animation OR use dropdown filtering
roles = data_roles_cleaned_df['occ_title'].sort_values().unique().tolist()

fig_map = px.choropleth(
    data_roles_cleaned_df[data_roles_cleaned_df['occ_title'] == roles[0]],
    locations='prim_state',
    locationmode='USA-states',
    color='a_mean',
    color_continuous_scale='Viridis',
    scope='usa',
    hover_name='area_title',
    hover_data={'prim_state': True, 'a_mean': ':.0f'},
    title=f"Average annual wage by state • Role: {roles[0]}"
)

# Add dropdown to switch role
dropdown_buttons = []
for r in roles:
    filtered = data_roles_cleaned_df[data_roles_cleaned_df['occ_title'] == r]
    dropdown_buttons.append({
        'label': r,
        'method': 'restyle',
        'args': [{'z':[filtered['a_mean']],
                  'locations':[filtered['prim_state']],
                  'hovertext':[filtered['area_title']]},
                 [0]]  # trace index 0
    })

fig_map.update_layout(
    updatemenus=[{
        'buttons': dropdown_buttons,
        'direction': 'down',
        'x': 0.02, 'y': 1.05,
        'xanchor': 'left', 'yanchor': 'top',
        'showactive': True
    }],
    coloraxis_colorbar=dict(title="Avg wage ($)")
)

fig_map.show()

In [95]:
import plotly.express as px
import pandas as pd

# --- Define list of valid U.S. state abbreviations ---
us_states = [
    'AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL','IN','IA','KS','KY',
    'LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND',
    'OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY'
]

# --- Filter only U.S. states ---
filtered_df = data_roles_cleaned_df[data_roles_cleaned_df['prim_state'].isin(us_states)].copy()

# --- Create alphabetic state order ---
state_order = sorted(us_states)

# --- Pivot table for median wages ---
pivot = (
    filtered_df
      .pivot_table(index='occ_title', columns='prim_state', values='a_mean', aggfunc='median')
      .sort_index()  # alphabetize roles
)

# --- Align to alphabetic state order ---
pivot = pivot[state_order]

# --- Plot heatmap ---
fig_heat = px.imshow(
    pivot,
    aspect='auto',
    color_continuous_scale='IceFire',
    origin='lower'
)

fig_heat.update_layout(
    title="Median annual wage by role and state",
    xaxis_title="State (alphabetical)",
    yaxis_title="Role (alphabetical)",
    yaxis_autorange='reversed',
    coloraxis_colorbar=dict(title="Median wage ($)")
)

fig_heat.show()


