#### By: Peyman Shahidi
#### Created: Oct 29, 2025
#### Last Edit: Nov 8, 2025

<br>

In [211]:
# Notebook setup: imports and pandas display options.
# NOTE(review): the following cell repeats these imports and options verbatim;
# one of the two copies is redundant.
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Show floats comma separated and with two digits after the decimal: e.g., 1000 shown as 1,000.00
## The fully-qualified option name is used on purpose: abbreviated names are resolved by
## pattern matching and can stop working if pandas adds another option with a similar name.
pd.set_option('display.float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Suppresses all warnings for the rest of the session
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [212]:
# Notebook setup: imports, pandas display options, project paths and run toggles.
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Show floats comma separated and with two digits after the decimal: e.g., 1000 shown as 1,000.00
## The fully-qualified option name is used on purpose: abbreviated names are resolved by
## pattern matching and can stop working if pandas adds another option with a similar name.
pd.set_option('display.float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Suppresses all warnings for the rest of the session
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

# All paths are relative to the notebook's working directory
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
# Modify the output path accordingly
output_data_path = f'{input_data_path}/computed_objects/BLS_ONET_matchedEmpShares'
output_plot_path = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares"

# Toggle: if True, randomly reassign occ_totalEmpShare weights in the merged master_df
# during the merge_industry_employment_shares step. Set to False for default behavior.
# NOTE(review): this is currently True, i.e. NOT the default behavior described above,
# and no random seed is set in this cell -- confirm both are intended.
randomize_occ_weights = True


In [213]:
import os

# Create the output folders for data and plots up front.
# exist_ok=True makes the call a no-op when the folder already exists, so no separate
# existence check is needed (and there is no gap between checking and creating).
for path in [output_data_path, output_plot_path]:
    os.makedirs(path, exist_ok=True)

#### Main Code

In [214]:
# Load the merged ONET_Eloundou_Anthropic_GPT dataset from the computed_objects folder
merged_data = pd.read_csv(
    filepath_or_buffer=f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv"
)

In [215]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

In [216]:
# BLS industry aggregation levels available to the analysis
# (currently only 'sector'; the finer levels are switched off)
bls_sector_levels = [
    'sector',
    # '3-digit', '4-digit', '5-digit', '6-digit',
]

# O*NET/SOC occupation hierarchy: level names and the matching code / title columns,
# listed in the same order in all three lists
onet_levels = [
    'major',
    'minor',
    'broad',
    'detailed',
]
onet_occupation_code_vars = [
    'Major_Group_Code',
    'Minor_Group_Code',
    'Broad_Occupation_Code',
    'Detailed_Occupation_Code',
]
onet_occupation_title_vars = [
    'Major_Group_Title',
    'Minor_Group_Title',
    'Broad_Occupation_Title',
    'Detailed_Occupation_Title',
]

# Employment-share weighting schemes
weight_cols = [
    'occ_totalEmpShare',    # occupation's share of total employment (ignoring sector shares)
    'sectorEmpShare',       # share of total employment held by the occupation's sector
    'occ_sectorEmpShare',   # occupation's share of employment within its sector (sectors weighted equally)
]

# Outcome variables to analyse (other candidates are switched off)
dependent_var_list = [
    'ai_fraction',
    'human_E1_fraction',
    # 'human_aiExposure_fraction',
    # 'gpt4_E1_fraction',
]


In [217]:
# Industry and occupation granularity used for the rest of the notebook.
# my_sector and my_onet_level are interpolated into the BLS employment-share file name below.
# NOTE(review): '4-digit' is currently commented out of bls_sector_levels -- confirm the
# two are meant to disagree.
my_sector, my_onet_level = '4-digit', 'detailed'

# Code / title columns matching my_onet_level (keep these in sync with it by hand)
onet_occupation_code_var, onet_occupation_title_var = (
    'Detailed_Occupation_Code',
    'Detailed_Occupation_Title',
)

In [218]:
# Column names of the occupation-level measures used throughout the analysis
handoff_var = 'task_semantic_dispersion'
ai_outcome_var = 'ai_fraction'
ai_exposure_var = 'human_E1_fraction'
fragmentation_index_var = 'fragmentation_index'

# Create occupation-level data
occupation_analysis = pd.read_csv(f"{input_data_path}/computed_objects/fragmentationIndex/occupation_analysis_with_fragmentationIndex.csv")

# Merge with embedding-based handoff measure.
# Left join: occupations without a dispersion value are kept with NaN in handoff_var.
occupation_task_dispersion = pd.read_csv(f"{input_data_path}/computed_objects/ONET_occupation_task_semantic_dispersion.csv")

occupation_analysis = occupation_analysis.merge(
    occupation_task_dispersion[['O*NET-SOC Code', 'Occupation Title', handoff_var]],
    on=['O*NET-SOC Code', 'Occupation Title'],
    how='left',
)


# Read the cleaned O*NET tasks file, which carries the SOC hierarchy columns
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the occupation identifiers and the SOC hierarchy columns,
# one row per (O*NET-SOC Code, onet_occupation_code_var) pair
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Aggregate to onet_occupation_code_var level: unweighted mean of every measure across
# the O*NET occupations that share the same code/title. groupby drops rows whose
# code/title is missing (e.g. occupations left unmatched by the SOC merge above).
occupation_analysis = (
    occupation_analysis
    .groupby([onet_occupation_code_var, onet_occupation_title_var])
    .agg({measure: 'mean' for measure in [handoff_var, ai_outcome_var, ai_exposure_var, fragmentation_index_var]})
    .reset_index()
)

# Only the last expression of a cell is displayed, so the table is shown once,
# sorted from the highest to the lowest hand-off measure.
occupation_analysis.sort_values(by=handoff_var, ascending=False)

Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,task_semantic_dispersion,ai_fraction,human_E1_fraction,fragmentation_index
311,29-1292,Dental Hygienists,0.69,0.06,0.00,1.00
439,41-9012,Models,0.68,0.00,0.07,1.00
488,43-9031,Desktop Publishers,0.68,0.44,0.06,0.78
703,51-9141,Semiconductor Processing Technicians,0.67,0.00,0.04,1.00
498,45-2041,"Graders and Sorters, Agricultural Products",0.66,0.00,0.00,1.00
...,...,...,...,...,...,...
612,49-9098,"Helpers--Installation, Maintenance, and Repair...",0.42,0.00,0.00,1.00
13,11-3131,Training and Development Managers,0.42,0.33,0.08,1.00
28,11-9161,Emergency Management Directors,0.41,0.04,0.39,1.00
256,27-2031,Dancers,0.40,0.00,0.00,1.00


# TODO: Double-check the weight calculations below

In [219]:
# Add BLS employment shares for all NAICS sectors and create a master dataset
bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/BLS{my_sector}_ONET{my_onet_level}_empShares.csv')

master_df = occupation_analysis.copy()

# 1) sectorEmpShare: each sector's share of the employment summed over all sectors.
# Select TOT_EMP explicitly before summing. (`.sum('TOT_EMP')` would pass the column
# name into the `numeric_only` argument rather than choosing a column.)
sector_weights_df = bls_sector_shares.groupby(['NAICS', 'NAICS_TITLE'])[['TOT_EMP']].sum()

# Convert to a share and change variable name
sector_weights_df['TOT_EMP'] = sector_weights_df['TOT_EMP'] / sector_weights_df['TOT_EMP'].sum()
sector_weights_df = sector_weights_df.rename(columns={'TOT_EMP': 'sectorEmpShare'})

# Merge back sector weights to bls dataset to get sector-by-sector occupation data with sector weights.
# sector_weights_df is indexed by (NAICS, NAICS_TITLE); merging on 'NAICS' matches the left
# column against that index level, and the unused NAICS_TITLE level is dropped from the result.
bls_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE']].merge(sector_weights_df, on='NAICS', how='left')

# Add weight column to master_df (one output row per occupation x sector in which it appears)
master_df = master_df.merge(
    bls_sector_weights_df[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'sectorEmpShare']],
    left_on=onet_occupation_code_var,
    right_on='OCC_CODE',
    how='left',
)
master_df = master_df.drop(columns=['OCC_CODE'])

# 2) occ_sectorEmpShare: occupation's share of employment within its own sector
within_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP']].copy()
within_sector_weights_df['occ_sectorEmpShare'] = (
    within_sector_weights_df['TOT_EMP']
    / within_sector_weights_df.groupby(['NAICS', 'NAICS_TITLE'])['TOT_EMP'].transform('sum')
)

# Add weight column to master_df
master_df = master_df.merge(
    within_sector_weights_df[['NAICS', 'OCC_CODE', 'occ_sectorEmpShare']],
    left_on=['NAICS', onet_occupation_code_var],
    right_on=['NAICS', 'OCC_CODE'],
    how='left',
)
master_df = master_df.drop(columns=['OCC_CODE'])

# Drop rows with null NAICS -- i.e., unmatched occupations in both procedures
master_df = master_df[master_df['NAICS'].notna()]

# Save master dataframe to CSV (sectors ascending, largest within-sector share first)
master_df = master_df.sort_values(by=['NAICS', 'occ_sectorEmpShare'], ascending=[True, False]).reset_index(drop=True)
out_dir = f"{output_data_path}"#/BLS{my_sector}_ONET{my_onet_level}/{dependent_var}/"
os.makedirs(out_dir, exist_ok=True)
master_out = f"{out_dir}/BLS{my_sector}_ONET{my_onet_level}.csv"
master_df.to_csv(master_out, index=False)

In [220]:
# my_sector = 'sector'  # Choose from bls_sector_levels
# my_onet_level = 'detailed'  # Choose from onet_levels
# onet_occupation_code_var = 'Detailed_Occupation_Code'
# onet_occupation_title_var = 'Detailed_Occupation_Title'
# dependent_var = 'ai_fraction'

# # Create occupation-level
# occupation_analysis = create_occupation_analysis(merged_data, onet_occupation_code_var, onet_occupation_title_var)

# # Add BLS employment shares for all NAICS sectors and create a master dataset
# bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/BLS{my_sector}_ONET{my_onet_level}_empShares.csv')

# master_df = occupation_analysis.copy()

# # 1) sectorEmpShare
# sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'TOT_EMP']].groupby(['NAICS', 'NAICS_TITLE']).sum('TOT_EMP')

# # Convert to % and change variable name
# sector_weights_df['TOT_EMP'] = sector_weights_df['TOT_EMP'] / sector_weights_df['TOT_EMP'].sum()
# sector_weights_df = sector_weights_df.rename(columns={'TOT_EMP': 'sectorEmpShare'})

# # Merge back sector weights to bls dataset to get sector-by-sector occupation data with sector weights
# bls_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE']].merge(sector_weights_df, on='NAICS', how='left')

# # Add weight column to master_df
# master_df = master_df.merge(bls_sector_weights_df[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'sectorEmpShare']], left_on=onet_occupation_code_var, right_on=['OCC_CODE'], how='left')
# master_df = master_df.drop(columns=['OCC_CODE'])

# # 2) occ_sectorEmpShare
# within_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP']].copy()

# within_sector_weights_df[['NAICS', 'NAICS_TITLE', 'TOT_EMP']].groupby(['NAICS', 'NAICS_TITLE']).sum()


## Define Weighted Hand-off Measure

In [221]:
# Build sector-level, employment-weighted averages of every occupation measure
# (weights: occ_sectorEmpShare) and attach them to master_df.
measure_vars = [handoff_var, ai_outcome_var, ai_exposure_var, fragmentation_index_var]
sector_keys = ['NAICS', 'NAICS_TITLE']

# 0) Make the cell safe to re-run: drop the columns a previous run of this cell added
#    (per-row weighted values and the suffixed columns created by the merge in step 4),
#    otherwise the merge below would produce clashing column names.
stale_cols = [f'{measure}_weighted{suffix}' for measure in measure_vars for suffix in ('', '_x', '_y')]
master_df = master_df.drop(columns=stale_cols, errors='ignore')

# 1) Ensure numeric weights; unparseable or missing weights count as 0
master_df['occ_sectorEmpShare'] = pd.to_numeric(master_df['occ_sectorEmpShare'], errors='coerce').fillna(0)

# 2) Per-row weighted value (keep this if useful downstream)
for measure in measure_vars:
    master_df[f'{measure}_weighted'] = master_df[measure] * master_df['occ_sectorEmpShare']

# 3) Sector-level sums within NAICS / NAICS_TITLE.
#    The weight total is computed separately for each measure and counts only rows where
#    that measure is observed. Summing all weights instead would treat a missing measure
#    as a zero and pull the sector average down.
sum_inputs = master_df[sector_keys].copy()
for measure in measure_vars:
    sum_inputs[f'{measure}_weighted_sum'] = master_df[f'{measure}_weighted']
    sum_inputs[f'{measure}_weight_sum'] = master_df['occ_sectorEmpShare'].where(master_df[measure].notna())
sector_totals = sum_inputs.groupby(sector_keys, dropna=True).sum().reset_index()

# Weighted mean = weighted sum / weight sum (the weight sum can be < 1 because not every
# occupation in a sector is matched). A sector with no usable weight gets NaN (0 / 0).
sector_sums = sector_totals[sector_keys].copy()
for measure in measure_vars:
    sector_sums[f'{measure}_weighted'] = (
        sector_totals[f'{measure}_weighted_sum'] / sector_totals[f'{measure}_weight_sum']
    )

# # Optional: sort sectors by the weighted hand-off measure (helpful for inspection)
# sector_sums = sector_sums.sort_values(by=[f'{handoff_var}_weighted'], ascending=False).reset_index(drop=True)
sector_sums = sector_sums.sort_values(by=['NAICS'], ascending=False).reset_index(drop=True)
display(sector_sums.head())

# 4) Merge the sector-level values back into master_df so each row carries its sector's averages.
#    Both frames have '<measure>_weighted' columns, so after the merge:
#      '<measure>_weighted_x' = the row's own weighted value (measure * occ_sectorEmpShare)
#      '<measure>_weighted_y' = the sector-level weighted mean
#    The suffixes are spelled out to keep these (pandas-default) names stable for later cells.
master_df = master_df.sort_values(by=['NAICS', 'occ_sectorEmpShare'], ascending=[False, False]).reset_index(drop=True)
display(master_df.head())
master_df = master_df.merge(sector_sums, on=sector_keys, how='left', suffixes=('_x', '_y'))
display(master_df.head())

Unnamed: 0,NAICS,NAICS_TITLE,task_semantic_dispersion_weighted,ai_fraction_weighted,human_E1_fraction_weighted,fragmentation_index_weighted
0,999300,"Local Government, excluding Schools and Hospit...",0.54,0.11,0.14,0.98
1,999200,"State Government, excluding Schools and Hospit...",0.53,0.13,0.19,0.97
2,999100,Federal Executive Branch (OEWS Designation),0.53,0.15,0.18,0.97
3,813900,"Business, Professional, Labor, Political, and ...",0.53,0.18,0.21,0.96
4,813400,Civic and Social Organizations,0.56,0.13,0.16,0.97


Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,task_semantic_dispersion,ai_fraction,human_E1_fraction,fragmentation_index,NAICS,NAICS_TITLE,sectorEmpShare,occ_sectorEmpShare,task_semantic_dispersion_weighted,ai_fraction_weighted,human_E1_fraction_weighted,fragmentation_index_weighted
0,33-3051,Police and Sheriff's Patrol Officers,0.57,0.12,0.05,0.95,999300,"Local Government, excluding Schools and Hospit...",0.03,0.1,0.06,0.01,0.0,0.1
1,33-2011,Firefighters,0.47,0.0,0.07,1.0,999300,"Local Government, excluding Schools and Hospit...",0.03,0.05,0.02,0.0,0.0,0.05
2,43-9061,"Office Clerks, General",0.52,0.38,0.24,0.86,999300,"Local Government, excluding Schools and Hospit...",0.03,0.03,0.01,0.01,0.01,0.02
3,49-9071,"Maintenance and Repair Workers, General",0.57,0.07,0.0,1.0,999300,"Local Government, excluding Schools and Hospit...",0.03,0.03,0.01,0.0,0.0,0.03
4,33-3012,Correctional Officers and Jailers,0.5,0.0,0.12,1.0,999300,"Local Government, excluding Schools and Hospit...",0.03,0.03,0.01,0.0,0.0,0.03


Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,task_semantic_dispersion,ai_fraction,human_E1_fraction,fragmentation_index,NAICS,NAICS_TITLE,sectorEmpShare,occ_sectorEmpShare,task_semantic_dispersion_weighted_x,ai_fraction_weighted_x,human_E1_fraction_weighted_x,fragmentation_index_weighted_x,task_semantic_dispersion_weighted_y,ai_fraction_weighted_y,human_E1_fraction_weighted_y,fragmentation_index_weighted_y
0,33-3051,Police and Sheriff's Patrol Officers,0.57,0.12,0.05,0.95,999300,"Local Government, excluding Schools and Hospit...",0.03,0.1,0.06,0.01,0.0,0.1,0.54,0.11,0.14,0.98
1,33-2011,Firefighters,0.47,0.0,0.07,1.0,999300,"Local Government, excluding Schools and Hospit...",0.03,0.05,0.02,0.0,0.0,0.05,0.54,0.11,0.14,0.98
2,43-9061,"Office Clerks, General",0.52,0.38,0.24,0.86,999300,"Local Government, excluding Schools and Hospit...",0.03,0.03,0.01,0.01,0.01,0.02,0.54,0.11,0.14,0.98
3,49-9071,"Maintenance and Repair Workers, General",0.57,0.07,0.0,1.0,999300,"Local Government, excluding Schools and Hospit...",0.03,0.03,0.01,0.0,0.0,0.03,0.54,0.11,0.14,0.98
4,33-3012,Correctional Officers and Jailers,0.5,0.0,0.12,1.0,999300,"Local Government, excluding Schools and Hospit...",0.03,0.03,0.01,0.0,0.0,0.03,0.54,0.11,0.14,0.98


In [None]:
# # Question: is higher hand-off measure associated with higher exposure?

# # Regress ai_exposure_var on handoff_var
# import statsmodels.api as sm
# import statsmodels.formula.api as smf

# # Keep NAICS for clustering
# reg_df = sector_sums[[
#     'NAICS',
#     f'{ai_exposure_var}_weighted',
#     f'{handoff_var}_weighted',
# ]].dropna()

# # Regression formula
# formula = (
#     f"{ai_exposure_var}_weighted ~ "
#     f"{handoff_var}_weighted"
# )
# res = smf.ols(formula, data=reg_df).fit(
#     cov_type='cluster',
#     cov_kwds={'groups': reg_df['NAICS']}
# )
# print(res.summary())


                                OLS Regression Results                                
Dep. Variable:     human_E1_fraction_weighted   R-squared:                       0.111
Model:                                    OLS   Adj. R-squared:                  0.107
Method:                         Least Squares   F-statistic:                     27.79
Date:                        Sun, 09 Nov 2025   Prob (F-statistic):           2.96e-07
Time:                                13:18:14   Log-Likelihood:                 337.21
No. Observations:                         247   AIC:                            -670.4
Df Residuals:                             245   BIC:                            -663.4
Df Model:                                   1                                         
Covariance Type:                      cluster                                         
                                        coef    std err          z      P>|z|      [0.025      0.975]
----------------------------

In [223]:
# Question: is a higher hand-off measure associated with higher AI adoption once
# exposure and fragmentation are controlled for?
# Sector-level OLS of the weighted AI outcome on the weighted hand-off, exposure and
# fragmentation measures (additive specification).
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Regression formula
formula = (
    f"{ai_outcome_var}_weighted ~ "
    f"{handoff_var}_weighted + {ai_exposure_var}_weighted + {fragmentation_index_var}_weighted"
)

# Estimation sample: outcome, regressors and NAICS (kept for clustering);
# sectors with any missing value are dropped
reg_df = sector_sums.loc[
    :,
    [
        'NAICS',
        f'{ai_outcome_var}_weighted',
        f'{handoff_var}_weighted',
        f'{ai_exposure_var}_weighted',
        f'{fragmentation_index_var}_weighted',
    ],
].dropna()

# Standard errors clustered on NAICS.
# NOTE(review): sector_sums appears to hold one row per NAICS / NAICS_TITLE group, so each
# cluster likely contains a single observation -- confirm this is the intended error structure.
res = smf.ols(formula=formula, data=reg_df).fit(
    cov_type='cluster',
    cov_kwds=dict(groups=reg_df['NAICS']),
)
print(res.summary())

                             OLS Regression Results                             
Dep. Variable:     ai_fraction_weighted   R-squared:                       0.902
Model:                              OLS   Adj. R-squared:                  0.901
Method:                   Least Squares   F-statistic:                     556.6
Date:                  Sun, 09 Nov 2025   Prob (F-statistic):          2.68e-109
Time:                          13:18:14   Log-Likelihood:                 614.76
No. Observations:                   247   AIC:                            -1222.
Df Residuals:                       243   BIC:                            -1207.
Df Model:                             3                                         
Covariance Type:                cluster                                         
                                        coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------

In [225]:
# Question: does the association between the hand-off measure and AI adoption vary with
# exposure and fragmentation?
# Sector-level OLS of the weighted AI outcome on the weighted hand-off, exposure and
# fragmentation measures, fully interacted: the `*` operator in the formula adds all
# main effects, all two-way interactions and the three-way interaction.
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Regression formula
formula = (
    f"{ai_outcome_var}_weighted ~ "
    f"{handoff_var}_weighted * {ai_exposure_var}_weighted * {fragmentation_index_var}_weighted"
)

# Estimation sample: outcome, regressors and NAICS (kept for clustering);
# sectors with any missing value are dropped
reg_df = sector_sums.loc[
    :,
    [
        'NAICS',
        f'{ai_outcome_var}_weighted',
        f'{handoff_var}_weighted',
        f'{ai_exposure_var}_weighted',
        f'{fragmentation_index_var}_weighted',
    ],
].dropna()

# Standard errors clustered on NAICS.
# NOTE(review): sector_sums appears to hold one row per NAICS / NAICS_TITLE group, so each
# cluster likely contains a single observation -- confirm this is the intended error structure.
res = smf.ols(formula=formula, data=reg_df).fit(
    cov_type='cluster',
    cov_kwds=dict(groups=reg_df['NAICS']),
)
print(res.summary())

                             OLS Regression Results                             
Dep. Variable:     ai_fraction_weighted   R-squared:                       0.926
Model:                              OLS   Adj. R-squared:                  0.924
Method:                   Least Squares   F-statistic:                     416.3
Date:                  Sun, 09 Nov 2025   Prob (F-statistic):          1.77e-132
Time:                          13:24:33   Log-Likelihood:                 649.39
No. Observations:                   247   AIC:                            -1283.
Df Residuals:                       239   BIC:                            -1255.
Df Model:                             7                                         
Covariance Type:                cluster                                         
                                                                                                coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------