#### By: Peyman Shahidi
#### Created: Oct 29, 2025
#### Last Edit: Nov 8, 2025

<br>

In [277]:
# Python
# All shared imports for the notebook live in this cell.

# Standard library
import getpass
import itertools
import random
import warnings
from collections import defaultdict

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show floats comma-separated with two digits after the decimal point,
# e.g. 1000 is shown as 1,000.00.
# The full option key is used instead of the abbreviated 'float_format': pandas
# resolves short names by pattern matching, which can break if a future pandas
# version adds another option matching the same pattern.
pd.set_option('display.float_format', "{:,.2f}".format)
pd.set_option('display.max_rows', 200)

# NOTE(review): this hides every warning for the rest of the session, including
# pandas/statsmodels warnings that may be relevant to the merges and regressions.
warnings.filterwarnings('ignore')

In [278]:
# Python
# Configuration: folder layout and run-time toggles.
# The imports and pandas display options that used to be repeated here are
# already executed by the imports cell directly above, so they are not duplicated.

main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
# Modify the output path accordingly
output_data_path = f'{input_data_path}/computed_objects/BLS_ONET_matchedEmpShares'
output_plot_path = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares"

# Toggle: if True, randomly reassign occ_totalEmpShare weights in the merged master_df
# during the merge_industry_employment_shares step. Set to False for default behavior.
# NOTE(review): the toggle is currently switched to the non-default (randomized) setting,
# and no random seed is set in this part of the notebook -- confirm this is intended
# and seed the generator wherever the randomization is implemented.
randomize_occ_weights = True


In [279]:
import os

# Create the output folders for data and plots if they do not exist yet.
# exist_ok=True replaces the separate os.path.exists() check: it is the same idiom
# used when the master dataset is saved later on, and it cannot fail if the folder
# is created between the check and the makedirs call.
for path in [output_data_path, output_plot_path]:
    os.makedirs(path, exist_ok=True)

#### Main Code

In [280]:
# Load the merged ONET / Eloundou / Anthropic / GPT dataset from the computed objects folder
merged_data_path = f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv"
merged_data = pd.read_csv(merged_data_path)

In [281]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

In [282]:
# Define levels and variables

# BLS industry aggregation levels in use.
# Currently disabled finer levels: '3-digit', '4-digit', '5-digit', '6-digit'
bls_sector_levels = ['sector']

# One entry per occupation aggregation level: (level name, column-name stem).
# The three parallel lists below are derived from it so they cannot get out of sync.
_soc_hierarchy = [
    ('major', 'Major_Group'),
    ('minor', 'Minor_Group'),
    ('broad', 'Broad_Occupation'),
    ('detailed', 'Detailed_Occupation'),
]
onet_levels = [level for level, _stem in _soc_hierarchy]
onet_occupation_code_vars = [f'{stem}_Code' for _level, stem in _soc_hierarchy]
onet_occupation_title_vars = [f'{stem}_Title' for _level, stem in _soc_hierarchy]

# Available weighting schemes
weight_cols = [
    'occ_totalEmpShare',    # Weight each occupation by occupation's share of total employment (ignoring sector shares)
    'sectorEmpShare',       # Weight each occupation by its sector's share of total employment
    'occ_sectorEmpShare',   # Weight each occupation by its share of employment within its sector and weight sectors equally
]

# Outcome variables of interest.
# Currently disabled: 'human_aiExposure_fraction', 'gpt4_E1_fraction'
dependent_var_list = ['ai_fraction', 'human_E1_fraction']


In [283]:
# BLS industry level used in the input/output file names.
# NOTE(review): '5-digit' is currently commented out of bls_sector_levels, so this
# value is not one of the listed levels -- confirm which list/value is intended.
my_sector = '5-digit'

# O*NET occupation level; choose from onet_levels
my_onet_level = 'detailed'

# Look up the code/title columns that belong to the chosen level instead of
# hard-coding them, so changing my_onet_level cannot leave them pointing at a
# different level. Raises ValueError if my_onet_level is not in onet_levels.
_onet_level_pos = onet_levels.index(my_onet_level)
onet_occupation_code_var = onet_occupation_code_vars[_onet_level_pos]
onet_occupation_title_var = onet_occupation_title_vars[_onet_level_pos]

In [284]:
# Column names of the occupation-level measures used in the rest of the notebook
handoff_var = 'num_tasks'
ai_outcome_var = 'ai_fraction'
ai_exposure_var = 'human_E1_fraction'
fragmentation_index_var = 'fragmentation_index'

# Load the occupation-level measures and the cleaned O*NET task file (source of the SOC hierarchy)
occupation_analysis = pd.read_csv(f"{input_data_path}/computed_objects/fragmentationIndex/occupation_analysis_with_fragmentationIndex.csv")
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the occupation identifiers and the SOC hierarchy columns,
# with one row per (O*NET-SOC code, chosen occupation code) pair
soc_mapping_cols = [
    'O*NET-SOC Code', 'Occupation Title',
    'Major_Group_Code', 'Major_Group_Title',
    'Minor_Group_Code', 'Minor_Group_Title',
    'Broad_Occupation_Code', 'Broad_Occupation_Title',
    'Detailed_Occupation_Code', 'Detailed_Occupation_Title',
]
SOC_mappings = ONET[soc_mapping_cols].drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Attach the SOC hierarchy, then average every measure (unweighted) within each
# occupation at the chosen level. Rows left without a code/title by the left
# merge are dropped by groupby's default NaN-key handling.
measure_aggs = dict.fromkeys(
    [handoff_var, ai_outcome_var, ai_exposure_var, fragmentation_index_var],
    'mean',
)
occupation_analysis = (
    occupation_analysis
    .merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')
    .groupby([onet_occupation_code_var, onet_occupation_title_var])
    .agg(measure_aggs)
    .reset_index()
)

occupation_analysis

Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,num_tasks,ai_fraction,human_E1_fraction,fragmentation_index
0,11-1011,Chief Executives,24.50,0.27,0.11,0.96
1,11-1021,General and Operations Managers,17.00,0.06,0.12,1.00
2,11-2011,Advertising and Promotions Managers,21.00,0.14,0.29,1.00
3,11-2021,Marketing Managers,20.00,0.30,0.25,1.00
4,11-2022,Sales Managers,17.00,0.18,0.12,1.00
...,...,...,...,...,...,...
753,53-7071,Gas Compressor and Gas Pumping Station Operators,13.00,0.00,0.00,1.00
754,53-7072,"Pump Operators, Except Wellhead Pumpers",14.00,0.00,0.00,1.00
755,53-7073,Wellhead Pumpers,16.00,0.00,0.00,1.00
756,53-7081,Refuse and Recyclable Material Collectors,16.00,0.00,0.12,1.00


# TODO: Double-check the weight calculations

In [285]:
# Add BLS employment shares for all NAICS sectors and create a master dataset
# with one row per (occupation, NAICS sector) pair found in the BLS file.
bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/BLS{my_sector}_ONET{my_onet_level}_empShares.csv')

master_df = occupation_analysis.copy()

# 1) sectorEmpShare: each sector's share of the total employment in the BLS file.
# TOT_EMP is selected explicitly before summing; the previous `.sum('TOT_EMP')`
# passed the column name positionally into GroupBy.sum's `numeric_only` argument
# and only worked because a non-empty string is truthy.
sector_weights_df = bls_sector_shares.groupby(['NAICS', 'NAICS_TITLE'])[['TOT_EMP']].sum()

# Convert to a fraction of the total (0-1, not a percentage) and change variable name
sector_weights_df['TOT_EMP'] = sector_weights_df['TOT_EMP'] / sector_weights_df['TOT_EMP'].sum()
sector_weights_df = sector_weights_df.rename(columns={'TOT_EMP': 'sectorEmpShare'})

# Merge back sector weights to bls dataset to get sector-by-sector occupation data with sector weights.
# 'NAICS' is an index level of sector_weights_df here; merging on it keeps the left frame's NAICS_TITLE column.
bls_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE']].merge(sector_weights_df, on='NAICS', how='left')

# Add weight column to master_df (this fans each occupation out to one row per matching sector)
master_df = master_df.merge(
    bls_sector_weights_df[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'sectorEmpShare']],
    left_on=onet_occupation_code_var,
    right_on=['OCC_CODE'],
    how='left',
)
master_df = master_df.drop(columns=['OCC_CODE'])

# 2) occ_sectorEmpShare: each occupation's share of employment within its sector
within_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP']].copy()
within_sector_weights_df['occ_sectorEmpShare'] = (
    within_sector_weights_df['TOT_EMP']
    / within_sector_weights_df.groupby(['NAICS', 'NAICS_TITLE'])['TOT_EMP'].transform('sum')
)

# Add weight column to master_df.
# NOTE(review): both merges assume (NAICS, OCC_CODE) is unique in the BLS file;
# duplicated pairs would multiply rows -- verify against the input data.
master_df = master_df.merge(
    within_sector_weights_df[['NAICS', 'OCC_CODE', 'occ_sectorEmpShare']],
    left_on=['NAICS', onet_occupation_code_var],
    right_on=['NAICS', 'OCC_CODE'],
    how='left',
)
master_df = master_df.drop(columns=['OCC_CODE'])

# Drop rows with null NAICS -- i.e., unmatched occupations in both procedures
master_df = master_df.dropna(subset=['NAICS'])

# Save master dataframe to CSV, sorted by sector and descending within-sector share
master_df = master_df.sort_values(by=['NAICS', 'occ_sectorEmpShare'], ascending=[True, False]).reset_index(drop=True)
out_dir = f"{output_data_path}"  # optional sub-folder: /BLS{my_sector}_ONET{my_onet_level}/{dependent_var}/
os.makedirs(out_dir, exist_ok=True)
master_out = f"{out_dir}/BLS{my_sector}_ONET{my_onet_level}.csv"
master_df.to_csv(master_out, index=False)

In [286]:
# my_sector = 'sector'  # Choose from bls_sector_levels
# my_onet_level = 'detailed'  # Choose from onet_levels
# onet_occupation_code_var = 'Detailed_Occupation_Code'
# onet_occupation_title_var = 'Detailed_Occupation_Title'
# dependent_var = 'ai_fraction'

# # Create occupation-level
# occupation_analysis = create_occupation_analysis(merged_data, onet_occupation_code_var, onet_occupation_title_var)

# # Add BLS employment shares for all NAICS sectors and create a master dataset
# bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/BLS{my_sector}_ONET{my_onet_level}_empShares.csv')

# master_df = occupation_analysis.copy()

# # 1) sectorEmpShare
# sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'TOT_EMP']].groupby(['NAICS', 'NAICS_TITLE']).sum('TOT_EMP')

# # Convert to % and change variable name
# sector_weights_df['TOT_EMP'] = sector_weights_df['TOT_EMP'] / sector_weights_df['TOT_EMP'].sum()
# sector_weights_df = sector_weights_df.rename(columns={'TOT_EMP': 'sectorEmpShare'})

# # Merge back sector weights to bls dataset to get sector-by-sector occupation data with sector weights
# bls_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE']].merge(sector_weights_df, on='NAICS', how='left')

# # Add weight column to master_df
# master_df = master_df.merge(bls_sector_weights_df[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'sectorEmpShare']], left_on=onet_occupation_code_var, right_on=['OCC_CODE'], how='left')
# master_df = master_df.drop(columns=['OCC_CODE'])

# # 2) occ_sectorEmpShare
# within_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP']].copy()

# within_sector_weights_df[['NAICS', 'NAICS_TITLE', 'TOT_EMP']].groupby(['NAICS', 'NAICS_TITLE']).sum()


## Define Weighted Hand-off Measure

In [287]:
# Sector-level weighted averages of the occupation measures, using each
# occupation's within-sector employment share as the weight.
weighted_vars = [handoff_var, ai_outcome_var, ai_exposure_var, fragmentation_index_var]
weight_col = 'occ_sectorEmpShare'
sector_keys = ['NAICS', 'NAICS_TITLE']

# 1) Ensure numeric types and fill NaN with 0 (safe for summing).
# handoff_var is used instead of a hard-coded 'num_tasks' so the cell follows the
# variable chosen above; a missing column now raises a clear KeyError.
# NOTE(review): a missing hand-off value becomes 0 but its weight still enters the
# denominator below, which pulls the sector average down -- confirm this is intended.
master_df[handoff_var] = pd.to_numeric(master_df[handoff_var], errors='coerce').fillna(0)
master_df[weight_col] = pd.to_numeric(master_df[weight_col], errors='coerce').fillna(0)

# 2) Per-row weighted value (keep this if useful downstream)
for var in weighted_vars:
    master_df[f'{var}_weighted'] = master_df[var] * master_df[weight_col]
weighted_cols = [f'{var}_weighted' for var in weighted_vars]

# 3) Sum the weights and the weighted values within NAICS / NAICS_TITLE
sector_totals = (
    master_df.groupby(sector_keys, dropna=True)[[weight_col] + weighted_cols]
             .sum()
)

# Weighted average = weighted sum / sum of weights (the weights of the matched
# occupations in a sector might sum to less than 1, so the division renormalizes).
sector_sums = (
    sector_totals[weighted_cols]
    .div(sector_totals[weight_col], axis=0)
    .reset_index()
)

# Optional: sort sectors by the weighted hand-off measure (helpful for inspection)
sector_sums = sector_sums.sort_values(by=[f'{handoff_var}_weighted'], ascending=False).reset_index(drop=True)

# 4) Merge the sector-level averages back into master_df so each row carries them.
# The sector-level columns get an explicit '_sector' suffix: they share their names
# with the per-row columns from step 2, and merging them unrenamed made pandas emit
# ambiguous '_x'/'_y' columns and lose the per-row names. Columns from a previous
# run of this cell are dropped first so the cell can be re-run safely.
sector_level_names = {col: f'{col}_sector' for col in weighted_cols}
master_df = (
    master_df.drop(columns=list(sector_level_names.values()), errors='ignore')
             .merge(sector_sums.rename(columns=sector_level_names), on=sector_keys, how='left')
)

# 5) Quick check: display the sector-level averages
sector_sums
# master_df.head()

Unnamed: 0,NAICS,NAICS_TITLE,num_tasks_weighted,ai_fraction_weighted,human_E1_fraction_weighted,fragmentation_index_weighted
0,541920.0,Photographic Services,26.15,0.16,0.1,0.95
1,561730.0,Landscaping Services,25.97,0.06,0.04,0.99
2,238110.0,Poured Concrete Foundation and Structure Contr...,25.61,0.05,0.04,0.99
3,813930.0,Labor Unions and Similar Labor Organizations,25.45,0.13,0.2,0.97
4,541940.0,Veterinary Services,25.35,0.12,0.16,0.97
5,238160.0,Roofing Contractors,24.9,0.06,0.06,0.99
6,811110.0,Automotive Mechanical and Electrical Repair an...,24.77,0.1,0.09,0.98
7,332710.0,Machine Shops,24.69,0.06,0.06,0.99
8,238220.0,"Plumbing, Heating, and Air-Conditioning Contra...",23.87,0.07,0.08,0.99
9,339910.0,Jewelry and Silverware Manufacturing,23.86,0.1,0.09,0.97


In [288]:
# Question: is higher hand-off measure associated with higher exposure?
# Sector-level OLS of the weighted exposure measure on the weighted hand-off measure.
import statsmodels.api as sm
import statsmodels.formula.api as smf

dep_col = f'{ai_exposure_var}_weighted'
rhs_cols = [f'{handoff_var}_weighted']

# Keep NAICS alongside the model variables: it is the clustering group below.
# sector_sums is grouped by NAICS / NAICS_TITLE, so a NAICS code appears once
# unless it carries more than one title.
reg_df = sector_sums[['NAICS', dep_col, *rhs_cols]].dropna()

# Regression formula: "<dependent> ~ <regressor> + ..."
formula = f"{dep_col} ~ " + " + ".join(rhs_cols)

# OLS with standard errors clustered on NAICS
res = smf.ols(formula, data=reg_df).fit(
    cov_type='cluster',
    cov_kwds={'groups': reg_df['NAICS']},
)
print(res.summary())


                                OLS Regression Results                                
Dep. Variable:     human_E1_fraction_weighted   R-squared:                       0.059
Model:                                    OLS   Adj. R-squared:                  0.040
Method:                         Least Squares   F-statistic:                     4.103
Date:                        Sat, 08 Nov 2025   Prob (F-statistic):             0.0480
Time:                                17:16:27   Log-Likelihood:                 64.043
No. Observations:                          52   AIC:                            -124.1
Df Residuals:                              50   BIC:                            -120.2
Df Model:                                   1                                         
Covariance Type:                      cluster                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------

In [289]:
# Question: is higher hand-off measure associated with higher AI adoption, controlling for fragmentation and exposure?
# Sector-level OLS of the weighted AI outcome on the weighted hand-off, exposure and fragmentation measures.
import statsmodels.api as sm
import statsmodels.formula.api as smf

dep_col = f'{ai_outcome_var}_weighted'
rhs_cols = [
    f'{handoff_var}_weighted',
    f'{ai_exposure_var}_weighted',
    f'{fragmentation_index_var}_weighted',
]

# Keep NAICS alongside the model variables: it is the clustering group below.
# sector_sums is grouped by NAICS / NAICS_TITLE, so a NAICS code appears once
# unless it carries more than one title.
reg_df = sector_sums[['NAICS', dep_col, *rhs_cols]].dropna()

# Regression formula: "<dependent> ~ <regressor> + <regressor> + ..."
formula = f"{dep_col} ~ " + " + ".join(rhs_cols)

# OLS with standard errors clustered on NAICS
res = smf.ols(formula, data=reg_df).fit(
    cov_type='cluster',
    cov_kwds={'groups': reg_df['NAICS']},
)
print(res.summary())

                             OLS Regression Results                             
Dep. Variable:     ai_fraction_weighted   R-squared:                       0.865
Model:                              OLS   Adj. R-squared:                  0.856
Method:                   Least Squares   F-statistic:                     96.61
Date:                  Sat, 08 Nov 2025   Prob (F-statistic):           4.92e-21
Time:                          17:16:27   Log-Likelihood:                 128.11
No. Observations:                    52   AIC:                            -248.2
Df Residuals:                        48   BIC:                            -240.4
Df Model:                             3                                         
Covariance Type:                cluster                                         
                                   coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------