#### By: Peyman Shahidi
#### Created: Oct 29, 2025
#### Last Edit: Nov 3, 2025

<br>

In [134]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [135]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
# Modify the output path accordingly
output_data_path = f'{input_data_path}/computed_objects/BLS_ONET_matchedEmpShares'
output_plot_path = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares"

# Toggle: if True, randomly reassign occ_totalEmpShare weights in the merged master_df
# during the merge_industry_employment_shares step. Set to False for default behavior.
randomize_occ_weights = True


In [136]:
import os
for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

## Functions

In [137]:
def create_occupation_analysis(df, onet_occupation_code_var, onet_occupation_title_var):
    # Create occupation-level analysis for scatter plots
    # Group by occupation and calculate label fractions and task counts
    occupation_stats = []

    for (soc_code, occ_title), group in df.groupby([onet_occupation_code_var, onet_occupation_title_var]):
        num_tasks = group['Task ID'].nunique()
        # num_occupations = group[onet_occupation_code_var].nunique()
        total_tasks = len(group)
        
        manual_fraction = (group['label'] == 'Manual').sum() / total_tasks
        augmentation_fraction = (group['label'] == 'Augmentation').sum() / total_tasks  
        automation_fraction = (group['label'] == 'Automation').sum() / total_tasks
        ai_fraction = augmentation_fraction + automation_fraction
        gpt4_E0_fraction = (group['gpt4_exposure'] == 'E0').sum() / total_tasks
        gpt4_E1_fraction = (group['gpt4_exposure'] == 'E1').sum() / total_tasks
        gpt4_E2_fraction = (group['gpt4_exposure'] == 'E2').sum() / total_tasks
        gpt4_aiExposure_fraction = gpt4_E1_fraction + gpt4_E2_fraction
        human_E0_fraction = (group['human_labels'] == 'E0').sum() / total_tasks
        human_E1_fraction = (group['human_labels'] == 'E1').sum() / total_tasks
        human_E2_fraction = (group['human_labels'] == 'E2').sum() / total_tasks
        human_aiExposure_fraction = human_E1_fraction + human_E2_fraction

        
        occupation_stats.append({
            f'{onet_occupation_code_var}': soc_code,
            f'{onet_occupation_title_var}': occ_title,
            'num_tasks': num_tasks,
            # 'num_occupations': num_occupations,
            'manual_fraction': manual_fraction,
            'ai_fraction': ai_fraction,
            'augmentation_fraction': augmentation_fraction,
            'automation_fraction': automation_fraction,
            'gpt4_E0_fraction': gpt4_E0_fraction,
            'gpt4_E1_fraction': gpt4_E1_fraction,
            'gpt4_E2_fraction': gpt4_E2_fraction,
            'gpt4_aiExposure_fraction': gpt4_aiExposure_fraction,
            'human_E0_fraction': human_E0_fraction,
            'human_E1_fraction': human_E1_fraction,
            'human_E2_fraction': human_E2_fraction,
            'human_aiExposure_fraction': human_aiExposure_fraction
        })

    occupation_analysis = pd.DataFrame(occupation_stats)

    return occupation_analysis

#### Main Code

In [138]:
# Read the merged data
merged_data = pd.read_csv(f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv")

In [139]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

In [140]:
# Define levels and variables
bls_sector_levels = ['sector']#, '3-digit', '4-digit', '5-digit', '6-digit']

onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']

weight_cols = ['occ_totalEmpShare',         # Weight each occupation by occupation's share of total employment (ignoring sector shares)
               'sectorEmpShare',            # Weight each occupation by its sector's share of total employment
               'occ_sectorEmpShare']        # Weight each occupation by its share of employment within its sector and weight sectors equally  

dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']


In [141]:
my_sector = '3-digit'  # Choose from bls_sector_levels
my_onet_level = 'detailed'  # Choose from onet_levels
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'
dependent_var = 'ai_fraction'

# Create occupation-level
occupation_analysis = create_occupation_analysis(merged_data, onet_occupation_code_var, onet_occupation_title_var)

# Add BLS employment shares for all NAICS sectors and create a master dataset
bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/BLS{my_sector}_ONET{my_onet_level}_empShares.csv')

master_df = occupation_analysis.copy()

# 1) sectorEmpShare
sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'TOT_EMP']].groupby(['NAICS', 'NAICS_TITLE']).sum('TOT_EMP')

# Convert to % and change variable name
sector_weights_df['TOT_EMP'] = sector_weights_df['TOT_EMP'] / sector_weights_df['TOT_EMP'].sum()
sector_weights_df = sector_weights_df.rename(columns={'TOT_EMP': 'sectorEmpShare'})

# Merge back sector weights to bls dataset to get sector-by-sector occupation data with sector weights
bls_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE']].merge(sector_weights_df, on='NAICS', how='left')

# Add weight column to master_df
master_df = master_df.merge(bls_sector_weights_df[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'sectorEmpShare']], left_on=onet_occupation_code_var, right_on=['OCC_CODE'], how='left')
master_df = master_df.drop(columns=['OCC_CODE'])

# 2) occ_sectorEmpShare
within_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP']].copy()
within_sector_weights_df['occ_sectorEmpShare'] = within_sector_weights_df['TOT_EMP'] / within_sector_weights_df.groupby(['NAICS', 'NAICS_TITLE'])['TOT_EMP'].transform('sum')

# Add weight column to master_df
master_df = master_df.merge(within_sector_weights_df[['NAICS', 'OCC_CODE', 'occ_sectorEmpShare']], left_on=['NAICS', onet_occupation_code_var], right_on=['NAICS', 'OCC_CODE'], how='left')
master_df = master_df.drop(columns=['OCC_CODE'])

# Drop rows with null NAICS -- i.e., unmatched occupations in both procedures
master_df = master_df[~master_df.NAICS.isnull()]

# Save master dataframe to CSV
master_df = master_df.sort_values(by=['NAICS', 'occ_sectorEmpShare'], ascending=[True, False]).reset_index(drop=True)
out_dir = f"{output_data_path}"#/BLS{my_sector}_ONET{my_onet_level}/{dependent_var}/"
os.makedirs(out_dir, exist_ok=True)
master_out = f"{out_dir}/BLS{my_sector}_ONET{my_onet_level}.csv"
master_df.to_csv(master_out, index=False)

In [153]:
my_sector = '3-digit'  # Choose from bls_sector_levels
my_onet_level = 'detailed'  # Choose from onet_levels
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'
dependent_var = 'ai_fraction'

# Create occupation-level
occupation_analysis = create_occupation_analysis(merged_data, onet_occupation_code_var, onet_occupation_title_var)

# Add BLS employment shares for all NAICS sectors and create a master dataset
bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/BLS{my_sector}_ONET{my_onet_level}_empShares.csv')

master_df = occupation_analysis.copy()

# 1) sectorEmpShare
sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'TOT_EMP']].groupby(['NAICS', 'NAICS_TITLE']).sum('TOT_EMP')

# Convert to % and change variable name
sector_weights_df['TOT_EMP'] = sector_weights_df['TOT_EMP'] / sector_weights_df['TOT_EMP'].sum()
sector_weights_df = sector_weights_df.rename(columns={'TOT_EMP': 'sectorEmpShare'})

# Merge back sector weights to bls dataset to get sector-by-sector occupation data with sector weights
bls_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE']].merge(sector_weights_df, on='NAICS', how='left')

# Add weight column to master_df
master_df = master_df.merge(bls_sector_weights_df[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'sectorEmpShare']], left_on=onet_occupation_code_var, right_on=['OCC_CODE'], how='left')
master_df = master_df.drop(columns=['OCC_CODE'])

# 2) occ_sectorEmpShare
within_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP']].copy()

within_sector_weights_df[['NAICS', 'NAICS_TITLE', 'TOT_EMP']].groupby(['NAICS', 'NAICS_TITLE']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,TOT_EMP
NAICS,NAICS_TITLE,Unnamed: 2_level_1
113000,Forestry and Logging,43350
115000,Support Activities for Agriculture and Forestry,365930
211000,Oil and Gas Extraction,106690
212000,Mining (except Oil and Gas),181550
213000,Support Activities for Mining,274260
221000,Utilities,562210
236000,Construction of Buildings,1775870
237000,Heavy and Civil Engineering Construction,1102640
238000,Specialty Trade Contractors,5026470
311000,Food Manufacturing,1714470


In [148]:
bls_sector_shares[['NAICS', 'NAICS_TITLE']].drop_duplicates()
x = bls_sector_shares[bls_sector_shares[['NAICS', 'NAICS_TITLE']].NAICS_TITLE == 'Hospitals'][['OCC_TITLE', 'occ_sectorEmpShare']]

x.occ_sectorEmpShare.sum()

np.float64(3.9095261543001416)

In [142]:
# 1) Ensure numeric types and fill NaN with 0 (safe for summing)
master_df['num_tasks'] = pd.to_numeric(master_df.get('num_tasks', 0), errors='coerce').fillna(0)
master_df['occ_sectorEmpShare'] = pd.to_numeric(master_df.get('occ_sectorEmpShare', 0), errors='coerce').fillna(0)

# 2) Per-row weighted value (keep this if useful downstream)
master_df['num_task_weighted'] = master_df['num_tasks'] * master_df['occ_sectorEmpShare']
master_df['ai_fraction_weighted'] = master_df['ai_fraction'] * master_df['occ_sectorEmpShare']
master_df['human_E1_fraction_weighted'] = master_df['human_E1_fraction'] * master_df['occ_sectorEmpShare']

# 3) Compute group sums within NAICS / NAICS_TITLE for both columns
sector_sums = (master_df.groupby(['NAICS', 'NAICS_TITLE'], dropna=True)
               .agg(occ_sectorEmpShare_sum=('occ_sectorEmpShare', 'sum'),
                    num_task_weighted_sum=('num_task_weighted', 'sum'),
                    ai_fraction_weighted_sum=('ai_fraction_weighted', 'sum'),
                    human_E1_fraction_weighted_sum=('human_E1_fraction_weighted', 'sum'))
               .reset_index())

# Calculate Hand-off measure: num_task_weighted_sum / occ_sectorEmpShare_sum (Cause occ_sectorEmpShare_sum might be < 1)
sector_sums['num_task_weighted'] = sector_sums['num_task_weighted_sum'] / sector_sums['occ_sectorEmpShare_sum']
sector_sums['ai_fraction_weighted'] = sector_sums['ai_fraction_weighted_sum'] / sector_sums['occ_sectorEmpShare_sum']
sector_sums['human_E1_fraction_weighted'] = sector_sums['human_E1_fraction_weighted_sum'] / sector_sums['occ_sectorEmpShare_sum']


# Optional: sort sectors by total weighted tasks (helpful for inspection)
sector_sums = sector_sums.sort_values(by=['num_task_weighted'], ascending=False).reset_index(drop=True)

# 4) Merge the group sums back into master_df so each row has the sector totals
master_df = master_df.merge(sector_sums, on=['NAICS', 'NAICS_TITLE'], how='left')

# 5) Quick checks: display the sector-level sums and the head of master_df
sector_sums
# master_df.head()

Unnamed: 0,NAICS,NAICS_TITLE,occ_sectorEmpShare_sum,num_task_weighted_sum,ai_fraction_weighted_sum,human_E1_fraction_weighted_sum,num_task_weighted,ai_fraction_weighted,human_E1_fraction_weighted
0,622000.0,Hospitals,3.72,229.26,0.41,0.49,61.66,0.11,0.13
1,334000.0,Computer and Electronic Product Manufacturing,0.73,28.43,0.11,0.12,39.21,0.15,0.17
2,516000.0,Broadcasting and Content Providers,0.87,32.81,0.16,0.23,37.91,0.19,0.26
3,999000.0,"Federal, State, and Local Government, excludin...",0.92,33.09,0.11,0.14,36.01,0.12,0.16
4,519000.0,"Web Search Portals, Libraries, Archives, and O...",0.66,23.2,0.15,0.16,35.23,0.23,0.24
5,336000.0,Transportation Equipment Manufacturing,0.71,24.65,0.05,0.07,34.82,0.08,0.1
6,512000.0,Motion Picture and Sound Recording Industries,0.91,31.42,0.14,0.16,34.64,0.16,0.17
7,551000.0,Management of Companies and Enterprises,0.87,30.06,0.17,0.18,34.51,0.2,0.21
8,621000.0,Ambulatory Health Care Services,0.81,27.84,0.11,0.12,34.42,0.14,0.15
9,623000.0,Nursing and Residential Care Facilities,0.75,25.9,0.06,0.07,34.35,0.07,0.09


In [143]:
# Regression: regress 'ai_fraction_weighted' on 'num_task_weighted'
# controlling for 'human_E1_fraction_weighted' using sector_sums (one row per sector)
import statsmodels.api as sm

# Select relevant columns and drop missing values
reg_df = sector_sums[[
    'ai_fraction_weighted',
    'num_task_weighted',
    'human_E1_fraction_weighted'
]].dropna()

print(f'Observations available for regression: {len(reg_df)}')

if len(reg_df) < 3:
    print('Not enough observations to run a reliable regression (need at least 3 rows after dropping NA).')
else:
    # Prepare design matrix
    X = reg_df[['num_task_weighted', 'human_E1_fraction_weighted']]
    X = sm.add_constant(X)
    y = reg_df['ai_fraction_weighted']

    # Fit OLS model
    model = sm.OLS(y, X).fit()
    print('\nOLS results:')
    print(model.summary())

    robust = model.get_robustcov_results(cov_type='HC3')    # Also show robust standard errors (HC3)
    print('OLS results with HC3 robust SEs:')
    print(robust.summary())

Observations available for regression: 85

OLS results:
                             OLS Regression Results                             
Dep. Variable:     ai_fraction_weighted   R-squared:                       0.827
Model:                              OLS   Adj. R-squared:                  0.823
Method:                   Least Squares   F-statistic:                     195.9
Date:                  Wed, 05 Nov 2025   Prob (F-statistic):           5.85e-32
Time:                          09:45:38   Log-Likelihood:                 189.20
No. Observations:                    85   AIC:                            -372.4
Df Residuals:                        82   BIC:                            -365.1
Df Model:                             2                                         
Covariance Type:              nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------