#### By: Peyman Shahidi
#### Created: Nov 9, 2025
#### Last Edit: Nov 9, 2025

<br>

In [129]:
# Python setup cell: libraries and global display options for the notebook.
# NOTE: the next cell (In [130]) repeats exactly these imports and options.
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

# Render floats with thousands separators and two decimals, e.g. 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Blanket filter: no category or module is given, so every warning is suppressed.
warnings.filterwarnings('ignore')
# Display up to 200 rows before pandas switches to a truncated view.
pd.set_option('display.max_rows', 200)

In [130]:
# Python setup: libraries and global display options for the notebook.
# NOTE: these imports and options duplicate the previous cell (In [129]).
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

# Render floats with thousands separators and two decimals, e.g. 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Blanket filter: no category or module is given, so every warning is suppressed.
warnings.filterwarnings('ignore')
# Display up to 200 rows before pandas switches to a truncated view.
pd.set_option('display.max_rows', 200)

# Project layout: every path below is resolved relative to the notebook's parent folder.
main_folder_path = ".."
input_data_path = "/".join([main_folder_path, "data"])

# Output locations for this analysis; change the trailing folder name to redirect results.
output_data_path = "/".join(
    [input_data_path, "computed_objects", "BLS_ONET_matchedEmpShares"]
)
output_plot_path = "/".join(
    [main_folder_path, "writeup", "plots", "anthropic_AI_index", "BLS_ONET_matchedEmpShares"]
)

# Randomization switch. Per the original note, when True the occ_totalEmpShare weights
# in the merged master_df are randomly reassigned during the
# merge_industry_employment_shares step (that step is not part of this section).
# It is currently enabled; set it to False for the default behavior.
randomize_occ_weights = True


In [131]:
import os

# Make sure both output folders exist before anything is written to them.
# exist_ok=True keeps the cell safe to re-run and removes the check-then-create
# race of testing os.path.exists() first. If a path exists but is a regular
# file, this raises FileExistsError instead of silently skipping it.
for path in (output_data_path, output_plot_path):
    os.makedirs(path, exist_ok=True)

#### Main Code

In [143]:
# Load the tab-separated O*NET listings from the db_27_3_text folder.
tools_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Tools Used.txt', sep='\t')
technology_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Technology Skills.txt', sep='\t')

# For every O*NET-SOC Code, count the distinct "Example" entries and the distinct
# "Commodity Code" values. Named aggregation assigns the output column names
# directly instead of renaming them by position afterwards.
num_technologies_per_occ = (
    technology_df
    .groupby('O*NET-SOC Code')
    .agg(
        num_examples=('Example', 'nunique'),
        num_technologies=('Commodity Code', 'nunique'),
    )
    .reset_index()
)

num_tools_per_occ = (
    tools_df
    .groupby('O*NET-SOC Code')
    .agg(
        num_examples=('Example', 'nunique'),
        num_tools=('Commodity Code', 'nunique'),
    )
    .reset_index()
)

# `display` is the notebook (IPython) rich-output helper.
display(num_technologies_per_occ)
display(num_tools_per_occ)
# NOTE: "Commodity Title" is not counted here. Per the original note, counting by
# "Commodity Code" or by "Commodity Title" should give the same result (not verified here).


Unnamed: 0,O*NET-SOC Code,num_examples,num_technologies
0,11-1011.00,49,24
1,11-1011.03,21,17
2,11-1021.00,146,56
3,11-1031.00,32,17
4,11-2011.00,72,29
...,...,...,...
918,53-7071.00,6,6
919,53-7072.00,7,7
920,53-7073.00,7,6
921,53-7081.00,10,9


Unnamed: 0,O*NET-SOC Code,num_examples,num_tools
0,11-1011.00,7,7
1,11-1011.03,10,10
2,11-1021.00,18,17
3,11-1031.00,4,3
4,11-2011.00,12,10
...,...,...,...
897,53-7071.00,17,13
898,53-7072.00,40,35
899,53-7073.00,69,56
900,53-7081.00,16,7


In [133]:
# Load the occupation-level table that carries the task counts, AI-exposure
# fractions and fragmentation index, then preview its first five rows.
occupation_df = pd.read_csv(
    filepath_or_buffer=f'{input_data_path}/computed_objects/fragmentationIndex/occupation_analysis_with_fragmentationIndex.csv'
)
occupation_df.head(5)

Unnamed: 0,O*NET-SOC Code,Occupation Title,num_tasks,manual_fraction,ai_fraction,augmentation_fraction,automation_fraction,gpt4_E0_fraction,gpt4_E1_fraction,gpt4_E2_fraction,gpt4_aiExposure_fraction,human_E0_fraction,human_E1_fraction,human_E2_fraction,human_aiExposure_fraction,fragmentation_index
0,11-1011.00,Chief Executives,31,0.74,0.26,0.26,0.0,0.19,0.1,0.68,0.77,0.45,0.16,0.35,0.52,0.97
1,11-1011.03,Chief Sustainability Officers,18,0.72,0.28,0.28,0.0,0.06,0.17,0.78,0.94,0.28,0.06,0.67,0.72,0.94
2,11-1021.00,General and Operations Managers,17,0.94,0.06,0.06,0.0,0.06,0.0,0.94,0.94,0.35,0.12,0.53,0.65,1.0
3,11-2011.00,Advertising and Promotions Managers,21,0.86,0.14,0.14,0.0,0.05,0.0,0.95,0.95,0.19,0.29,0.52,0.81,1.0
4,11-2021.00,Marketing Managers,20,0.7,0.3,0.3,0.0,0.05,0.05,0.9,0.95,0.05,0.25,0.7,0.95,1.0


In [145]:
# Attach the per-occupation technology and tool counts to the occupation table.
# Both joins are left joins on O*NET-SOC Code, so every occupation row is kept.
handoff_df = (
    occupation_df
    .merge(
        num_technologies_per_occ[['O*NET-SOC Code', 'num_technologies']],
        on='O*NET-SOC Code',
        how='left',
    )
    .merge(
        num_tools_per_occ[['O*NET-SOC Code', 'num_tools']],
        on='O*NET-SOC Code',
        how='left',
    )
)

# Normalize the counts by the number of tasks in each occupation.
handoff_df = handoff_df.assign(
    num_technologies_per_task=handoff_df['num_technologies'] / handoff_df['num_tasks'],
    num_tools_per_task=handoff_df['num_tools'] / handoff_df['num_tasks'],
)

# Restrict the table to the identifier columns and the count/ratio columns.
handoff_df = handoff_df[[
    'O*NET-SOC Code', 'Occupation Title', 'num_tasks',
    'num_technologies', 'num_technologies_per_task',
    'num_tools', 'num_tools_per_task',
]]

# Descending ranks: rank 1 is the largest value, and ties share the lowest rank
# of their group (method='min').
# NOTE(review): .astype(int) needs every ranked value to be non-missing; an
# occupation with no match in the technology table would leave NaN here — confirm.
for col in ('num_tasks', 'num_technologies', 'num_technologies_per_task'):
    handoff_df[f'{col}_rank'] = (
        handoff_df[col].rank(method='min', ascending=False).astype(int)
    )

# Show the table ordered from the worst (largest) per-task technology rank to the best.
handoff_df.sort_values(by='num_technologies_per_task_rank', ascending=False)

Unnamed: 0,O*NET-SOC Code,Occupation Title,num_tasks,num_technologies,num_technologies_per_task,num_tools,num_tools_per_task,num_tasks_rank,num_technologies_rank,num_technologies_per_task_rank
706,49-9063.00,Musical Instrument Repairers and Tuners,37,1,0.03,149,4.03,11,871,872
734,51-3093.00,Food Cooking Machine Operators and Tenders,17,1,0.06,14,0.82,565,871,871
377,29-1151.00,Nurse Anesthetists,24,2,0.08,42,1.75,237,858,870
732,51-3091.00,"Food and Tobacco Roasting, Baking, and Drying ...",20,2,0.10,28,1.40,420,858,868
795,51-9022.00,"Grinding and Polishing Workers, Hand",20,2,0.10,19,0.95,420,858,868
...,...,...,...,...,...,...,...,...,...,...
6,11-3012.00,Administrative Services Managers,8,30,3.75,11,1.38,857,45,5
98,15-1232.00,Computer User Support Specialists,16,73,4.56,14,0.88,612,1,4
101,15-1242.00,Database Administrators,14,64,4.57,6,0.43,718,7,3
95,15-1212.00,Information Security Analysts,12,58,4.83,5,0.42,802,9,2


In [135]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

In [136]:
# Sector granularities to analyze. Only 'sector' is active; the finer levels
# '3-digit', '4-digit', '5-digit' and '6-digit' are currently disabled.
bls_sector_levels = ['sector']

# O*NET/SOC hierarchy, one row per level: (level name, code column, title column).
# The three parallel lists are unpacked column-wise from this single table.
onet_levels, onet_occupation_code_vars, onet_occupation_title_vars = (
    list(column)
    for column in zip(
        ('major', 'Major_Group_Code', 'Major_Group_Title'),
        ('minor', 'Minor_Group_Code', 'Minor_Group_Title'),
        ('broad', 'Broad_Occupation_Code', 'Broad_Occupation_Title'),
        ('detailed', 'Detailed_Occupation_Code', 'Detailed_Occupation_Title'),
    )
)

# Alternative weighting schemes:
#   occ_totalEmpShare  - occupation's share of total employment (sector shares ignored)
#   sectorEmpShare     - share of total employment held by the occupation's sector
#   occ_sectorEmpShare - occupation's share of employment within its sector, sectors weighted equally
weight_cols = [
    'occ_totalEmpShare',
    'sectorEmpShare',
    'occ_sectorEmpShare',
]

# Outcome variables to analyze ('human_aiExposure_fraction' and 'gpt4_E1_fraction'
# are currently disabled).
dependent_var_list = ['ai_fraction', 'human_E1_fraction']


In [137]:
# Granularity choices used by the rest of the analysis.
# NOTE(review): '5-digit' is not in the active `bls_sector_levels` list (it only
# appears in that list's commented-out tail) — confirm which setting is intended.
my_sector = '5-digit'  # Choose from bls_sector_levels
my_onet_level = 'detailed'  # Choose from onet_levels

# Look up the code/title columns for the chosen level from the parallel lists
# defined with `onet_levels`, so they cannot drift out of sync when
# `my_onet_level` is changed. An unknown level raises ValueError right here.
_onet_level_idx = onet_levels.index(my_onet_level)
onet_occupation_code_var = onet_occupation_code_vars[_onet_level_idx]
onet_occupation_title_var = onet_occupation_title_vars[_onet_level_idx]

In [138]:
# Columns that are averaged when collapsing to the chosen SOC level.
handoff_var, ai_outcome_var = 'num_tasks', 'ai_fraction'
ai_exposure_var, fragmentation_index_var = 'human_E1_fraction', 'fragmentation_index'

# Occupation-level measures, one row per O*NET-SOC code in the input file.
occupation_analysis = pd.read_csv(
    f"{input_data_path}/computed_objects/fragmentationIndex/occupation_analysis_with_fragmentationIndex.csv"
)

# Cleaned O*NET task file; used here only for its SOC hierarchy columns.
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the occupation identifiers and SOC hierarchy columns, then reduce
# to one row per (O*NET-SOC Code, chosen-level code) pair.
SOC_mappings = ONET[[
    'O*NET-SOC Code', 'Occupation Title',
    'Major_Group_Code', 'Major_Group_Title',
    'Minor_Group_Code', 'Minor_Group_Title',
    'Broad_Occupation_Code', 'Broad_Occupation_Title',
    'Detailed_Occupation_Code', 'Detailed_Occupation_Title',
]].drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Attach the SOC hierarchy to every occupation row (left join keeps all occupations).
occupation_analysis = occupation_analysis.merge(
    SOC_mappings,
    how='left',
    on=['O*NET-SOC Code', 'Occupation Title'],
)

# Collapse to the chosen SOC level with an unweighted mean of each measure.
# NOTE(review): groupby drops rows whose group keys are missing by default, so
# occupations left unmatched by the merge above would vanish here — confirm none are.
occupation_analysis = (
    occupation_analysis
    .groupby([onet_occupation_code_var, onet_occupation_title_var])
    .agg(dict.fromkeys(
        [handoff_var, ai_outcome_var, ai_exposure_var, fragmentation_index_var],
        'mean',
    ))
    .reset_index()
)

occupation_analysis

Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,num_tasks,ai_fraction,human_E1_fraction,fragmentation_index
0,11-1011,Chief Executives,24.50,0.27,0.11,0.96
1,11-1021,General and Operations Managers,17.00,0.06,0.12,1.00
2,11-2011,Advertising and Promotions Managers,21.00,0.14,0.29,1.00
3,11-2021,Marketing Managers,20.00,0.30,0.25,1.00
4,11-2022,Sales Managers,17.00,0.18,0.12,1.00
...,...,...,...,...,...,...
753,53-7071,Gas Compressor and Gas Pumping Station Operators,13.00,0.00,0.00,1.00
754,53-7072,"Pump Operators, Except Wellhead Pumpers",14.00,0.00,0.00,1.00
755,53-7073,Wellhead Pumpers,16.00,0.00,0.00,1.00
756,53-7081,Refuse and Recyclable Material Collectors,16.00,0.00,0.12,1.00
