#### By: Peyman Shahidi
#### Created: Nov 7, 2025
#### Last Edit: Nov 10, 2025

<br>

In [41]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [42]:
FREQUENT_TASKS = False  # Whether to use only frequent tasks or all tasks

In [43]:
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
output_data_path = f'{input_data_path}/computed_objects/fragmentationIndex_frequent' if FREQUENT_TASKS else f'{input_data_path}/computed_objects/fragmentationIndex'
output_plot_path = f"{main_folder_path}/writeup/plots"

In [44]:
# Create directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

In [45]:
def create_occupation_analysis(df, onet_occupation_code_var, onet_occupation_title_var):
    # Create occupation-level analysis for scatter plots
    # Group by occupation and calculate label fractions and task counts
    occupation_stats = []

    for (soc_code, occ_title), group in df.groupby([onet_occupation_code_var, onet_occupation_title_var]):
        num_tasks = group['Task ID'].nunique()
        # num_occupations = group[onet_occupation_code_var].nunique()
        total_tasks = len(group)
        
        manual_fraction = (group['label'] == 'Manual').sum() / total_tasks
        augmentation_fraction = (group['label'] == 'Augmentation').sum() / total_tasks  
        automation_fraction = (group['label'] == 'Automation').sum() / total_tasks
        ai_fraction = augmentation_fraction + automation_fraction
        gpt4_E0_fraction = (group['gpt4_exposure'] == 'E0').sum() / total_tasks
        gpt4_E1_fraction = (group['gpt4_exposure'] == 'E1').sum() / total_tasks
        gpt4_E2_fraction = (group['gpt4_exposure'] == 'E2').sum() / total_tasks
        gpt4_aiExposure_fraction = gpt4_E1_fraction + gpt4_E2_fraction
        human_E0_fraction = (group['human_labels'] == 'E0').sum() / total_tasks
        human_E1_fraction = (group['human_labels'] == 'E1').sum() / total_tasks
        human_E2_fraction = (group['human_labels'] == 'E2').sum() / total_tasks
        human_aiExposure_fraction = human_E1_fraction + human_E2_fraction

        
        occupation_stats.append({
            f'{onet_occupation_code_var}': soc_code,
            f'{onet_occupation_title_var}': occ_title,
            'num_tasks': num_tasks,
            # 'num_occupations': num_occupations,
            'manual_fraction': manual_fraction,
            'ai_fraction': ai_fraction,
            'augmentation_fraction': augmentation_fraction,
            'automation_fraction': automation_fraction,
            'gpt4_E0_fraction': gpt4_E0_fraction,
            'gpt4_E1_fraction': gpt4_E1_fraction,
            'gpt4_E2_fraction': gpt4_E2_fraction,
            'gpt4_aiExposure_fraction': gpt4_aiExposure_fraction,
            'human_E0_fraction': human_E0_fraction,
            'human_E1_fraction': human_E1_fraction,
            'human_E2_fraction': human_E2_fraction,
            'human_aiExposure_fraction': human_aiExposure_fraction
        })

    occupation_analysis = pd.DataFrame(occupation_stats)

    return occupation_analysis

In [46]:
# Read the merged data
if not FREQUENT_TASKS:
    input_file_path = f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv" 
    merged_data = pd.read_csv(input_file_path)

        # Remove occupations with three or less frequent tasks
    frequent_tasks_per_occupation_threshold = 3
    occupation_task_counts = merged_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
    valid_occupations = occupation_task_counts[occupation_task_counts >= frequent_tasks_per_occupation_threshold].index
    merged_data = merged_data[merged_data['O*NET-SOC Code'].isin(valid_occupations)].reset_index(drop=True)
else:
    input_file_path = f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT_frequent/ONET_Eloundou_Anthropic_GPT.csv"
    merged_data = pd.read_csv(input_file_path)

In [47]:
# Define levels and variables
onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']
dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']

## Calculate Fragmentation Index treating all AI tasks similarly and focusing on consecutive placements of AI tasks

In [48]:
# Calculate fragmentation index
# For the sake of this exercise we treat all AI tasks as a single category

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_ai column
fi_df = merged_data.copy()
fi_df['is_ai'] = fi_df['label'].isin(['Augmentation', 'Automation']).astype(int)


# Create next_is_ai column within occupation groups
fi_df['next_is_ai'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_ai'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['fi_counter'] = 1
fi_df.loc[(fi_df['is_ai'] == 1) & (fi_df['next_is_ai'] == 1), 'fi_counter'] = 0

fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'fi_counter': 'fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex.csv", index=False)
display(fi_df.head(10))

# Display FI for certain occupations
display_list = ['15-1251.00', '27-3042.00', '27-3043.00']  # Computer Programmers, Technical Writers, Writers and Authors
display(fi_df[fi_df['O*NET-SOC Code'].isin(display_list)])
print(f'Average Fragmentation Index: {fi_df["fragmentation_index"].mean():.4f}')

Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
0,11-1011.00,Chief Executives,0.97
1,11-1011.03,Chief Sustainability Officers,0.94
2,11-1021.00,General and Operations Managers,1.0
3,11-2011.00,Advertising and Promotions Managers,1.0
4,11-2021.00,Marketing Managers,1.0
5,11-2022.00,Sales Managers,1.0
6,11-3012.00,Administrative Services Managers,1.0
7,11-3021.00,Computer and Information Systems Managers,1.0
8,11-3031.00,Financial Managers,0.9
9,11-3031.01,Treasurers and Controllers,1.0


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
105,15-1251.00,Computer Programmers,0.71
339,27-3042.00,Technical Writers,0.8
340,27-3043.00,Writers and Authors,0.5


Average Fragmentation Index: 0.9610


In [49]:
# Merge fragmentation index with occupation analysis
occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Save occupation analysis with fragmentation index
occupation_analysis.to_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex.csv", index=False)

In [50]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read OG occupation analysis with SOC mappings
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant 
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [51]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

ai_exposure_var = 'human_E1_fraction'

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    ai_exposure_var: 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

# Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['fragmentation_index', ai_exposure_var, f'fragmentation_index:{ai_exposure_var}', 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index * {ai_exposure_var} + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",
    cov_kwds={
        "groups": occupation_analysis_aggregated[onet_occupation_code_var],
        "use_correction": True,
        "df_correction": True
    })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index * {ai_exposure_var} + C(Major_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index * {ai_exposure_var} + C(Minor_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                                       coef   se      t    p  ci_low  ci_high
fragmentation_index                   -1.87 0.07 -26.93 0.00   -2.01    -1.73
human_E1_fraction                     -1.23 0.26  -4.80 0.00   -1.73    -0.73
fragmentation_index:human_E1_fraction  1.54 0.27   5.78 0.00    1.02     2.06
num_tasks                              0.00 0.00   1.28 0.20   -0.00     0.00


=== Model B (Major FE) — selected coefficients ===
                                       coef   se      t    p  ci_low  ci_high
fragmentation_index                   -1.50 0.07 -20.52 0.00   -1.65    -1.36
human_E1_fraction                     -0.63 0.23  -2.69 0.01   -1.09    -0.17
fragmentation_index:human_E1_fraction  0.80 0.25   3.20 0.00    0.31     1.29
num_tasks                             -0.00 0.00  -0.15 0.88   -0.00     0.00


=== Model C (Minor FE) — selected coefficients ===
                                       coef   se      t    p  ci_low  ci

### Redefine Fragmentation Index treating Chains as Run of Consecutive Automated Tasks terminated by an Augmented Task (Model Definition)

In [52]:
# Calculate fragmentation index
# For the sake of this exercise we define AI chains as a number of Automated tasks terminated by an Augmented Task

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_automated column
fi_df = merged_data.copy()
fi_df['is_automated'] = fi_df['label'].isin(['Automation']).astype(int)
fi_df['is_augmented'] = fi_df['label'].isin(['Augmentation']).astype(int)


# Create next_is_automated column within occupation groups
fi_df['next_is_automated'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_automated'].shift(-1).fillna(0).astype(int)
fi_df['next_is_augmented'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_augmented'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
# Determine if task is part of an AI chain
fi_df['fi_counter'] = 1
ai_chain_indicator = (fi_df['is_automated'] == 1) & ((fi_df['next_is_automated'] == 1) | (fi_df['next_is_augmented'] == 1))
fi_df.loc[ai_chain_indicator, 'fi_counter'] = 0

# Display fragmentation index for computer programmers
display(fi_df[['O*NET-SOC Code', 'Occupation Title', 'Task Position', 'label', 
               'is_automated', 'next_is_automated', 'next_is_augmented', 'fi_counter']][fi_df['O*NET-SOC Code']=='15-1251.00'])


fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'fi_counter': 'fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex_modelDefinition.csv", index=False)

# Display FI for certain occupations
display_list = ['15-1251.00', '27-3042.00', '27-3043.00']  # Computer Programmers, Technical Writers, Writers and Authors
display(fi_df[fi_df['O*NET-SOC Code'].isin(display_list)])
print(f'Average Fragmentation Index: {fi_df["fragmentation_index"].mean():.4f}')

Unnamed: 0,O*NET-SOC Code,Occupation Title,Task Position,label,is_automated,next_is_automated,next_is_augmented,fi_counter
2168,15-1251.00,Computer Programmers,1,Augmentation,0,0,0,1
2169,15-1251.00,Computer Programmers,2,Manual,0,1,0,1
2170,15-1251.00,Computer Programmers,3,Automation,1,0,1,0
2171,15-1251.00,Computer Programmers,4,Augmentation,0,1,0,1
2172,15-1251.00,Computer Programmers,5,Automation,1,0,0,1
2173,15-1251.00,Computer Programmers,6,Manual,0,0,1,1
2174,15-1251.00,Computer Programmers,7,Augmentation,0,0,0,1
2175,15-1251.00,Computer Programmers,8,Manual,0,0,1,1
2176,15-1251.00,Computer Programmers,9,Augmentation,0,1,0,1
2177,15-1251.00,Computer Programmers,10,Automation,1,1,0,0


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
105,15-1251.00,Computer Programmers,0.82
339,27-3042.00,Technical Writers,0.93
340,27-3043.00,Writers and Authors,1.0


Average Fragmentation Index: 0.9874


In [53]:
# Merge fragmentation index with occupation analysis
occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Save occupation analysis with fragmentation index
occupation_analysis.to_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex_modelDefinition.csv", index=False)

In [54]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [55]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

ai_exposure_var = 'human_E1_fraction'

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    ai_exposure_var: 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

# Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['fragmentation_index', ai_exposure_var, f'fragmentation_index:{ai_exposure_var}', 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index * {ai_exposure_var} + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index * {ai_exposure_var} + C(Major_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index * {ai_exposure_var} + C(Minor_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                                       coef   se      t    p  ci_low  ci_high
fragmentation_index                   -3.05 0.28 -11.07 0.00   -3.59    -2.51
human_E1_fraction                     -1.38 0.96  -1.43 0.15   -3.27     0.51
fragmentation_index:human_E1_fraction  1.83 0.97   1.88 0.06   -0.08     3.73
num_tasks                             -0.00 0.00  -0.18 0.86   -0.00     0.00


=== Model B (Major FE) — selected coefficients ===
                                       coef   se     t    p  ci_low  ci_high
fragmentation_index                   -2.11 0.25 -8.50 0.00   -2.59    -1.62
human_E1_fraction                     -0.37 0.82 -0.45 0.65   -1.97     1.23
fragmentation_index:human_E1_fraction  0.58 0.83  0.70 0.48   -1.04     2.20
num_tasks                             -0.00 0.00 -1.58 0.11   -0.00     0.00


=== Model C (Minor FE) — selected coefficients ===
                                       coef   se     t    p  ci_low  ci_high


### Create a fragmentation measure using AI Exposure instead of AI execution (treating just E1 tasks as exposed)

In [56]:
# Calculate fragmentation index
# For the sake of this exercise we treat all AI tasks as a single category

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_ai column
fi_df = merged_data.copy()
fi_df['is_ai_exposed'] = fi_df['human_labels'].isin(['E1']).astype(int)


# Create next_is_ai column within occupation groups
fi_df['next_is_ai_exposed'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_ai_exposed'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['exposure_fi_counter'] = 1
fi_df.loc[(fi_df['is_ai_exposed'] == 1) & (fi_df['next_is_ai_exposed'] == 1), 'exposure_fi_counter'] = 0

fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['exposure_fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'exposure_fi_counter': 'exposure_fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex_exposure.csv", index=False)
display(fi_df.head(10))

# Display FI for certain occupations
display_list = ['15-1251.00', '27-3042.00', '27-3043.00']  # Computer Programmers, Technical Writers, Writers and Authors
display(fi_df[fi_df['O*NET-SOC Code'].isin(display_list)])
print(f'Average Fragmentation Index: {fi_df["exposure_fragmentation_index"].mean():.4f}')

Unnamed: 0,O*NET-SOC Code,Occupation Title,exposure_fragmentation_index
0,11-1011.00,Chief Executives,0.94
1,11-1011.03,Chief Sustainability Officers,1.0
2,11-1021.00,General and Operations Managers,1.0
3,11-2011.00,Advertising and Promotions Managers,0.95
4,11-2021.00,Marketing Managers,0.9
5,11-2022.00,Sales Managers,1.0
6,11-3012.00,Administrative Services Managers,1.0
7,11-3021.00,Computer and Information Systems Managers,0.94
8,11-3031.00,Financial Managers,1.0
9,11-3031.01,Treasurers and Controllers,1.0


Unnamed: 0,O*NET-SOC Code,Occupation Title,exposure_fragmentation_index
105,15-1251.00,Computer Programmers,0.82
339,27-3042.00,Technical Writers,0.93
340,27-3043.00,Writers and Authors,0.75


Average Fragmentation Index: 0.9617


In [57]:
# Merge fragmentation index with occupation analysis
occupation_analysis = pd.read_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex.csv")

occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [58]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read OG occupation analysis with SOC mappings
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant 
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [59]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    'exposure_fragmentation_index': 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

#Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['exposure_fragmentation_index', 'num_tasks', 'exposure_fragmentation_index:num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index * num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index * num_tasks + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index * num_tasks + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                                        coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index            0.26 0.30  0.87 0.38   -0.33     0.86
num_tasks                              -0.01 0.01 -0.98 0.33   -0.04     0.01
exposure_fragmentation_index:num_tasks  0.01 0.01  0.95 0.34   -0.01     0.04


=== Model B (Major FE) — selected coefficients ===
                                        coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index            0.53 0.28  1.88 0.06   -0.02     1.09
num_tasks                               0.02 0.01  1.33 0.18   -0.01     0.04
exposure_fragmentation_index:num_tasks -0.02 0.01 -1.26 0.21   -0.04     0.01


=== Model C (Minor FE) — selected coefficients ===
                                        coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index            0.64 0.27  2.37 0.02    0.11     1.16
num_tasks                               0.03 0.01  2.26 0.02    0.00    

In [60]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    'exposure_fragmentation_index': 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

#Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['exposure_fragmentation_index', 'num_tasks', 'exposure_fragmentation_index:num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index * num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index * num_tasks + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index * num_tasks + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                                        coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index           -0.18 0.36 -0.50 0.62   -0.89     0.52
num_tasks                               0.05 0.02  2.86 0.00    0.02     0.08
exposure_fragmentation_index:num_tasks -0.05 0.02 -2.79 0.01   -0.08    -0.01


=== Model B (Major FE) — selected coefficients ===
                                        coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index           -0.64 0.29 -2.19 0.03   -1.21    -0.07
num_tasks                              -0.01 0.01 -0.88 0.38   -0.04     0.01
exposure_fragmentation_index:num_tasks  0.01 0.01  0.79 0.43   -0.02     0.04


=== Model C (Minor FE) — selected coefficients ===
                                        coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index           -0.71 0.28 -2.58 0.01   -1.25    -0.17
num_tasks                              -0.03 0.01 -1.87 0.06   -0.05    