#### By: Peyman Shahidi
#### Created: Nov 7, 2025
#### Last Edit: Nov 7, 2025

<br>

In [182]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [183]:
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
output_data_path = f'{input_data_path}/computed_objects/fragmentationIndex_frequent'
output_plot_path = f"{main_folder_path}/writeup/plots"

In [184]:
# Create directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

In [185]:
def create_occupation_analysis(df, onet_occupation_code_var, onet_occupation_title_var):
    # Create occupation-level analysis for scatter plots
    # Group by occupation and calculate label fractions and task counts
    occupation_stats = []

    for (soc_code, occ_title), group in df.groupby([onet_occupation_code_var, onet_occupation_title_var]):
        num_tasks = group['Task ID'].nunique()
        # num_occupations = group[onet_occupation_code_var].nunique()
        total_tasks = len(group)
        
        manual_fraction = (group['label'] == 'Manual').sum() / total_tasks
        augmentation_fraction = (group['label'] == 'Augmentation').sum() / total_tasks  
        automation_fraction = (group['label'] == 'Automation').sum() / total_tasks
        ai_fraction = augmentation_fraction + automation_fraction
        gpt4_E0_fraction = (group['gpt4_exposure'] == 'E0').sum() / total_tasks
        gpt4_E1_fraction = (group['gpt4_exposure'] == 'E1').sum() / total_tasks
        gpt4_E2_fraction = (group['gpt4_exposure'] == 'E2').sum() / total_tasks
        gpt4_aiExposure_fraction = gpt4_E1_fraction + gpt4_E2_fraction
        human_E0_fraction = (group['human_labels'] == 'E0').sum() / total_tasks
        human_E1_fraction = (group['human_labels'] == 'E1').sum() / total_tasks
        human_E2_fraction = (group['human_labels'] == 'E2').sum() / total_tasks
        human_aiExposure_fraction = human_E1_fraction + human_E2_fraction

        
        occupation_stats.append({
            f'{onet_occupation_code_var}': soc_code,
            f'{onet_occupation_title_var}': occ_title,
            'num_tasks': num_tasks,
            # 'num_occupations': num_occupations,
            'manual_fraction': manual_fraction,
            'ai_fraction': ai_fraction,
            'augmentation_fraction': augmentation_fraction,
            'automation_fraction': automation_fraction,
            'gpt4_E0_fraction': gpt4_E0_fraction,
            'gpt4_E1_fraction': gpt4_E1_fraction,
            'gpt4_E2_fraction': gpt4_E2_fraction,
            'gpt4_aiExposure_fraction': gpt4_aiExposure_fraction,
            'human_E0_fraction': human_E0_fraction,
            'human_E1_fraction': human_E1_fraction,
            'human_E2_fraction': human_E2_fraction,
            'human_aiExposure_fraction': human_aiExposure_fraction
        })

    occupation_analysis = pd.DataFrame(occupation_stats)

    return occupation_analysis

In [186]:
# Read the merged data
# merged_data = pd.read_csv(f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv")


merged_data = pd.read_csv(f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT_frequent/ONET_Eloundou_Anthropic_GPT.csv")

# Remove occupations with three or less frequent tasks
frequent_tasks_per_occupation_threshold = 3
occupation_task_counts = merged_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
valid_occupations = occupation_task_counts[occupation_task_counts >= frequent_tasks_per_occupation_threshold].index
merged_data = merged_data[merged_data['O*NET-SOC Code'].isin(valid_occupations)].reset_index(drop=True)

In [187]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

In [188]:
# Define levels and variables
onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']
dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']

## Calculate Fragmentation Index treating all AI tasks similarly and focusing on consecutive placements of AI tasks

In [189]:
# Calculate fragmentation index
# For the sake of this exercise we treat all AI tasks as a single category

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_ai column
fi_df = merged_data.copy()
fi_df['is_ai'] = fi_df['label'].isin(['Augmentation', 'Automation']).astype(int)


# Create next_is_ai column within occupation groups
fi_df['next_is_ai'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_ai'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['fi_counter'] = 1
fi_df.loc[(fi_df['is_ai'] == 1) & (fi_df['next_is_ai'] == 1), 'fi_counter'] = 0

fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'fi_counter': 'fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex.csv", index=False)
display(fi_df.head(10))
fi_df[fi_df['O*NET-SOC Code']=='15-1251.00']

Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
0,11-3031.01,Treasurers and Controllers,1.0
1,11-3031.03,Investment Fund Managers,1.0
2,11-3051.02,Geothermal Production Managers,1.0
3,11-3051.03,Biofuels Production Managers,1.0
4,11-3051.04,Biomass Power Plant Managers,1.0
5,11-3061.00,Purchasing Managers,1.0
6,11-3071.00,"Transportation, Storage, and Distribution Mana...",1.0
7,11-9051.00,Food Service Managers,1.0
8,11-9071.00,Gambling Managers,1.0
9,11-9081.00,Lodging Managers,1.0


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index


In [190]:
# Merge fragmentation index with occupation analysis
occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Save occupation analysis with fragmentation index
occupation_analysis.to_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex.csv", index=False)

In [191]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read OG occupation analysis with SOC mappings
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant 
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [192]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

ai_exposure_var = 'human_E1_fraction'

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    ai_exposure_var: 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

# Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['fragmentation_index', ai_exposure_var, 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",
    cov_kwds={
        "groups": occupation_analysis_aggregated[onet_occupation_code_var],
        "use_correction": True,
        "df_correction": True
    })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Major_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Minor_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.71 0.09 -19.60 0.00   -1.88    -1.54
human_E1_fraction    0.24 0.04   6.39 0.00    0.16     0.31
num_tasks           -0.00 0.00  -3.49 0.00   -0.00    -0.00


=== Model B (Major FE) — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.52 0.08 -18.51 0.00   -1.68    -1.36
human_E1_fraction    0.19 0.04   4.65 0.00    0.11     0.28
num_tasks           -0.00 0.00  -3.50 0.00   -0.00    -0.00


=== Model C (Minor FE) — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.42 0.09 -15.77 0.00   -1.60    -1.24
human_E1_fraction    0.26 0.05   5.43 0.00    0.16     0.35
num_tasks           -0.00 0.00  -1.88 0.06   -0.00     0.00


### Redefine Fragmentation Index treating Chains as Run of Consecutive Automated Tasks terminated by an Augmented Task

In [193]:
# Calculate fragmentation index
# For the sake of this exercise we define AI chains as a number of Automated tasks terminated by an Augmented Task

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_automated column
fi_df = merged_data.copy()
fi_df['is_automated'] = fi_df['label'].isin(['Automation']).astype(int)
fi_df['is_augmented'] = fi_df['label'].isin(['Augmentation']).astype(int)


# Create next_is_automated column within occupation groups
fi_df['next_is_automated'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_automated'].shift(-1).fillna(0).astype(int)
fi_df['next_is_augmented'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_augmented'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['fi_counter'] = 1
fi_df.loc[(fi_df['is_automated'] == 1) & ((fi_df['next_is_automated'] == 1) | (fi_df['next_is_augmented'] == 1)), 'fi_counter'] = 0

# Display fragmentation index for computer programmers
display(fi_df[['O*NET-SOC Code', 'Occupation Title', 'Task Position', 'label', 
               'is_automated', 'next_is_automated', 'next_is_augmented', 'fi_counter']][fi_df['O*NET-SOC Code']=='15-1251.00'])


fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'fi_counter': 'fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex_modelDefinition.csv", index=False)
fi_df[fi_df['O*NET-SOC Code']=='15-1251.00']

Unnamed: 0,O*NET-SOC Code,Occupation Title,Task Position,label,is_automated,next_is_automated,next_is_augmented,fi_counter


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index


In [194]:
# Merge fragmentation index with occupation analysis
occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Save occupation analysis with fragmentation index
occupation_analysis.to_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex_modelDefinition.csv", index=False)

In [195]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [196]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

ai_exposure_var = 'human_E1_fraction'

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    ai_exposure_var: 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

# Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['fragmentation_index', ai_exposure_var, 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Major_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Minor_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.74 0.16 -10.85 0.00   -2.05    -1.42
human_E1_fraction    0.33 0.05   6.85 0.00    0.24     0.43
num_tasks           -0.00 0.00  -3.37 0.00   -0.01    -0.00


=== Model B (Major FE) — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.48 0.14 -10.26 0.00   -1.77    -1.20
human_E1_fraction    0.24 0.05   4.48 0.00    0.14     0.35
num_tasks           -0.00 0.00  -3.67 0.00   -0.01    -0.00


=== Model C (Minor FE) — selected coefficients ===
                     coef   se     t    p  ci_low  ci_high
fragmentation_index -1.37 0.21 -6.65 0.00   -1.78    -0.97
human_E1_fraction    0.33 0.06  5.24 0.00    0.21     0.45
num_tasks           -0.00 0.00 -2.37 0.02   -0.00    -0.00


### Create a fragmentation measure using AI Exposure instead of AI execution (treating just E1 tasks as exposed)

In [197]:
# Calculate fragmentation index
# For the sake of this exercise we treat all AI tasks as a single category

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_ai column
fi_df = merged_data.copy()
fi_df['is_ai_exposed'] = fi_df['human_labels'].isin(['E1']).astype(int)


# Create next_is_ai column within occupation groups
fi_df['next_is_ai_exposed'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_ai_exposed'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['exposure_fi_counter'] = 1
fi_df.loc[(fi_df['is_ai_exposed'] == 1) & (fi_df['next_is_ai_exposed'] == 1), 'exposure_fi_counter'] = 0

fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['exposure_fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'exposure_fi_counter': 'exposure_fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex_exposure.csv", index=False)
display(fi_df.head(10))
fi_df[fi_df['O*NET-SOC Code']=='15-1251.00']

Unnamed: 0,O*NET-SOC Code,Occupation Title,exposure_fragmentation_index
0,11-3031.01,Treasurers and Controllers,1.0
1,11-3031.03,Investment Fund Managers,1.0
2,11-3051.02,Geothermal Production Managers,1.0
3,11-3051.03,Biofuels Production Managers,1.0
4,11-3051.04,Biomass Power Plant Managers,1.0
5,11-3061.00,Purchasing Managers,1.0
6,11-3071.00,"Transportation, Storage, and Distribution Mana...",1.0
7,11-9051.00,Food Service Managers,1.0
8,11-9071.00,Gambling Managers,0.86
9,11-9081.00,Lodging Managers,1.0


Unnamed: 0,O*NET-SOC Code,Occupation Title,exposure_fragmentation_index


In [198]:
# Merge fragmentation index with occupation analysis
occupation_analysis = pd.read_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex.csv")

occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [199]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read OG occupation analysis with SOC mappings
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant 
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [200]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    'exposure_fragmentation_index': 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

#Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['exposure_fragmentation_index', 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index + num_tasks + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index + num_tasks + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                              coef   se    t    p  ci_low  ci_high
exposure_fragmentation_index  0.09 0.06 1.59 0.11   -0.02     0.21
num_tasks                     0.00 0.00 1.89 0.06   -0.00     0.00


=== Model B (Major FE) — selected coefficients ===
                              coef   se    t    p  ci_low  ci_high
exposure_fragmentation_index  0.03 0.06 0.56 0.58   -0.09     0.15
num_tasks                     0.00 0.00 1.87 0.06   -0.00     0.00


=== Model C (Minor FE) — selected coefficients ===
                              coef   se    t    p  ci_low  ci_high
exposure_fragmentation_index  0.07 0.07 1.06 0.29   -0.06     0.20
num_tasks                     0.00 0.00 1.71 0.09   -0.00     0.00


In [201]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    'exposure_fragmentation_index': 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

#Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['exposure_fragmentation_index', 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index + num_tasks + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index + num_tasks + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index -0.33 0.11 -3.01 0.00   -0.54    -0.11
num_tasks                    -0.00 0.00 -4.11 0.00   -0.01    -0.00


=== Model B (Major FE) — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index -0.15 0.13 -1.20 0.23   -0.40     0.10
num_tasks                    -0.00 0.00 -3.73 0.00   -0.01    -0.00


=== Model C (Minor FE) — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index -0.28 0.15 -1.92 0.05   -0.57     0.01
num_tasks                    -0.00 0.00 -2.15 0.03   -0.00    -0.00
