#### By: Peyman Shahidi
#### Created: Nov 7, 2025
#### Last Edit: Nov 7, 2025

<br>

In [442]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [443]:
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
output_data_path = f'{input_data_path}/computed_objects/fragmentationIndex'
output_plot_path = f"{main_folder_path}/writeup/plots"

In [444]:
# Create directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

In [445]:
def create_occupation_analysis(df, onet_occupation_code_var, onet_occupation_title_var):
    # Create occupation-level analysis for scatter plots
    # Group by occupation and calculate label fractions and task counts
    occupation_stats = []

    for (soc_code, occ_title), group in df.groupby([onet_occupation_code_var, onet_occupation_title_var]):
        num_tasks = group['Task ID'].nunique()
        # num_occupations = group[onet_occupation_code_var].nunique()
        total_tasks = len(group)
        
        manual_fraction = (group['label'] == 'Manual').sum() / total_tasks
        augmentation_fraction = (group['label'] == 'Augmentation').sum() / total_tasks  
        automation_fraction = (group['label'] == 'Automation').sum() / total_tasks
        ai_fraction = augmentation_fraction + automation_fraction
        gpt4_E0_fraction = (group['gpt4_exposure'] == 'E0').sum() / total_tasks
        gpt4_E1_fraction = (group['gpt4_exposure'] == 'E1').sum() / total_tasks
        gpt4_E2_fraction = (group['gpt4_exposure'] == 'E2').sum() / total_tasks
        gpt4_aiExposure_fraction = gpt4_E1_fraction + gpt4_E2_fraction
        human_E0_fraction = (group['human_labels'] == 'E0').sum() / total_tasks
        human_E1_fraction = (group['human_labels'] == 'E1').sum() / total_tasks
        human_E2_fraction = (group['human_labels'] == 'E2').sum() / total_tasks
        human_aiExposure_fraction = human_E1_fraction + human_E2_fraction

        
        occupation_stats.append({
            f'{onet_occupation_code_var}': soc_code,
            f'{onet_occupation_title_var}': occ_title,
            'num_tasks': num_tasks,
            # 'num_occupations': num_occupations,
            'manual_fraction': manual_fraction,
            'ai_fraction': ai_fraction,
            'augmentation_fraction': augmentation_fraction,
            'automation_fraction': automation_fraction,
            'gpt4_E0_fraction': gpt4_E0_fraction,
            'gpt4_E1_fraction': gpt4_E1_fraction,
            'gpt4_E2_fraction': gpt4_E2_fraction,
            'gpt4_aiExposure_fraction': gpt4_aiExposure_fraction,
            'human_E0_fraction': human_E0_fraction,
            'human_E1_fraction': human_E1_fraction,
            'human_E2_fraction': human_E2_fraction,
            'human_aiExposure_fraction': human_aiExposure_fraction
        })

    occupation_analysis = pd.DataFrame(occupation_stats)

    return occupation_analysis

In [446]:
# Read the merged data
merged_data = pd.read_csv(f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv")

In [447]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

In [448]:
# Define levels and variables
onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']
dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']

## Calculate Fragmentation Index treating all AI tasks similarly and focusing on consecutive placements of AI tasks

In [449]:
# Calculate fragmentation index
# For the sake of this exercise we treat all AI tasks as a single category

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_ai column
fi_df = merged_data.copy()
fi_df['is_ai'] = fi_df['label'].isin(['Augmentation', 'Automation']).astype(int)


# Create next_is_ai column within occupation groups
fi_df['next_is_ai'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_ai'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['fi_counter'] = 1
fi_df.loc[(fi_df['is_ai'] == 1) & (fi_df['next_is_ai'] == 1), 'fi_counter'] = 0

fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'fi_counter': 'fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex.csv", index=False)
display(fi_df.head(10))
fi_df[fi_df['O*NET-SOC Code']=='15-1251.00']

Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
0,11-1011.00,Chief Executives,0.97
1,11-1011.03,Chief Sustainability Officers,0.94
2,11-1021.00,General and Operations Managers,1.0
3,11-2011.00,Advertising and Promotions Managers,1.0
4,11-2021.00,Marketing Managers,1.0
5,11-2022.00,Sales Managers,1.0
6,11-3012.00,Administrative Services Managers,1.0
7,11-3021.00,Computer and Information Systems Managers,1.0
8,11-3031.00,Financial Managers,0.9
9,11-3031.01,Treasurers and Controllers,1.0


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
105,15-1251.00,Computer Programmers,0.71


In [450]:
# Merge fragmentation index with occupation analysis
occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Save occupation analysis with fragmentation index
occupation_analysis.to_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex.csv", index=False)

In [451]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read OG occupation analysis with SOC mappings
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant 
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [452]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

ai_exposure_var = 'human_E1_fraction'

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    ai_exposure_var: 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

# Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['fragmentation_index', ai_exposure_var, 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Major_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Minor_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.47 0.04 -34.05 0.00   -1.56    -1.39
human_E1_fraction    0.26 0.02  13.70 0.00    0.22     0.29
num_tasks            0.00 0.00   1.98 0.05    0.00     0.00


=== Model B (Major FE) — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.29 0.05 -28.64 0.00   -1.38    -1.20
human_E1_fraction    0.13 0.02   5.09 0.00    0.08     0.17
num_tasks            0.00 0.00   0.19 0.85   -0.00     0.00


=== Model C (Minor FE) — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.24 0.05 -24.63 0.00   -1.34    -1.14
human_E1_fraction    0.11 0.03   3.79 0.00    0.05     0.17
num_tasks            0.00 0.00   0.20 0.84   -0.00     0.00


### Redefine Fragmentation Index treating Chains as Run of Consecutive Automated Tasks terminated by an Augmented Task

In [453]:
# Calculate fragmentation index
# For the sake of this exercise we define AI chains as a number of Automated tasks terminated by an Augmented Task

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_automated column
fi_df = merged_data.copy()
fi_df['is_automated'] = fi_df['label'].isin(['Automation']).astype(int)
fi_df['is_augmented'] = fi_df['label'].isin(['Augmentation']).astype(int)


# Create next_is_automated column within occupation groups
fi_df['next_is_automated'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_automated'].shift(-1).fillna(0).astype(int)
fi_df['next_is_augmented'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_augmented'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['fi_counter'] = 1
fi_df.loc[(fi_df['is_automated'] == 1) & ((fi_df['next_is_automated'] == 1) | (fi_df['next_is_augmented'] == 1)), 'fi_counter'] = 0

# Display fragmentation index for computer programmers
display(fi_df[['O*NET-SOC Code', 'Occupation Title', 'Task Position', 'label', 
               'is_automated', 'next_is_automated', 'next_is_augmented', 'fi_counter']][fi_df['O*NET-SOC Code']=='15-1251.00'])


fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'fi_counter': 'fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex_modelDefinition.csv", index=False)
fi_df[fi_df['O*NET-SOC Code']=='15-1251.00']

Unnamed: 0,O*NET-SOC Code,Occupation Title,Task Position,label,is_automated,next_is_automated,next_is_augmented,fi_counter
2168,15-1251.00,Computer Programmers,1,Augmentation,0,0,0,1
2169,15-1251.00,Computer Programmers,2,Manual,0,1,0,1
2170,15-1251.00,Computer Programmers,3,Automation,1,0,1,0
2171,15-1251.00,Computer Programmers,4,Augmentation,0,1,0,1
2172,15-1251.00,Computer Programmers,5,Automation,1,0,0,1
2173,15-1251.00,Computer Programmers,6,Manual,0,0,1,1
2174,15-1251.00,Computer Programmers,7,Augmentation,0,0,0,1
2175,15-1251.00,Computer Programmers,8,Manual,0,0,1,1
2176,15-1251.00,Computer Programmers,9,Augmentation,0,1,0,1
2177,15-1251.00,Computer Programmers,10,Automation,1,1,0,0


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
105,15-1251.00,Computer Programmers,0.82


In [454]:
# Merge fragmentation index with occupation analysis
occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Save occupation analysis with fragmentation index
occupation_analysis.to_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex_modelDefinition.csv", index=False)

In [455]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [456]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

ai_exposure_var = 'human_E1_fraction'

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    ai_exposure_var: 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

# Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['fragmentation_index', ai_exposure_var, 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Major_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Minor_Group_Code) + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -2.58 0.12 -22.33 0.00   -2.81    -2.36
human_E1_fraction    0.43 0.03  14.74 0.00    0.37     0.49
num_tasks            0.00 0.00   0.00 1.00   -0.00     0.00


=== Model B (Major FE) — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.96 0.15 -12.74 0.00   -2.26    -1.66
human_E1_fraction    0.20 0.04   5.50 0.00    0.13     0.27
num_tasks           -0.00 0.00  -1.57 0.12   -0.00     0.00


=== Model C (Minor FE) — selected coefficients ===
                     coef   se     t    p  ci_low  ci_high
fragmentation_index -1.68 0.18 -9.56 0.00   -2.02    -1.33
human_E1_fraction    0.15 0.04  3.38 0.00    0.06     0.24
num_tasks           -0.00 0.00 -1.07 0.29   -0.00     0.00


### Create a fragmentation measure using AI Exposure instead of AI execution (treating just E1 tasks as exposed)

In [457]:
# Calculate fragmentation index
# For the sake of this exercise we treat all AI tasks as a single category

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_ai column
fi_df = merged_data.copy()
fi_df['is_ai_exposed'] = fi_df['human_labels'].isin(['E1']).astype(int)


# Create next_is_ai column within occupation groups
fi_df['next_is_ai_exposed'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_ai_exposed'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['exposure_fi_counter'] = 1
fi_df.loc[(fi_df['is_ai_exposed'] == 1) & (fi_df['next_is_ai_exposed'] == 1), 'exposure_fi_counter'] = 0

fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['exposure_fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'exposure_fi_counter': 'exposure_fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex_exposure.csv", index=False)
display(fi_df.head(10))
fi_df[fi_df['O*NET-SOC Code']=='15-1251.00']

Unnamed: 0,O*NET-SOC Code,Occupation Title,exposure_fragmentation_index
0,11-1011.00,Chief Executives,0.94
1,11-1011.03,Chief Sustainability Officers,1.0
2,11-1021.00,General and Operations Managers,1.0
3,11-2011.00,Advertising and Promotions Managers,0.95
4,11-2021.00,Marketing Managers,0.9
5,11-2022.00,Sales Managers,1.0
6,11-3012.00,Administrative Services Managers,1.0
7,11-3021.00,Computer and Information Systems Managers,0.94
8,11-3031.00,Financial Managers,1.0
9,11-3031.01,Treasurers and Controllers,1.0


Unnamed: 0,O*NET-SOC Code,Occupation Title,exposure_fragmentation_index
105,15-1251.00,Computer Programmers,0.82


In [458]:
# Merge fragmentation index with occupation analysis
occupation_analysis = pd.read_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex.csv")

occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [459]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read OG occupation analysis with SOC mappings
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant 
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [460]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    'exposure_fragmentation_index': 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

#Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['exposure_fragmentation_index', 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index + num_tasks + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index + num_tasks + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index  0.52 0.07  7.66 0.00    0.39     0.65
num_tasks                    -0.00 0.00 -1.52 0.13   -0.00     0.00


=== Model B (Major FE) — selected coefficients ===
                              coef   se    t    p  ci_low  ci_high
exposure_fragmentation_index  0.23 0.07 3.44 0.00    0.10     0.36
num_tasks                     0.00 0.00 3.02 0.00    0.00     0.00


=== Model C (Minor FE) — selected coefficients ===
                              coef   se    t    p  ci_low  ci_high
exposure_fragmentation_index  0.12 0.07 1.74 0.08   -0.02     0.26
num_tasks                     0.00 0.00 2.05 0.04    0.00     0.00


In [461]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    'exposure_fragmentation_index': 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

#Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['exposure_fragmentation_index', 'num_tasks']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index + num_tasks',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b_major = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index + num_tasks + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b_major.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b_major, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c_minor = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index + num_tasks + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type='HC3')
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c_minor.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c_minor, vars_of_interest))



=== Model A — selected coefficients ===
                              coef   se      t    p  ci_low  ci_high
exposure_fragmentation_index -1.14 0.11 -10.61 0.00   -1.35    -0.93
num_tasks                     0.00 0.00   2.57 0.01    0.00     0.00


=== Model B (Major FE) — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index -0.43 0.09 -5.05 0.00   -0.60    -0.26
num_tasks                    -0.00 0.00 -2.34 0.02   -0.00    -0.00


=== Model C (Minor FE) — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index -0.25 0.09 -2.67 0.01   -0.44    -0.07
num_tasks                    -0.00 0.00 -1.30 0.19   -0.00     0.00
