#### By: Peyman Shahidi
#### Created: Nov 7, 2025
#### Last Edit: Nov 10, 2025

<br>

In [469]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [470]:
FREQUENT_TASKS = False  # Whether to use only frequent tasks or all tasks

In [471]:
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
output_data_path = f'{input_data_path}/computed_objects/fragmentationIndex_frequent' if FREQUENT_TASKS else f'{input_data_path}/computed_objects/fragmentationIndex'
output_plot_path = f"{main_folder_path}/writeup/plots"

In [472]:
# Create directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

In [473]:
def create_occupation_analysis(df, onet_occupation_code_var, onet_occupation_title_var):
    # Create occupation-level analysis for scatter plots
    # Group by occupation and calculate label fractions and task counts
    occupation_stats = []

    for (soc_code, occ_title), group in df.groupby([onet_occupation_code_var, onet_occupation_title_var]):
        num_tasks = group['Task ID'].nunique()
        # num_occupations = group[onet_occupation_code_var].nunique()
        total_tasks = len(group)
        
        manual_fraction = (group['label'] == 'Manual').sum() / total_tasks
        augmentation_fraction = (group['label'] == 'Augmentation').sum() / total_tasks  
        automation_fraction = (group['label'] == 'Automation').sum() / total_tasks
        ai_fraction = augmentation_fraction + automation_fraction
        gpt4_E0_fraction = (group['gpt4_exposure'] == 'E0').sum() / total_tasks
        gpt4_E1_fraction = (group['gpt4_exposure'] == 'E1').sum() / total_tasks
        gpt4_E2_fraction = (group['gpt4_exposure'] == 'E2').sum() / total_tasks
        gpt4_aiExposure_fraction = gpt4_E1_fraction + gpt4_E2_fraction
        human_E0_fraction = (group['human_labels'] == 'E0').sum() / total_tasks
        human_E1_fraction = (group['human_labels'] == 'E1').sum() / total_tasks
        human_E2_fraction = (group['human_labels'] == 'E2').sum() / total_tasks
        human_aiExposure_fraction = human_E1_fraction + human_E2_fraction

        
        occupation_stats.append({
            f'{onet_occupation_code_var}': soc_code,
            f'{onet_occupation_title_var}': occ_title,
            'num_tasks': num_tasks,
            # 'num_occupations': num_occupations,
            'manual_fraction': manual_fraction,
            'ai_fraction': ai_fraction,
            'augmentation_fraction': augmentation_fraction,
            'automation_fraction': automation_fraction,
            'gpt4_E0_fraction': gpt4_E0_fraction,
            'gpt4_E1_fraction': gpt4_E1_fraction,
            'gpt4_E2_fraction': gpt4_E2_fraction,
            'gpt4_aiExposure_fraction': gpt4_aiExposure_fraction,
            'human_E0_fraction': human_E0_fraction,
            'human_E1_fraction': human_E1_fraction,
            'human_E2_fraction': human_E2_fraction,
            'human_aiExposure_fraction': human_aiExposure_fraction
        })

    occupation_analysis = pd.DataFrame(occupation_stats)

    return occupation_analysis

In [474]:
# Read the merged data
if not FREQUENT_TASKS:
    input_file_path = f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv" 
    merged_data = pd.read_csv(input_file_path)

        # Remove occupations with three or less frequent tasks
    frequent_tasks_per_occupation_threshold = 3
    occupation_task_counts = merged_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
    valid_occupations = occupation_task_counts[occupation_task_counts >= frequent_tasks_per_occupation_threshold].index
    merged_data = merged_data[merged_data['O*NET-SOC Code'].isin(valid_occupations)].reset_index(drop=True)
else:
    input_file_path = f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT_frequent/ONET_Eloundou_Anthropic_GPT.csv"
    merged_data = pd.read_csv(input_file_path)

In [475]:
# Define levels and variables
onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']
dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']

## Calculate Fragmentation Index treating all AI tasks similarly and focusing on consecutive placements of AI tasks

In [476]:
# Calculate fragmentation index
# For the sake of this exercise we treat all AI tasks as a single category

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_ai column
fi_df = merged_data.copy()
fi_df['is_ai'] = fi_df['label'].isin(['Augmentation', 'Automation']).astype(int)
fi_df['is_ai'] = fi_df['human_labels'].isin(['E1', 'E2']).astype(int)


# Create next_is_ai column within occupation groups
fi_df['next_is_ai'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_ai'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['fi_counter'] = 1
fi_df.loc[(fi_df['is_ai'] == 1) & (fi_df['next_is_ai'] == 1), 'fi_counter'] = 0

# Display fragmentation index for computer programmers
display(fi_df[['O*NET-SOC Code', 'Occupation Title', 'Task Position', 'label', 
               'is_ai', 'fi_counter']][fi_df['O*NET-SOC Code']=='15-1251.00'])


fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'fi_counter': 'fragmentation_index'})


# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex.csv", index=False)
display(fi_df.head(10))

# Display FI for certain occupations
display_list = ['15-1251.00', '27-3042.00', '27-3043.00']  # Computer Programmers, Technical Writers, Writers and Authors
display(fi_df[fi_df['O*NET-SOC Code'].isin(display_list)])
print(f'Average Fragmentation Index: {fi_df["fragmentation_index"].mean():.4f}')

Unnamed: 0,O*NET-SOC Code,Occupation Title,Task Position,label,is_ai,fi_counter
2168,15-1251.00,Computer Programmers,1,Augmentation,0,1
2169,15-1251.00,Computer Programmers,2,Manual,1,0
2170,15-1251.00,Computer Programmers,3,Automation,1,0
2171,15-1251.00,Computer Programmers,4,Augmentation,1,0
2172,15-1251.00,Computer Programmers,5,Automation,1,0
2173,15-1251.00,Computer Programmers,6,Manual,1,0
2174,15-1251.00,Computer Programmers,7,Augmentation,1,0
2175,15-1251.00,Computer Programmers,8,Manual,1,1
2176,15-1251.00,Computer Programmers,9,Augmentation,0,1
2177,15-1251.00,Computer Programmers,10,Automation,1,0


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
0,11-1011.00,Chief Executives,0.77
1,11-1011.03,Chief Sustainability Officers,0.5
2,11-1021.00,General and Operations Managers,0.71
3,11-2011.00,Advertising and Promotions Managers,0.38
4,11-2021.00,Marketing Managers,0.1
5,11-2022.00,Sales Managers,0.29
6,11-3012.00,Administrative Services Managers,0.5
7,11-3021.00,Computer and Information Systems Managers,0.41
8,11-3031.00,Financial Managers,0.29
9,11-3031.01,Treasurers and Controllers,0.5


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
105,15-1251.00,Computer Programmers,0.24
339,27-3042.00,Technical Writers,0.33
340,27-3043.00,Writers and Authors,0.62


Average Fragmentation Index: 0.7356


In [477]:
# Merge fragmentation index with occupation analysis
occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Save occupation analysis with fragmentation index
occupation_analysis.to_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex.csv", index=False)

In [478]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read OG occupation analysis with SOC mappings
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant 
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [479]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

ai_exposure_var = 'human_E1_fraction'

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    ai_exposure_var: 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

# Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['fragmentation_index', ai_exposure_var]

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var}',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",
    cov_kwds={
        "groups": occupation_analysis_aggregated[onet_occupation_code_var],
        "use_correction": True,
        "df_correction": True
    })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c, vars_of_interest))



=== Model A — selected coefficients ===
                     coef   se     t    p  ci_low  ci_high
fragmentation_index -0.23 0.03 -7.25 0.00   -0.29    -0.16
human_E1_fraction    0.33 0.06  5.32 0.00    0.21     0.46


=== Model B (Major FE) — selected coefficients ===
                     coef   se     t    p  ci_low  ci_high
fragmentation_index -0.21 0.04 -5.77 0.00   -0.27    -0.14
human_E1_fraction    0.14 0.06  2.42 0.02    0.03     0.25


=== Model C (Minor FE) — selected coefficients ===
                     coef   se     t    p  ci_low  ci_high
fragmentation_index -0.14 0.04 -3.63 0.00   -0.21    -0.06
human_E1_fraction    0.12 0.06  1.99 0.05    0.00     0.24


#### Create Latex Table Output in Stata Format

In [480]:
from statsmodels.iolib.summary2 import summary_col

# ---- Combine models into a table ----
info_dict = {
    'N': lambda x: f"{int(x.nobs):,}",
    'Fixed Effects': lambda x: (
        'None' if 'C(Major_Group_Code)' not in x.model.formula and 'C(Minor_Group_Code)' not in x.model.formula
        else 'Major Group' if 'C(Major_Group_Code)' in x.model.formula
        else 'Minor Group'
    )
}

regressor_order = ['fragmentation_index', ai_exposure_var]

results_table = summary_col(
    results=[mod_a, mod_b, mod_c],
    float_format='%0.3f',
    stars=True,
    model_names=['(A)', '(B)', '(C)'],
    info_dict=info_dict,
    regressor_order=regressor_order,
    drop_omitted=True
)

results_table.tables[0].index = results_table.tables[0].index.str.replace('_', ' ')

latex_table = results_table.as_latex()

# --- Ensure a single \hline before R-squared ---
latex_table = latex_table.replace('\nR-squared', '\n\\hline\nR-squared')
latex_table = latex_table.replace('\n\\hline\nR-squared Adj.', '\nR-squared Adj.')

# --- Add centered note below final \hline (inside table) ---
note_text = (
    '\\\\[-1.25em]\n'
    '\\multicolumn{4}{l}{Standard errors in parentheses.} \\\\\n'
    '\\multicolumn{4}{l}{$^{*}:p<0.1$, $^{**}:p<0.05$, $^{***}:p<0.01$} \\\\\n'
)

latex_table = latex_table.replace(
    r'\hline' + '\n\\end{tabular}',
    r'\hline' + note_text + r'\end{tabular}'
)

print(latex_table)


\begin{table}
\caption{}
\label{}
\begin{center}
\begin{tabular}{llll}
\hline
                    & (A)       & (B)         & (C)          \\
\hline
fragmentation index & -0.225*** & -0.205***   & -0.138***    \\
                    & (0.031)   & (0.036)     & (0.038)      \\
human E1 fraction   & 0.334***  & 0.138**     & 0.121**      \\
                    & (0.063)   & (0.057)     & (0.060)      \\
\hline
R-squared           & 0.374     & 0.658       & 0.745        \\
R-squared Adj.      & 0.373     & 0.648       & 0.707        \\
Fixed Effects       & None      & Major Group & Minor Group  \\
N                   & 872       & 872         & 872          \\
\hline\\[-1.25em]
\multicolumn{4}{l}{Standard errors in parentheses.} \\
\multicolumn{4}{l}{$^{*}:p<0.1$, $^{**}:p<0.05$, $^{***}:p<0.01$} \\
\end{tabular}
\end{center}
\end{table}
\bigskip
Standard errors in parentheses. \newline 
* p<.1, ** p<.05, ***p<.01


### Redefine Fragmentation Index treating Chains as Run of Consecutive Automated Tasks terminated by an Augmented Task (Model Definition)

In [481]:
# Calculate fragmentation index
# For the sake of this exercise we define AI chains as a number of Automated tasks terminated by an Augmented Task

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_automated column
fi_df = merged_data.copy()
fi_df['is_automated'] = fi_df['label'].isin(['Automation']).astype(int)
fi_df['is_augmented'] = fi_df['label'].isin(['Augmentation']).astype(int)


# Create next_is_automated column within occupation groups
fi_df['next_is_automated'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_automated'].shift(-1).fillna(0).astype(int)
fi_df['next_is_augmented'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_augmented'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
# Determine if task is part of an AI chain
fi_df['fi_counter'] = 1
ai_chain_indicator = (fi_df['is_automated'] == 1) & ((fi_df['next_is_automated'] == 1) | (fi_df['next_is_augmented'] == 1))
fi_df.loc[ai_chain_indicator, 'fi_counter'] = 0

# Display fragmentation index for computer programmers
display(fi_df[['O*NET-SOC Code', 'Occupation Title', 'Task Position', 'label', 
               'is_automated', 'next_is_automated', 'next_is_augmented', 'fi_counter']][fi_df['O*NET-SOC Code']=='15-1251.00'])


fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'fi_counter': 'fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex_modelDefinition.csv", index=False)

# Display FI for certain occupations
display_list = ['15-1251.00', '27-3042.00', '27-3043.00']  # Computer Programmers, Technical Writers, Writers and Authors
display(fi_df[fi_df['O*NET-SOC Code'].isin(display_list)])
print(f'Average Fragmentation Index: {fi_df["fragmentation_index"].mean():.4f}')

Unnamed: 0,O*NET-SOC Code,Occupation Title,Task Position,label,is_automated,next_is_automated,next_is_augmented,fi_counter
2168,15-1251.00,Computer Programmers,1,Augmentation,0,0,0,1
2169,15-1251.00,Computer Programmers,2,Manual,0,1,0,1
2170,15-1251.00,Computer Programmers,3,Automation,1,0,1,0
2171,15-1251.00,Computer Programmers,4,Augmentation,0,1,0,1
2172,15-1251.00,Computer Programmers,5,Automation,1,0,0,1
2173,15-1251.00,Computer Programmers,6,Manual,0,0,1,1
2174,15-1251.00,Computer Programmers,7,Augmentation,0,0,0,1
2175,15-1251.00,Computer Programmers,8,Manual,0,0,1,1
2176,15-1251.00,Computer Programmers,9,Augmentation,0,1,0,1
2177,15-1251.00,Computer Programmers,10,Automation,1,1,0,0


Unnamed: 0,O*NET-SOC Code,Occupation Title,fragmentation_index
105,15-1251.00,Computer Programmers,0.82
339,27-3042.00,Technical Writers,0.93
340,27-3043.00,Writers and Authors,1.0


Average Fragmentation Index: 0.9874


In [482]:
# Merge fragmentation index with occupation analysis
occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

# Save occupation analysis with fragmentation index
occupation_analysis.to_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex_modelDefinition.csv", index=False)

In [483]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [484]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

ai_exposure_var = 'human_E1_fraction'

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    ai_exposure_var: 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

# Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['fragmentation_index', ai_exposure_var]

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var}',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c = smf.ols(
    formula=f'ai_fraction ~ fragmentation_index + {ai_exposure_var} + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c, vars_of_interest))



=== Model A — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -2.58 0.12 -22.28 0.00   -2.81    -2.36
human_E1_fraction    0.43 0.03  13.00 0.00    0.36     0.49


=== Model B (Major FE) — selected coefficients ===
                     coef   se      t    p  ci_low  ci_high
fragmentation_index -1.97 0.15 -13.06 0.00   -2.27    -1.68
human_E1_fraction    0.20 0.04   4.86 0.00    0.12     0.28


=== Model C (Minor FE) — selected coefficients ===
                     coef   se     t    p  ci_low  ci_high
fragmentation_index -1.68 0.17 -9.83 0.00   -2.02    -1.35
human_E1_fraction    0.15 0.05  3.24 0.00    0.06     0.24


In [485]:
from statsmodels.iolib.summary2 import summary_col

# ---- Combine models into a table ----
info_dict = {
    'N': lambda x: f"{int(x.nobs):,}",
    'Fixed Effects': lambda x: (
        'None' if 'C(Major_Group_Code)' not in x.model.formula and 'C(Minor_Group_Code)' not in x.model.formula
        else 'Major Group' if 'C(Major_Group_Code)' in x.model.formula
        else 'Minor Group'
    )
}

regressor_order = ['fragmentation_index', ai_exposure_var]

results_table = summary_col(
    results=[mod_a, mod_b, mod_c],
    float_format='%0.3f',
    stars=True,
    model_names=['(A)', '(B)', '(C)'],
    info_dict=info_dict,
    regressor_order=regressor_order,
    drop_omitted=True
)

results_table.tables[0].index = results_table.tables[0].index.str.replace('_', ' ')

latex_table = results_table.as_latex()

# --- Ensure a single \hline before R-squared ---
latex_table = latex_table.replace('\nR-squared', '\n\\hline\nR-squared')
latex_table = latex_table.replace('\n\\hline\nR-squared Adj.', '\nR-squared Adj.')

# --- Add centered note below final \hline (inside table) ---
note_text = (
    '\\\\[-1.25em]\n'
    '\\multicolumn{4}{l}{Clustered standard errors in parentheses.} \\\\\n'
    '\\multicolumn{4}{l}{$^{*}:p<0.1$, $^{**}:p<0.05$, $^{***}:p<0.01$} \\\\\n'
)

latex_table = latex_table.replace(
    r'\hline' + '\n\\end{tabular}',
    r'\hline' + note_text + r'\end{tabular}'
)

print(latex_table)


\begin{table}
\caption{}
\label{}
\begin{center}
\begin{tabular}{llll}
\hline
                    & (A)       & (B)         & (C)          \\
\hline
fragmentation index & -2.585*** & -1.971***   & -1.683***    \\
                    & (0.116)   & (0.151)     & (0.171)      \\
human E1 fraction   & 0.429***  & 0.201***    & 0.147***     \\
                    & (0.033)   & (0.041)     & (0.045)      \\
\hline
R-squared           & 0.651     & 0.743       & 0.798        \\
R-squared Adj.      & 0.651     & 0.736       & 0.769        \\
Fixed Effects       & None      & Major Group & Minor Group  \\
N                   & 872       & 872         & 872          \\
\hline\\[-1.25em]
\multicolumn{4}{l}{Clustered standard errors in parentheses.} \\
\multicolumn{4}{l}{$^{*}:p<0.1$, $^{**}:p<0.05$, $^{***}:p<0.01$} \\
\end{tabular}
\end{center}
\end{table}
\bigskip
Standard errors in parentheses. \newline 
* p<.1, ** p<.05, ***p<.01


### Create a fragmentation measure using AI Exposure instead of AI execution (treating just E1 tasks as exposed)

In [486]:
# Calculate fragmentation index
# For the sake of this exercise we treat all AI tasks as a single category

# Get occupation data
occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

# Create is_ai column
fi_df = merged_data.copy()
fi_df['is_ai_exposed'] = fi_df['human_labels'].isin(['E1']).astype(int)


# Create next_is_ai column within occupation groups
fi_df['next_is_ai_exposed'] = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['is_ai_exposed'].shift(-1).fillna(0).astype(int)

# Calculate FI using incremental counter: only if current task and next task is AI do not increment FI
fi_df['exposure_fi_counter'] = 1
fi_df.loc[(fi_df['is_ai_exposed'] == 1) & (fi_df['next_is_ai_exposed'] == 1), 'exposure_fi_counter'] = 0

fi_df = fi_df.groupby(['O*NET-SOC Code', 'Occupation Title'])['exposure_fi_counter'].mean()
fi_df = fi_df.reset_index().rename(columns={'exposure_fi_counter': 'exposure_fragmentation_index'})

# Save fragmentation index data
fi_df.to_csv(f"{output_data_path}/fragmentationIndex_exposure.csv", index=False)
display(fi_df.head(10))

# Display FI for certain occupations
display_list = ['15-1251.00', '27-3042.00', '27-3043.00']  # Computer Programmers, Technical Writers, Writers and Authors
display(fi_df[fi_df['O*NET-SOC Code'].isin(display_list)])
print(f'Average Fragmentation Index: {fi_df["exposure_fragmentation_index"].mean():.4f}')

Unnamed: 0,O*NET-SOC Code,Occupation Title,exposure_fragmentation_index
0,11-1011.00,Chief Executives,0.94
1,11-1011.03,Chief Sustainability Officers,1.0
2,11-1021.00,General and Operations Managers,1.0
3,11-2011.00,Advertising and Promotions Managers,0.95
4,11-2021.00,Marketing Managers,0.9
5,11-2022.00,Sales Managers,1.0
6,11-3012.00,Administrative Services Managers,1.0
7,11-3021.00,Computer and Information Systems Managers,0.94
8,11-3031.00,Financial Managers,1.0
9,11-3031.01,Treasurers and Controllers,1.0


Unnamed: 0,O*NET-SOC Code,Occupation Title,exposure_fragmentation_index
105,15-1251.00,Computer Programmers,0.82
339,27-3042.00,Technical Writers,0.93
340,27-3043.00,Writers and Authors,0.75


Average Fragmentation Index: 0.9617


In [487]:
# Merge fragmentation index with occupation analysis
occupation_analysis = pd.read_csv(f"{output_data_path}/occupation_analysis_with_fragmentationIndex.csv")

occupation_analysis = occupation_analysis.merge(fi_df, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [488]:
# First aggregate data at detailed_occupation level
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read OG occupation analysis with SOC mappings
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant 
SOC_mappings = ONET[['O*NET-SOC Code', 'Occupation Title',
                     'Major_Group_Code', 'Major_Group_Title',
                     'Minor_Group_Code', 'Minor_Group_Title',
                     'Broad_Occupation_Code', 'Broad_Occupation_Title',
                     'Detailed_Occupation_Code', 'Detailed_Occupation_Title']].copy()
SOC_mappings = SOC_mappings.drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])

# Merge SOC levels with the occupation analysis
occupation_analysis = occupation_analysis.merge(SOC_mappings, on=['O*NET-SOC Code', 'Occupation Title'], how='left')

In [489]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    'exposure_fragmentation_index': 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

#Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['exposure_fragmentation_index']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c = smf.ols(
    formula=f'fragmentation_index ~ exposure_fragmentation_index + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c, vars_of_interest))



=== Model A — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index  2.25 0.17 12.98 0.00    1.91     2.59


=== Model B (Major FE) — selected coefficients ===
                              coef   se    t    p  ci_low  ci_high
exposure_fragmentation_index  0.95 0.12 8.19 0.00    0.72     1.18


=== Model C (Minor FE) — selected coefficients ===
                              coef   se    t    p  ci_low  ci_high
exposure_fragmentation_index  0.72 0.13 5.37 0.00    0.46     0.98


## Ad hoc

In [490]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Aggregate
occupation_analysis_aggregated = occupation_analysis.groupby(
    [onet_occupation_code_var, onet_occupation_title_var]
).agg({
    'fragmentation_index': 'mean',
    'exposure_fragmentation_index': 'mean',
    'ai_fraction': 'mean',
    'num_tasks': 'mean'
}).reset_index()

#Merge SOC levels for FE
occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
    SOC_mappings, on=onet_occupation_code_var, how='left'
)

# Helper: compact table for selected vars
def coef_table(res, vars_):
    ci = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    out = (pd.DataFrame({'coef': res.params, 'se': res.bse,
                         't': res.tvalues, 'p': res.pvalues})
           .join(ci)
           .loc[vars_])
    return out

vars_of_interest = ['exposure_fragmentation_index']

# --- Model A: no FE ---
mod_a = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model A (no FE) — FULL SUMMARY ===")
# print(mod_a.summary2())   # full details
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))

# --- Model B (Major group FE) ---
mod_b = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index + C(Major_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model B (Major FE) — FULL SUMMARY ===")
# print(mod_b.summary2())
print("\n\n=== Model B (Major FE) — selected coefficients ===")
print(coef_table(mod_b, vars_of_interest))

# --- Model C (Minor group FE) ---
mod_c = smf.ols(
    formula=f'ai_fraction ~ exposure_fragmentation_index + C(Minor_Group_Code)',
    data=occupation_analysis_aggregated
).fit(cov_type="cluster",     cov_kwds={         "groups": occupation_analysis_aggregated[onet_occupation_code_var],         "use_correction": True,         "df_correction": True     })
# print("\n=== Model C (Minor FE) — FULL SUMMARY ===")
# print(mod_c.summary2())
print("\n\n=== Model C (Minor FE) — selected coefficients ===")
print(coef_table(mod_c, vars_of_interest))



=== Model A — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index -1.15 0.12 -9.77 0.00   -1.38    -0.92


=== Model B (Major FE) — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index -0.43 0.10 -4.36 0.00   -0.63    -0.24


=== Model C (Minor FE) — selected coefficients ===
                              coef   se     t    p  ci_low  ci_high
exposure_fragmentation_index -0.25 0.10 -2.44 0.01   -0.45    -0.05
