#### By: Peyman Shahidi
#### Created: Nov 7, 2025
#### Last Edit: Dec 13, 2025

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 
import statsmodels.api as sm
import statsmodels.formula.api as smf


# Display floats comma-separated with two digits after the decimal point, e.g. 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')
# Allow up to 200 rows to be shown when a DataFrame is displayed
pd.set_option('display.max_rows', 200)

In [2]:
# Analysis settings
# Name of the SOC level, and the code/title columns results are aggregated on
my_onet_level = "detailed"
onet_occupation_code_var = "Detailed_Occupation_Code"
onet_occupation_title_var = "Detailed_Occupation_Title"

# Occupation-level column that serves as the AI exposure regressor
ai_exposure_var = "human_E1_fraction"

# True: work with frequent tasks only; False: work with all tasks
FREQUENT_TASKS = False

In [3]:
# Project folders; outputs go to a separate "_frequent" folder when only frequent tasks are used
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
_fragmentation_folder = "fragmentationIndex_frequent" if FREQUENT_TASKS else "fragmentationIndex"
output_data_path = f"{input_data_path}/computed_objects/{_fragmentation_folder}"
output_plot_path = f"{main_folder_path}/writeup/plots/{_fragmentation_folder}"

In [4]:
# Create the output directories (and any missing parent folders)
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True keeps re-runs safe and avoids the race between checking
    # for the directory and creating it
    os.makedirs(path, exist_ok=True)

In [5]:
# Read the merged task-level data; the frequent-task variant is stored in its own folder
if FREQUENT_TASKS:
    input_file_path = f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT_frequent/ONET_Eloundou_Anthropic_GPT.csv"
else:
    input_file_path = f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv"
merged_data = pd.read_csv(input_file_path)

if not FREQUENT_TASKS:
    # Keep only occupations with at least `frequent_tasks_per_occupation_threshold`
    # distinct tasks (i.e. drop occupations with fewer than three).
    # NOTE(review): an earlier comment described this as removing occupations with
    # "three or less" tasks, which would need a strict '>' -- confirm the intended cutoff.
    frequent_tasks_per_occupation_threshold = 3
    occupation_task_counts = merged_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
    valid_occupations = occupation_task_counts.loc[
        occupation_task_counts >= frequent_tasks_per_occupation_threshold
    ].index
    merged_data = merged_data.loc[
        merged_data['O*NET-SOC Code'].isin(valid_occupations)
    ].reset_index(drop=True)

In [6]:
# Build the SOC lookup table that is merged onto the occupation-level results later
# Source: the cleaned O*NET task file, which carries the SOC codes and titles
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the occupation identifiers and the code/title pair of each SOC level
soc_mapping_columns = [
    'O*NET-SOC Code', 'Occupation Title',
    'Major_Group_Code', 'Major_Group_Title',
    'Minor_Group_Code', 'Minor_Group_Title',
    'Broad_Occupation_Code', 'Broad_Occupation_Title',
    'Detailed_Occupation_Code', 'Detailed_Occupation_Title',
]
SOC_mappings = ONET.loc[:, soc_mapping_columns].copy()

# One row per (O*NET-SOC code, code at the chosen aggregation level) pair
SOC_mappings = SOC_mappings.drop_duplicates(
    subset=['O*NET-SOC Code', onet_occupation_code_var]
)

In [7]:
def create_occupation_analysis(df, onet_occupation_code_var, onet_occupation_title_var):
    """Summarise task labels per occupation.

    Rows of ``df`` are grouped by the (code, title) column pair. For every
    group the share of rows carrying each ``label``, ``gpt4_exposure`` and
    ``human_labels`` category is computed. Shares are taken over the number
    of rows in the group, while ``num_tasks`` counts distinct ``Task ID``
    values.

    Args:
        df: Task-level DataFrame with the two grouping columns plus
            'Task ID', 'label', 'gpt4_exposure' and 'human_labels'.
        onet_occupation_code_var: Name of the occupation code column.
        onet_occupation_title_var: Name of the occupation title column.

    Returns:
        DataFrame with one row per (code, title) group holding the group
        keys, ``num_tasks`` and the fraction columns.
    """
    def category_shares(rows, column, categories):
        # Share of the group's rows equal to each category, in the order given
        n_rows = len(rows)
        return [(rows[column] == category).sum() / n_rows for category in categories]

    records = []
    grouped = df.groupby([onet_occupation_code_var, onet_occupation_title_var])
    for (code, title), rows in grouped:
        distinct_tasks = rows['Task ID'].nunique()
        manual, augmentation, automation = category_shares(
            rows, 'label', ['Manual', 'Augmentation', 'Automation'])
        gpt4_e0, gpt4_e1, gpt4_e2 = category_shares(
            rows, 'gpt4_exposure', ['E0', 'E1', 'E2'])
        human_e0, human_e1, human_e2 = category_shares(
            rows, 'human_labels', ['E0', 'E1', 'E2'])

        records.append({
            f'{onet_occupation_code_var}': code,
            f'{onet_occupation_title_var}': title,
            'num_tasks': distinct_tasks,
            'manual_fraction': manual,
            # "AI" pools augmentation and automation
            'ai_fraction': augmentation + automation,
            'augmentation_fraction': augmentation,
            'automation_fraction': automation,
            'gpt4_E0_fraction': gpt4_e0,
            'gpt4_E1_fraction': gpt4_e1,
            'gpt4_E2_fraction': gpt4_e2,
            # "Exposed" pools E1 and E2
            'gpt4_aiExposure_fraction': gpt4_e1 + gpt4_e2,
            'human_E0_fraction': human_e0,
            'human_E1_fraction': human_e1,
            'human_E2_fraction': human_e2,
            'human_aiExposure_fraction': human_e1 + human_e2,
        })

    return pd.DataFrame(records)


# Create fragmentation index dataframe for different definitions
def construct_fragmentation_index(df, desired_definition=1, save_filename=None):
    """Compute a per-occupation fragmentation index from task-level labels.

    Every task starts with ``num_switches = 1``. It is set to 0 when the task
    and the following task of the same occupation (in the row order of
    ``df``) belong to the same "AI chain". The fragmentation index of an
    occupation is the mean of ``num_switches`` over its rows. The last row of
    an occupation has no successor, so it always counts as 1.

    Definitions:
        1: Automation and Augmentation are treated separately. A task
           continues a chain only if it is 'Automation' and the next task is
           'Automation' or 'Augmentation'; an 'Augmentation' task therefore
           always terminates the chain.
        2: All AI tasks ('Augmentation' or 'Automation' in ``label``) are
           treated alike; a task continues a chain when it and the next task
           are both AI.
        3: Same construction as Definition 2, but a task is AI when
           ``human_labels`` is 'E1'.
        4: Same construction as Definition 2, but a task is AI when
           ``human_labels`` is 'E1' or 'E2'.

    Args:
        df: Task-level DataFrame with 'O*NET-SOC Code', 'Occupation Title'
            and 'label' (definitions 1-2) or 'human_labels' (definitions 3-4).
            Row order within an occupation is used as the task sequence --
            assumes the caller has already ordered the rows; TODO confirm.
        desired_definition: One of 1, 2, 3, 4.
        save_filename: If given, the result is also written as CSV to
            ``{output_data_path}/{save_filename}``.

    Returns:
        DataFrame with columns 'O*NET-SOC Code', 'Occupation Title' and
        'fragmentation_index', one row per occupation.

    Raises:
        ValueError: If ``desired_definition`` is not 1, 2, 3 or 4.
    """
    # Fail early with a clear message instead of a KeyError on a missing helper column
    if desired_definition not in (1, 2, 3, 4):
        raise ValueError(
            f"desired_definition must be one of 1, 2, 3, 4; got {desired_definition!r}"
        )

    occupation_keys = ['O*NET-SOC Code', 'Occupation Title']

    # Definitions 2-4 share one construction and differ only in which
    # column/values mark a task as AI
    ai_label_rules = {
        2: ('label', ['Augmentation', 'Automation']),
        3: ('human_labels', ['E1']),
        4: ('human_labels', ['E1', 'E2']),
    }

    fi_df = df.copy()

    def next_in_occupation(column):
        # Value of `column` on the following row of the same occupation;
        # 0 for the last row of each occupation
        return fi_df.groupby(occupation_keys)[column].shift(-1).fillna(0).astype(int)

    if desired_definition == 1:
        fi_df['is_automated'] = fi_df['label'].isin(['Automation']).astype(int)
        fi_df['is_augmented'] = fi_df['label'].isin(['Augmentation']).astype(int)
        fi_df['next_is_automated'] = next_in_occupation('is_automated')
        fi_df['next_is_augmented'] = next_in_occupation('is_augmented')

        # Only an Automation task followed by another AI task keeps the chain going
        continues_chain = (fi_df['is_automated'] == 1) & (
            (fi_df['next_is_automated'] == 1) | (fi_df['next_is_augmented'] == 1)
        )
    else:
        label_column, ai_values = ai_label_rules[desired_definition]
        fi_df['is_ai'] = fi_df[label_column].isin(ai_values).astype(int)
        fi_df['next_is_ai'] = next_in_occupation('is_ai')

        # An AI task followed by another AI task keeps the chain going
        continues_chain = (fi_df['is_ai'] == 1) & (fi_df['next_is_ai'] == 1)

    # Every task counts as a switch unless it continues an AI chain
    fi_df['num_switches'] = 1
    fi_df.loc[continues_chain, 'num_switches'] = 0

    # Fragmentation index = mean number of switches per task within each occupation
    fi_df = fi_df.groupby(occupation_keys)['num_switches'].mean()
    fi_df = fi_df.reset_index().rename(columns={'num_switches': 'fragmentation_index'})

    # Save fragmentation index data
    if save_filename:
        fi_df.to_csv(f"{output_data_path}/{save_filename}", index=False)

    return fi_df



# Merge fragmentation data with occupation analysis
def merge_fragmentation_with_occupation_analysis(fi_df, occupation_analysis, SOC_mappings, onet_occupation_code_var, save_filename=None):
    """Attach the fragmentation index and SOC columns to the occupation table.

    Both joins are left joins on ('O*NET-SOC Code', 'Occupation Title'), so
    every row of ``occupation_analysis`` is kept.

    Args:
        fi_df: Fragmentation index per occupation.
        occupation_analysis: Occupation-level table to enrich.
        SOC_mappings: Table with the SOC columns to add.
        onet_occupation_code_var: Accepted for interface compatibility; not
            used by this function.
        save_filename: If given, the table with the fragmentation index (but
            before the SOC columns are added) is written as CSV to
            ``{output_data_path}/{save_filename}``.

    Returns:
        The occupation table with the fragmentation index and SOC columns.
    """
    join_keys = ['O*NET-SOC Code', 'Occupation Title']

    # Step 1: add the fragmentation index
    with_fragmentation = occupation_analysis.merge(fi_df, how='left', on=join_keys)

    # The saved file deliberately excludes the SOC columns added in step 2
    if save_filename:
        csv_path = f"{output_data_path}/{save_filename}"
        with_fragmentation.to_csv(csv_path, index=False)

    # Step 2: add the SOC columns
    with_soc_levels = with_fragmentation.merge(SOC_mappings, how='left', on=join_keys)
    return with_soc_levels



# Aggregate occupation analysis at the level of onet_occupation_code_var
def aggregate_occupation_analysis(occupation_analysis, onet_occupation_code_var, onet_occupation_title_var, SOC_mappings, ai_exposure_var):
    """Average occupation-level measures up to a coarser occupation code.

    Rows are grouped by (``onet_occupation_code_var``,
    ``onet_occupation_title_var``) and 'fragmentation_index',
    ``ai_exposure_var``, 'ai_fraction' and 'num_tasks' are averaged. The SOC
    columns from ``SOC_mappings`` are then merged back on
    ``onet_occupation_code_var`` (e.g. for use as fixed effects).

    Args:
        occupation_analysis: Occupation-level table containing the grouping
            columns and the four measures listed above.
        onet_occupation_code_var: Name of the code column to aggregate on.
        onet_occupation_title_var: Name of the matching title column.
        SOC_mappings: Table with ``onet_occupation_code_var`` and the SOC
            columns to add.
        ai_exposure_var: Name of the AI exposure column to average.

    Returns:
        DataFrame with one row per aggregated group, the averaged measures
        and the columns of ``SOC_mappings``. Mapping columns that vary within
        a code (such as a finer-grained occupation code or title) take the
        value of the first mapping row for that code.
    """
    occupation_analysis_aggregated = occupation_analysis.groupby(
        [onet_occupation_code_var, onet_occupation_title_var]
    ).agg({
        'fragmentation_index': 'mean',
        ai_exposure_var: 'mean',
        'ai_fraction': 'mean',
        'num_tasks': 'mean'
    }).reset_index()

    # SOC_mappings may hold several rows per aggregation code (one per
    # finer-grained occupation). Keep a single row per code so the left merge
    # below cannot fan the aggregated rows back out into duplicates.
    soc_levels = SOC_mappings.drop_duplicates(subset=[onet_occupation_code_var])

    # Merge SOC levels for FE; columns present on both sides keep the
    # aggregated version and the mapping's copy is tagged for removal
    occupation_analysis_aggregated = occupation_analysis_aggregated.merge(
        soc_levels, on=onet_occupation_code_var, how='left', suffixes=('', '_drop')
    )
    keep_columns = ~occupation_analysis_aggregated.columns.str.endswith('_drop')
    occupation_analysis_aggregated = occupation_analysis_aggregated.loc[:, keep_columns]

    return occupation_analysis_aggregated

## Calculate Fragmentation Index for 4 Definitions and Run the Regression

In [8]:
regression_results = []

# Create dataset and run regressions for all 4 definitions
for definition in [1, 2, 3, 4]:
    # Get occupation data
    occupation_analysis = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

    # Get fragmentation index
    fi_df = construct_fragmentation_index(merged_data, desired_definition=definition, save_filename=f'fragmentationIndex_def{definition}.csv')

    # Merge fragmentation data with occupation analysis
    occupation_analysis = merge_fragmentation_with_occupation_analysis(fi_df, occupation_analysis, SOC_mappings, onet_occupation_code_var, save_filename=f'occupation_analysis_with_fragmentationIndex_def{definition}.csv')

    # Aggregate data at the onet_occupation_code_var level and add back SOC code information
    occupation_analysis_aggregated = aggregate_occupation_analysis(occupation_analysis, onet_occupation_code_var, onet_occupation_title_var, SOC_mappings, ai_exposure_var)

    # Rename regression columns for clarity
    occupation_analysis_aggregated = occupation_analysis_aggregated.rename(columns={
        ai_exposure_var: 'ai_exposure'
    })

    # Run regressions, clustering standard errors at the onet_occupation_code_var level
    # --- Model A: no FE ---
    mod_noFE = smf.ols(
        formula=f'ai_fraction ~ fragmentation_index + ai_exposure',
        data=occupation_analysis_aggregated
        ).fit(cov_type="cluster",
            cov_kwds={"groups": occupation_analysis_aggregated[onet_occupation_code_var],
                        "use_correction": True,
                        "df_correction": True}
                        )
    regression_results.append(mod_noFE)

    # --- Model B (Major group FE) ---
    mod_majorFE = smf.ols(
        formula=f'ai_fraction ~ fragmentation_index + ai_exposure + C(Major_Group_Code)',
        data=occupation_analysis_aggregated
        ).fit(cov_type="cluster",
            cov_kwds={"groups": occupation_analysis_aggregated[onet_occupation_code_var],
                        "use_correction": True,
                        "df_correction": True}
                        )
    regression_results.append(mod_majorFE)

    # --- Model C (Minor group FE) ---
    mod_minorFE = smf.ols(
        formula=f'ai_fraction ~ fragmentation_index + ai_exposure + C(Minor_Group_Code)',
        data=occupation_analysis_aggregated
        ).fit(cov_type="cluster",
            cov_kwds={"groups": occupation_analysis_aggregated[onet_occupation_code_var],
                        "use_correction": True,
                        "df_correction": True}
                        )
    regression_results.append(mod_minorFE)