#### By: Peyman Shahidi
#### Created: Jan 30, 2026
#### Last Edit: Feb 1, 2026

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 
import statsmodels.api as sm
import statsmodels.formula.api as smf


## Format floats with thousands separators and two decimal places, e.g., 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [2]:
# Set variables
# Label for the O*NET/SOC aggregation level (not referenced elsewhere in the code shown)
my_onet_level = 'detailed'
# Column names of the occupation code/title at the chosen aggregation level;
# used as groupby keys when aggregating and as the cluster variable in the regressions
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Occupation-level column used as the AI-exposure regressor (renamed to 'ai_exposure' before fitting)
ai_exposure_var = 'human_E1_fraction'

# NOTE(review): this flag is not read anywhere in the code shown — confirm whether it is still needed
FREQUENT_TASKS = False  # Whether to use only frequent tasks or all tasks

In [3]:
# All paths are relative to the parent of the current working directory
main_folder_path = ".."
# Root of the input data
input_data_path = f"{main_folder_path}/data"
# Destination for CSV outputs written by this notebook
output_data_path = f'{input_data_path}/computed_objects/fragmentationIndex_robustness'
# Destination for the saved figures
output_plot_path = f"{main_folder_path}/writeup/plots/fragmentationIndex_robustness"

In [4]:
# Make sure both output folders exist before anything is written to them
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True is a no-op for directories that already exist and avoids the
    # check-then-create race of os.path.exists() followed by os.makedirs()
    os.makedirs(path, exist_ok=True)

In [5]:
def create_occupation_analysis(df, onet_occupation_code_var, onet_occupation_title_var):
    """Summarise task-level labels into one row per occupation.

    Rows of `df` are grouped by the two given columns. For every group the
    result holds the number of distinct 'Task ID' values and, for each value of
    'label', 'gpt4_exposure' and 'human_labels' of interest, the fraction of the
    group's rows carrying that value (the denominator is the row count of the
    group, not the distinct-task count).

    Args:
        df: task-level DataFrame with columns 'Task ID', 'label',
            'gpt4_exposure', 'human_labels' and the two grouping columns.
        onet_occupation_code_var: name of the occupation code column.
        onet_occupation_title_var: name of the occupation title column.

    Returns:
        DataFrame with the two grouping columns, 'num_tasks' and the fraction
        columns, one row per (code, title) group.
    """
    def _share(rows, column, value):
        # Fraction of the group's rows whose `column` equals `value`
        return (rows[column] == value).sum() / len(rows)

    records = []
    by_occupation = df.groupby([onet_occupation_code_var, onet_occupation_title_var])

    for (code, title), rows in by_occupation:
        # Execution-based labels
        manual = _share(rows, 'label', 'Manual')
        augmentation = _share(rows, 'label', 'Augmentation')
        automation = _share(rows, 'label', 'Automation')

        # Exposure labels assigned by GPT-4 and by humans
        gpt4_e0, gpt4_e1, gpt4_e2 = (
            _share(rows, 'gpt4_exposure', level) for level in ('E0', 'E1', 'E2')
        )
        human_e0, human_e1, human_e2 = (
            _share(rows, 'human_labels', level) for level in ('E0', 'E1', 'E2')
        )

        records.append({
            f'{onet_occupation_code_var}': code,
            f'{onet_occupation_title_var}': title,
            'num_tasks': rows['Task ID'].nunique(),
            'manual_fraction': manual,
            'ai_fraction': augmentation + automation,
            'augmentation_fraction': augmentation,
            'automation_fraction': automation,
            'gpt4_E0_fraction': gpt4_e0,
            'gpt4_E1_fraction': gpt4_e1,
            'gpt4_E2_fraction': gpt4_e2,
            'gpt4_aiExposure_fraction': gpt4_e1 + gpt4_e2,
            'human_E0_fraction': human_e0,
            'human_E1_fraction': human_e1,
            'human_E2_fraction': human_e2,
            'human_aiExposure_fraction': human_e1 + human_e2,
        })

    return pd.DataFrame(records)


# Create fragmentation index dataframe for different definitions
def construct_fragmentation_index(df, desired_definition=1, save_filename=None):
    """Compute a per-occupation fragmentation index from task-level labels.

    Every task row starts with a switch counter of 1. The counter is set to 0
    when the row continues an "AI chain" into the next row of the same
    ('O*NET-SOC Code', 'Occupation Title') group, rows being compared in the
    order in which they appear in `df`. The last row of each group has no
    successor and therefore always counts as a switch. The index of an
    occupation is the mean of the counter over its rows.

    Definitions:
        1: Based on 'label', keeping Automation and Augmentation separate. A
           row continues a chain only if it is an Automation task followed by
           an Automation or Augmentation task, so a chain terminates at its
           first Augmentation task.
        2: Based on 'label', pooling Automation and Augmentation as "AI". A row
           continues a chain when both it and the next row are AI.
        3: Same construction as 2, with AI meaning 'human_labels' == 'E1'.
        4: Same construction as 2, with AI meaning 'human_labels' in {E1, E2}.

    Args:
        df: task-level DataFrame; it is copied, not modified.
        desired_definition: 1, 2, 3 or 4 (see above).
        save_filename: if truthy, the result is also written as CSV to
            f"{output_data_path}/{save_filename}" (module-level path).

    Returns:
        DataFrame with columns 'O*NET-SOC Code', 'Occupation Title' and
        'fragmentation_index'.

    Raises:
        ValueError: if `desired_definition` is not one of 1, 2, 3, 4.
    """
    group_keys = ['O*NET-SOC Code', 'Occupation Title']

    # Definitions 2-4 differ only in how a task is flagged as AI:
    # (source column, values that count as AI)
    ai_label_rules = {
        2: ('label', ['Augmentation', 'Automation']),
        3: ('human_labels', ['E1']),
        4: ('human_labels', ['E1', 'E2']),
    }
    if desired_definition != 1 and desired_definition not in ai_label_rules:
        # Fail early with a clear message instead of a KeyError on a missing 'is_ai' column
        raise ValueError(
            f"desired_definition must be one of 1, 2, 3, 4; got {desired_definition!r}"
        )

    fi_df = df.copy()

    def _next_in_group(column):
        # Value of `column` on the following row of the same occupation (0 for the last row)
        return fi_df.groupby(group_keys)[column].shift(-1).fillna(0).astype(int)

    # Every row counts as a switch unless it continues an AI chain
    fi_df['num_switches'] = 1

    if desired_definition == 1:
        fi_df['is_automated'] = fi_df['label'].isin(['Automation']).astype(int)
        fi_df['is_augmented'] = fi_df['label'].isin(['Augmentation']).astype(int)
        fi_df['next_is_automated'] = _next_in_group('is_automated')
        fi_df['next_is_augmented'] = _next_in_group('is_augmented')

        # Only an Automation task can hand over to the next AI task; an
        # Augmentation task always closes the chain
        continues_chain = (fi_df['is_automated'] == 1) & (
            (fi_df['next_is_automated'] == 1) | (fi_df['next_is_augmented'] == 1)
        )
    else:
        source_column, ai_values = ai_label_rules[desired_definition]
        fi_df['is_ai'] = fi_df[source_column].isin(ai_values).astype(int)
        fi_df['next_is_ai'] = _next_in_group('is_ai')

        continues_chain = (fi_df['is_ai'] == 1) & (fi_df['next_is_ai'] == 1)

    fi_df.loc[continues_chain, 'num_switches'] = 0

    # Fragmentation index = share of rows that count as a switch, per occupation
    fi_df = fi_df.groupby(group_keys)['num_switches'].mean()
    fi_df = fi_df.reset_index().rename(columns={'num_switches': 'fragmentation_index'})

    # Save fragmentation index data
    if save_filename:
        fi_df.to_csv(f"{output_data_path}/{save_filename}", index=False)

    return fi_df



# Merge fragmentation data with occupation analysis
def merge_fragmentation_with_occupation_analysis(fi_df, occupation_analysis, SOC_mappings, onet_occupation_code_var, save_filename=None):
    """Attach the fragmentation index and the SOC hierarchy to the occupation table.

    Both joins are left joins on ('O*NET-SOC Code', 'Occupation Title'), so
    every row of `occupation_analysis` is kept.

    Args:
        fi_df: per-occupation fragmentation index.
        occupation_analysis: per-occupation label statistics.
        SOC_mappings: lookup carrying the SOC group columns.
        onet_occupation_code_var: accepted for signature compatibility; not
            used by this function.
        save_filename: if truthy, the table *before* the SOC merge is written
            as CSV to f"{output_data_path}/{save_filename}".

    Returns:
        The occupation table with the fragmentation index and SOC columns added.
    """
    join_keys = ['O*NET-SOC Code', 'Occupation Title']

    with_index = occupation_analysis.merge(fi_df, how='left', on=join_keys)

    # The optional CSV snapshot deliberately excludes the SOC hierarchy columns
    if save_filename:
        with_index.to_csv(f"{output_data_path}/{save_filename}", index=False)

    return with_index.merge(SOC_mappings, how='left', on=join_keys)



# Aggregate occupation analysis at the level of onet_occupation_code_var
def aggregate_occupation_analysis(occupation_analysis, onet_occupation_code_var, onet_occupation_title_var, SOC_mappings, ai_exposure_var):
    """Average occupation-level measures up to the chosen SOC level.

    Args:
        occupation_analysis: occupation-level table containing
            'fragmentation_index', 'ai_fraction', 'num_tasks', the column named
            by `ai_exposure_var` and the two grouping columns.
        onet_occupation_code_var: code column of the aggregation level.
        onet_occupation_title_var: title column of the aggregation level.
        SOC_mappings: lookup with the SOC group columns, joined on
            `onet_occupation_code_var` only.
        ai_exposure_var: name of the exposure column to average.

    Returns:
        DataFrame of group means with the SOC columns from `SOC_mappings`
        appended; columns that already exist on the aggregated side are not
        duplicated.
    """
    # Unweighted mean of every measure within an aggregated occupation
    mean_spec = {
        'fragmentation_index': 'mean',
        ai_exposure_var: 'mean',
        'ai_fraction': 'mean',
        'num_tasks': 'mean',
    }
    grouped = occupation_analysis.groupby([onet_occupation_code_var, onet_occupation_title_var])
    aggregated = grouped.agg(mean_spec).reset_index()

    # Bring in the SOC levels (used for fixed effects); clashing right-hand
    # columns are tagged with '_drop' so they can be discarded afterwards
    with_soc = aggregated.merge(
        SOC_mappings,
        how='left',
        on=onet_occupation_code_var,
        suffixes=('', '_drop'),
    )
    is_clash = with_soc.columns.str.endswith('_drop')

    return with_soc.loc[:, ~is_clash]

In [6]:
# Build the SOC lookup table that is merged onto the occupation-level results later
# Input: ONET_cleaned_tasks.csv from the computed_objects folder
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

# Keep only the relevant columns: the occupation identifiers plus the
# code/title pair of every SOC aggregation level
soc_mapping_columns = [
    'O*NET-SOC Code', 'Occupation Title',
    'Major_Group_Code', 'Major_Group_Title',
    'Minor_Group_Code', 'Minor_Group_Title',
    'Broad_Occupation_Code', 'Broad_Occupation_Title',
    'Detailed_Occupation_Code', 'Detailed_Occupation_Title',
]

# One row per (O*NET-SOC code, chosen aggregation-level code) pair
SOC_mappings = (
    ONET.loc[:, soc_mapping_columns]
    .copy()
    .drop_duplicates(subset=['O*NET-SOC Code', onet_occupation_code_var])
)

In [7]:
import pandas as pd
import statsmodels.formula.api as smf

# Regression terms kept in the tidy output; all other terms (e.g. the
# intercept and the fixed-effect dummies) are filtered out
KEEP_TERMS = ["fragmentation_index", "ai_exposure"]


def results_to_tidy_df(res, *, x, definition, model_name, keep_terms=KEEP_TERMS):
    """Flatten a fitted regression into one row per retained coefficient.

    Args:
        res: fitted results object exposing `params`, `bse`, `tvalues`,
            `pvalues` (Series indexed by term) and `nobs`, `rsquared`,
            `rsquared_adj`.
        x: prompt identifier, stored in the 'prompt' column.
        definition: fragmentation-index definition, stored in 'definition'.
        model_name: model label, stored in 'model'.
        keep_terms: terms to retain; defaults to KEEP_TERMS.

    Returns:
        DataFrame with columns term, coef, se, t, p, prompt, definition,
        model, nobs, r2, r2_adj. Row labels are the positions of the retained
        terms in `res.params`.
    """
    estimates = res.params
    tidy = pd.DataFrame({
        "term": estimates.index,
        "coef": estimates.values,
        "se": res.bse.values,
        "t": res.tvalues.values,
        "p": res.pvalues.values,
    })

    # Drop every coefficient that is not of interest, then tag the survivors
    # with the run identifiers and the model-level fit statistics
    wanted = tidy["term"].isin(keep_terms)
    return tidy.loc[wanted].assign(
        prompt=x,
        definition=definition,
        model=model_name,
        nobs=int(res.nobs),
        r2=float(res.rsquared),
        r2_adj=float(res.rsquared_adj),
    )


master_results = []   # one tidy DataFrame per (prompt, definition, model); concatenated once at the end


# Prompt index 0: the original labelled data
input_file_path_list = [f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv"]

# Prompt indices 1-10: the robustness re-runs. The file with suffix 0 is
# ignored because it is a repetition of the original prompts.
input_file_path_list += [
    f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT_{prompt_id}.csv"
    for prompt_id in range(1, 11)
]


for x, input_file_path in enumerate(input_file_path_list):
    merged_data = pd.read_csv(input_file_path)

    # # filter occupations
    # frequent_tasks_per_occupation_threshold = 3
    # occupation_task_counts = merged_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
    # valid_occupations = occupation_task_counts[occupation_task_counts >= frequent_tasks_per_occupation_threshold].index
    # merged_data = merged_data[merged_data['O*NET-SOC Code'].isin(valid_occupations)].reset_index(drop=True)

    # The occupation-level label shares do not depend on the fragmentation
    # definition, so they are computed once per input file rather than once
    # per definition
    occupation_label_shares = create_occupation_analysis(merged_data, 'O*NET-SOC Code', 'Occupation Title')

    for definition in [1, 2, 3, 4]:
        fi_df = construct_fragmentation_index(merged_data, desired_definition=definition)
        occupation_analysis = merge_fragmentation_with_occupation_analysis(fi_df, occupation_label_shares, SOC_mappings, onet_occupation_code_var)
        occupation_analysis_aggregated = aggregate_occupation_analysis(occupation_analysis, onet_occupation_code_var, onet_occupation_title_var, SOC_mappings, ai_exposure_var)

        # Use a fixed regressor name so the formulas and KEEP_TERMS do not depend on ai_exposure_var
        occupation_analysis_aggregated = occupation_analysis_aggregated.rename(columns={ai_exposure_var: 'ai_exposure'})

        # Standard errors are clustered on the aggregated occupation code.
        # NOTE(review): SOC_mappings holds one row per (O*NET-SOC code, aggregated code)
        # pair and is merged on the aggregated code alone inside aggregate_occupation_analysis,
        # so an aggregated occupation can appear once per matching O*NET-SOC code.
        # Clustering groups those repeats — confirm the implied weighting is intended.
        groups = occupation_analysis_aggregated[onet_occupation_code_var]

        # Model A: no fixed effects
        mod_noFE = smf.ols(
            "ai_fraction ~ fragmentation_index + ai_exposure",
            data=occupation_analysis_aggregated
        ).fit(cov_type="cluster", cov_kwds={"groups": groups, "use_correction": True, "df_correction": True})
        master_results.append(results_to_tidy_df(mod_noFE, x=x, definition=definition, model_name="noFE"))

        # Model B: SOC major group fixed effects
        mod_majorFE = smf.ols(
            "ai_fraction ~ fragmentation_index + ai_exposure + C(Major_Group_Code)",
            data=occupation_analysis_aggregated
        ).fit(cov_type="cluster", cov_kwds={"groups": groups, "use_correction": True, "df_correction": True})
        master_results.append(results_to_tidy_df(mod_majorFE, x=x, definition=definition, model_name="majorFE"))

        # Model C: SOC minor group fixed effects
        mod_minorFE = smf.ols(
            "ai_fraction ~ fragmentation_index + ai_exposure + C(Minor_Group_Code)",
            data=occupation_analysis_aggregated
        ).fit(cov_type="cluster", cov_kwds={"groups": groups, "use_correction": True, "df_correction": True})
        master_results.append(results_to_tidy_df(mod_minorFE, x=x, definition=definition, model_name="minorFE"))

# Stack everything into one master table and persist it for the plotting step
master_df = pd.concat(master_results, ignore_index=True)
master_df.to_csv(f"{output_data_path}/fragmentation_index_robustness_combined.csv", index=False)

In [8]:
# master_df = pd.read_csv(f"{output_data_path}/fragmentation_index_robustness_combined.csv")

# Plot results
# Maps a fragmentation-index definition number as used in this notebook (key)
# to an EFI number (value) — presumably the numbering used in the writeup; confirm
DEF_TO_EFI = {1: 3, 2: 4, 3: 1, 4: 2}

# Display name of each regression term (y-axis labels)
TERM_LABEL  = {"ai_exposure": "AI Exposure", "fragmentation_index": "EFI"}
# Display name of each model key (panel titles)
MODEL_LABEL = {
    "noFE": "No Fixed Effects",
    "majorFE": "SOC Major Group Fixed Effects",
    "minorFE": "SOC Minor Group Fixed Effects",
}
# Marker / error-bar colour of each regression term
TERM_COLOR  = {"ai_exposure": "#1f77b4", "fragmentation_index": "#C49000"}  # blue, dark yellow


def plot_six_coeffs(
    master_df,
    definition=1,
    models=("noFE", "majorFE", "minorFE"),
    terms=("ai_exposure", "fragmentation_index"),
    ylims_by_def=None,                  # {def: {term: (ymin, ymax)}} or {def: (ymin, ymax)} or {term: (ymin, ymax)}
    ci_mult=1.645,
    output_plot_path=".",
    dot_ms=6,
    original_ms=7.5,
    original_color="red",
    zero_color="#2b2b2b",               # darker grey-ish than default black
    legend_fs=13,
    title_fs=16,
    label_fs=14,
):
    """Plot coefficient estimates across prompts as a terms-by-models panel grid.

    One row of panels per entry of `terms`, one column per entry of `models`.
    Each panel shows the coefficient of every prompt with an error bar of
    +/- `ci_mult` standard errors, highlights prompt 0 separately, and draws
    dashed lines at the mean coefficient (across all prompts shown) and at zero.
    The figure is written to
    f"{output_plot_path}/fragmentation_index_robustness_definition_{definition}.png"
    and then closed.

    Args:
        master_df: tidy results with columns 'prompt', 'definition', 'term',
            'model', 'coef' and 'se'. It is copied, not modified.
        definition: fragmentation-index definition to plot.
        models: model keys, one panel column each.
        terms: regression terms, one panel row each.
        ylims_by_def: optional y-limits, given as {definition: {term: (lo, hi)}},
            {definition: (lo, hi)} or {term: (lo, hi)}.
        ci_mult: standard-error multiplier for the error bars. The confidence
            level printed in the legend is the two-sided normal coverage
            implied by this multiplier (1.645 gives "90% CI").
        output_plot_path: directory the PNG is written to.
        dot_ms, original_ms: marker sizes of the robustness prompts / prompt 0.
        original_color: colour used for prompt 0.
        zero_color: colour of the zero reference line.
        legend_fs, title_fs, label_fs: font sizes.

    Returns:
        None.
    """
    import math  # local import: math is not among the notebook's top-level imports

    # Derive the legend text from ci_mult so it stays truthful when the
    # multiplier is overridden (e.g. 1.96 -> "95% CI")
    ci_pct = 100 * math.erf(ci_mult / math.sqrt(2))
    ci_text = f"{ci_pct:.0f}% CI"

    df = master_df.copy()
    df["prompt"] = df["prompt"].astype(int)

    df = df[(df["definition"] == definition) & df["term"].isin(terms) & df["model"].isin(models)].copy()
    df = df.sort_values(["term", "model", "prompt"])

    # squeeze=False always yields a 2-D axes array, even for a single row or column
    fig, axes = plt.subplots(
        nrows=len(terms), ncols=len(models),
        figsize=(6 * len(models), 4.5 * len(terms)),
        sharex=True,
        squeeze=False,
    )

    def _resolve_term_ylim(term):
        """Return the (ymin, ymax) configured for `term`, or None."""
        if not isinstance(ylims_by_def, dict):
            return None
        # Form 1: keyed by definition, holding either per-term limits or one pair
        if definition in ylims_by_def:
            per_definition = ylims_by_def[definition]
            if isinstance(per_definition, dict):
                return per_definition.get(term)
            return per_definition
        # Form 2: keyed directly by term
        return ylims_by_def.get(term)

    for r, term in enumerate(terms):
        term_color = TERM_COLOR.get(term, "#1f77b4")
        term_disp = TERM_LABEL.get(term, term)
        term_ylim = _resolve_term_ylim(term)  # identical for every panel of the row

        for c, model in enumerate(models):
            ax = axes[r, c]
            d = df[(df["term"] == term) & (df["model"] == model)].sort_values("prompt")

            if d.empty:
                # Keep the column title but replace the panel with a placeholder
                if r == 0:
                    ax.set_title(MODEL_LABEL.get(model, model), fontsize=title_fs)
                ax.text(0.5, 0.5, "No data", ha="center", va="center")
                ax.set_axis_off()
                continue

            x = d["prompt"].to_numpy()
            y = d["coef"].to_numpy()
            ci = ci_mult * d["se"].to_numpy()
            yerr = np.vstack([ci, ci])  # symmetric lower/upper error bars

            is_main = (x == 0)
            is_robustness = ~is_main

            # Robustness prompts (every prompt other than 0)
            if is_robustness.any():
                ax.errorbar(
                    x[is_robustness], y[is_robustness], yerr=yerr[:, is_robustness],
                    fmt="o", ms=dot_ms, color=term_color, ecolor=term_color,
                    elinewidth=1.6, capsize=3.5, capthick=1.6, alpha=0.95,
                    zorder=3, label=f"Robustness Prompts + {ci_text}",
                )

            # Prompt 0 highlighted, drawn on top of the others
            if is_main.any():
                ax.errorbar(
                    x[is_main], y[is_main], yerr=yerr[:, is_main],
                    fmt="o", ms=original_ms, color=original_color, ecolor=original_color,
                    elinewidth=1.8, capsize=3.5, capthick=1.8, alpha=1.0,
                    zorder=5, label=f"Main Prompt + {ci_text}",
                )

            # Mean + zero lines (the mean includes prompt 0)
            y_mean = float(np.nanmean(y))
            ax.axhline(y_mean, color=term_color, linestyle="--", lw=2, alpha=0.9,
                       label=f"Mean (across prompts) = {y_mean:.2f}")
            ax.axhline(0, color=zero_color, linestyle="--", lw=1.6, alpha=0.85)

            if term_ylim is not None:
                ax.set_ylim(*term_ylim)

            # Titles only on the top row
            if r == 0:
                ax.set_title(MODEL_LABEL.get(model, model), fontsize=title_fs)
            else:
                ax.set_title("")

            # y-axis label and ticks only on the left-most column
            if c == 0:
                ax.set_ylabel(f"Estimated {term_disp} Coefficient", fontsize=label_fs)
            else:
                ax.set_ylabel("")
                ax.tick_params(axis="y", which="both", left=False, labelleft=False)

            ax.set_xlabel("" if r == 0 else "GPT Prompt", fontsize=label_fs)
            ax.legend(loc="best", fontsize=legend_fs)

    # Operate on this figure explicitly rather than on pyplot's current figure
    fig.tight_layout()
    fig.savefig(f"{output_plot_path}/fragmentation_index_robustness_definition_{definition}.png", dpi=600, bbox_inches="tight")
    plt.close(fig)


# =========================
# USAGE
# =========================
# Definitions 1 and 2 use a wider y-range than definitions 3 and 4; within a
# definition both terms share the same limits
_WIDE_YLIM = (-3.1, 1.1)
_NARROW_YLIM = (-1.1, 1.1)

YLIMS_BY_DEF = {
    definition_id: {"ai_exposure": ylim, "fragmentation_index": ylim}
    for definition_id, ylim in (
        (1, _WIDE_YLIM),
        (2, _WIDE_YLIM),
        (3, _NARROW_YLIM),
        (4, _NARROW_YLIM),
    )
}

# One figure per fragmentation-index definition
for d in range(1, 5):
    plot_six_coeffs(master_df, definition=d, ylims_by_def=YLIMS_BY_DEF, output_plot_path=output_plot_path)