#### By: Peyman Shahidi
#### Created: Dec 16, 2025
#### Last Edit: Jan 31, 2026

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Render floats with thousands separators and two decimals, e.g. 1000 is shown as 1,000.00
pd.options.display.float_format = "{:,.2f}".format

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings

# Silence every warning for the rest of the session, and let pandas print
# up to 200 rows before truncating a displayed DataFrame
warnings.filterwarnings(action="ignore")
pd.options.display.max_rows = 200

In [2]:
# Project root, relative to the notebook's working directory
main_folder_path = ".."

# Input: one sub-folder per occupation holding the per-prompt task-ordering CSVs
input_data_path = f"{main_folder_path}/data/computed_objects/tasks_sequences_robustness"

# Output: cached tables produced by this notebook. Anchored at the project root
# (not at input_data_path) so the cache does not end up nested inside the folder
# that is scanned for occupation inputs.
# NOTE: caches written under the old nested location are not picked up here and
# will be recomputed on the next run.
output_data_path = f"{main_folder_path}/data/computed_objects/GPT_task_sequences_overlap_analysis"

# Output: figures for the write-up
output_plot_path = f"{main_folder_path}/writeup/plots/GPT_task_sequences_overlap_analysis"

In [3]:
# Create the output directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True creates missing parents and is a no-op for existing folders,
    # without the race between a separate existence check and the creation
    os.makedirs(path, exist_ok=True)

### Main Code

In [4]:
from pathlib import Path
import pandas as pd
import re

# Load the combined table of task orderings from the cache if it exists;
# otherwise build it from the per-occupation folders under input_data_path
# and cache it for later runs.
combined_csv_path = Path(output_data_path) / "task_orderings_combined.csv"

if combined_csv_path.exists():
    master_df = pd.read_csv(combined_csv_path)
else:
    base_dir = Path(input_data_path)
    dfs = []

    # Sorted traversal keeps the row order of the cached CSV reproducible:
    # iterdir() and glob() yield entries in arbitrary, filesystem-dependent order.
    for folder in sorted(base_dir.iterdir()):
        if not folder.is_dir():
            continue

        occupation = folder.name

        # Expect: <occupation>_X.csv, where X is the prompt number.
        # Compiled once per folder instead of once per file.
        file_pattern = re.compile(rf"{re.escape(occupation)}_(\d+)\.csv$")

        for csv_file in sorted(folder.glob("*.csv")):
            match = file_pattern.search(csv_file.name)
            if match is None:
                continue  # skip files that don't match the convention

            prompt_number = int(match.group(1))

            df = pd.read_csv(csv_file)
            df["prompt_number"] = prompt_number

            dfs.append(df)

    if not dfs:
        # pd.concat([]) would fail with an opaque "No objects to concatenate"
        raise FileNotFoundError(
            f"No '<occupation>_<prompt number>.csv' files found under {base_dir}"
        )

    # Combine everything
    master_df = pd.concat(dfs, ignore_index=True)

    # Save
    master_df.to_csv(combined_csv_path, index=False)

In [5]:
from scipy.stats import kendalltau
import itertools
import pandas as pd

def find_bad_prompts(df_occ):
    """
    Return the set of prompt numbers in which some Task ID appears more than once.

    df_occ must contain the columns "prompt_number" and "Task ID".
    """
    # Number of rows for every (prompt, task) combination
    occurrences = df_occ.groupby(["prompt_number", "Task ID"]).size()

    # Keep only the combinations that occur repeatedly
    repeated = occurrences[occurrences > 1]

    flagged_prompts = repeated.index.get_level_values("prompt_number").unique()
    return set(flagged_prompts)


def compute_pairwise_kendall_drop_bad(df_occ):
    """
    Compute Kendall's tau between every pair of prompt rankings in df_occ.

    Prompts flagged by find_bad_prompts (a Task ID listed more than once) are
    dropped first. Each remaining pair of prompts is compared on the Task IDs
    they have in common; pairs sharing fewer than two tasks are skipped.

    Parameters
    ----------
    df_occ : pandas.DataFrame
        Must contain the columns "prompt_number", "Task ID" and "Task Position".

    Returns
    -------
    pandas.DataFrame
        One row per compared pair, with columns "prompt_1", "prompt_2" and
        "kendall_tau". The frame always carries these columns; it has no rows
        when fewer than two usable prompts remain or no pair shares at least
        two tasks, so callers can rely on `.empty` and on a stable schema.
    """
    result_columns = ["prompt_1", "prompt_2", "kendall_tau"]

    bad_prompts = find_bad_prompts(df_occ)
    df_clean = df_occ[~df_occ["prompt_number"].isin(bad_prompts)]

    # need at least 2 prompts
    if df_clean["prompt_number"].nunique() < 2:
        return pd.DataFrame(columns=result_columns)

    # Task ID -> Task Position, one Series per prompt. No sorting is needed:
    # both Series are re-indexed by the shared Task IDs below, which aligns the
    # two rankings element by element.
    rankings = {
        p: g.set_index("Task ID")["Task Position"]
        for p, g in df_clean.groupby("prompt_number")
    }

    results = []

    for p1, p2 in itertools.combinations(rankings, 2):
        r1 = rankings[p1]
        r2 = rankings[p2]

        common_tasks = r1.index.intersection(r2.index)

        # A rank correlation is undefined for fewer than two shared tasks
        if len(common_tasks) < 2:
            continue

        # Only the correlation is kept; the p-value is discarded
        tau, _ = kendalltau(
            r1.loc[common_tasks].values,
            r2.loc[common_tasks].values
        )

        results.append({
            "prompt_1": p1,
            "prompt_2": p2,
            "kendall_tau": tau
        })

    return pd.DataFrame(results, columns=result_columns)



# Load the pairwise Kendall tau results from the cache if it exists; otherwise
# compute them occupation by occupation from master_df and cache the result.
kendall_results_path = Path(output_data_path) / "GPT_task_sequences_kendall_results.csv"

if kendall_results_path.exists():
    kendall_results = pd.read_csv(kendall_results_path)
else:
    all_results = []

    for occ, df_occ in master_df.groupby("Occupation Title"):
        res = compute_pairwise_kendall_drop_bad(df_occ)
        if not res.empty:
            res["Occupation Title"] = occ
            all_results.append(res)

    if not all_results:
        # pd.concat([]) would raise an opaque "No objects to concatenate"
        raise ValueError(
            "No occupation produced any pairwise Kendall tau result; "
            "check that master_df holds at least two valid prompts per occupation."
        )

    kendall_results = pd.concat(all_results, ignore_index=True)
    kendall_results.to_csv(kendall_results_path, index=False)

In [6]:
# Occupation titles and codes from the merged dataset, used to attach the
# detailed occupation title/code to the occupation-level dataset below
crosswalk_columns = ["Detailed_Occupation_Title", "Detailed_Occupation_Code", "O*NET-SOC Code"]
merged_data = (
    pd.read_csv(f"{main_folder_path}/data/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv")
    [crosswalk_columns]
    .drop_duplicates()
)

# Occupation-level stats, with the detailed occupation title and code merged back in
occupation_analysis = pd.read_csv(
    f"{main_folder_path}/data/computed_objects/fragmentationIndex/occupation_analysis_with_fragmentationIndex_def4.csv"
).merge(merged_data, on="O*NET-SOC Code", how="left")

# 0/1 indicators: 1 when the occupation's value is strictly above the column median.
# Missing values compare as False and therefore get 0.
for metric in ["fragmentation_index", "ai_fraction", "human_E1_fraction", "human_aiExposure_fraction"]:
    is_above_median = occupation_analysis[metric] > occupation_analysis[metric].median()
    occupation_analysis[f"{metric}_above_median"] = is_above_median.astype(int)
# occupation_analysis.head()

In [7]:
import matplotlib.pyplot as plt

# Per-occupation mean / min / max of the pairwise Kendall tau values,
# ordered by the mean (all prompt pairs are used; no filter on prompt_1)
tau_stats = kendall_results.groupby("Occupation Title")["kendall_tau"].agg(["mean", "min", "max"])
summary = tau_stats.reset_index().sort_values("mean")

# Merge with occupation_analysis to get the above vs below median indicators
indicator_columns = [
    "O*NET-SOC Code",
    "Occupation Title",
    'fragmentation_index_above_median',
    'ai_fraction_above_median',
    'human_E1_fraction_above_median',
    'human_aiExposure_fraction_above_median',
]
summary = summary.merge(
    occupation_analysis[indicator_columns],
    left_on="Occupation Title",
    right_on="Occupation Title",
    how="left",
)
summary.head()

Unnamed: 0,Occupation Title,mean,min,max,O*NET-SOC Code,fragmentation_index_above_median,ai_fraction_above_median,human_E1_fraction_above_median,human_aiExposure_fraction_above_median
0,First-Line Supervisors of Non-Retail Sales Wor...,0.08,-0.67,0.75,41-1012.00,0.0,0.0,1.0,1.0
1,Biofuels Production Managers,0.12,-0.47,1.0,11-3051.03,1.0,0.0,0.0,0.0
2,Financial Managers,0.13,-0.47,0.73,11-3031.00,0.0,1.0,0.0,1.0
3,"Transportation, Storage, and Distribution Mana...",0.13,-0.42,0.74,11-3071.00,0.0,1.0,1.0,1.0
4,Funeral Home Managers,0.14,-0.54,0.9,11-9171.00,0.0,1.0,1.0,1.0


## Plot random 75 occupations and distribution of full + above/below median subsets

In [8]:
# Randomly subset (up to) 75 occupations for plotting. Capping n at the number
# of available occupations keeps sample() from raising on a small summary table;
# with 75 or more occupations the draw is the same as for n=75.
summary_subset = summary.sample(n=min(75, len(summary)), random_state=42).sort_values("mean")

# Indicator column -> figure title (the title is currently not drawn, see below)
vars_and_titles = {
    "fragmentation_index_above_median": "Robustness to GPT Prompts by Empirical Fragmentation Index (Definition 2)",
    "ai_fraction_above_median": "Robustness to GPT Prompts by Share of AI-Executed Occupation Tasks",
    "human_E1_fraction_above_median": "Robustness to GPT Prompts by Share of AI-Exposed (E1) Occupation Tasks",
    # "human_aiExposure_fraction_above_median": "Robustness to GPT Prompts by Share of AI-Exposed Occupation Tasks (E1 and E2)",
}

# Suffix stripped from the indicator name to build the output file name
indicator_suffix = "_above_median"

for var, title in vars_and_titles.items():
    # Keep only occupations with a known classification for this variable.
    # Rows left unmatched by the merge carry NaN in the indicator and must not
    # be counted (or colored) as "below median".
    plot_df = summary_subset.dropna(subset=[var])
    if plot_df.empty:
        print(f"No classified occupations for {var}; skipping plot.")
        continue

    plt.figure(figsize=(12, 16))

    # gray ranges: min-to-max Kendall tau of each occupation
    plt.hlines(
        y=plot_df["Occupation Title"],
        xmin=plot_df["min"],
        xmax=plot_df["max"],
        color="lightgray",
        zorder=1
    )

    # masks
    mask_red = plot_df[var] == 1
    mask_blue = plot_df[var] == 0

    # shares (among the classified occupations shown in the figure)
    n_total = len(plot_df)
    share_red = mask_red.sum() / n_total
    share_blue = mask_blue.sum() / n_total

    label_blue = f"Below median ({share_blue:.1%})"
    label_red = f"Above median ({share_red:.1%})"

    # scatter: one dot per occupation at its mean Kendall tau
    plt.scatter(
        plot_df.loc[mask_blue, "mean"],
        plot_df.loc[mask_blue, "Occupation Title"],
        color="steelblue",
        label=label_blue,
        zorder=2
    )

    plt.scatter(
        plot_df.loc[mask_red, "mean"],
        plot_df.loc[mask_red, "Occupation Title"],
        color="red",
        label=label_red,
        zorder=3
    )

    # group means
    mean_blue = plot_df.loc[mask_blue, "mean"].mean()
    mean_red = plot_df.loc[mask_red, "mean"].mean()

    plt.axvline(
        mean_blue,
        color="steelblue",
        linestyle=":",
        linewidth=2,
        label=f"Below median mean = {mean_blue:.2f}"
    )
    plt.axvline(
        mean_red,
        color="red",
        linestyle=":",
        linewidth=2,
        label=f"Above median mean = {mean_red:.2f}"
    )

    # zero line
    plt.axvline(0, color="black", linestyle="--", linewidth=1)

    plt.xlabel("Kendall τ across GPT Prompt Formulations", fontsize=14)
    # plt.title(title)
    plt.legend(loc="upper left", fontsize=12)
    plt.tight_layout()
    # Leave room on the left for the long occupation titles on the y axis
    plt.subplots_adjust(left=0.45)
    plt.savefig(
        f"{output_plot_path}/GPT_task_sequence_robustness_by_{var[:-len(indicator_suffix)]}.png",
        dpi=300,
        bbox_inches="tight"
    )
    plt.close()

In [9]:
# Histogram of the within-occupation mean Kendall tau over the full sample
overall_mean = summary["mean"].mean()

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(summary["mean"], bins=20, edgecolor='black')

# Red dashed line at the full-sample mean
ax.axvline(
    overall_mean,
    color="red",
    linestyle="--",
    linewidth=2,
    label=f"Full Sample Mean = {overall_mean:.2f} (n={len(summary)})",
)

ax.set_xlabel("Mean Kendall τ (within Occupation)", fontsize=16)
ax.set_ylabel("Number of Occupations", fontsize=16)
ax.set_xlim(-1.02, 1.02)

# Black dashed reference line at zero
ax.axvline(0, color="black", linestyle="--", linewidth=1)

ax.legend(loc="upper left", fontsize=15)
fig.tight_layout()
fig.savefig(f"{output_plot_path}/GPT_task_sequence_robustness_distribution.png", dpi=300)
plt.close(fig)

In [10]:
# Plot distribution by above/below median for different metrics (from main data)
vars_and_labels = {
    "fragmentation_index_above_median": "EFI (Definition 2)",
    "human_E1_fraction_above_median": "AI Exposure",
    "ai_fraction_above_median": "AI Execution"
}

# Kendall tau is bounded in [-1, 1]. Binning both groups over this fixed range
# gives them identical bin edges, so the two overlaid histograms are directly
# comparable (separate bins=20 calls would bin each group over its own data range).
tau_range = (-1, 1)

# Suffix stripped from the indicator name to build the output file name
indicator_suffix = "_above_median"

for var, label in vars_and_labels.items():
    fig, ax = plt.subplots(figsize=(8, 6))

    # Drop occupations whose classification for this variable is unknown (NaN)
    summary_clean = summary.dropna(subset=[var])

    # Split by above/below median
    below_median = summary_clean[summary_clean[var] == 0]['mean']
    above_median = summary_clean[summary_clean[var] == 1]['mean']

    # Calculate means
    mean_below = below_median.mean()
    mean_above = above_median.mean()

    # Plot histograms on shared bin edges
    ax.hist(below_median, bins=20, range=tau_range, alpha=0.6, color='steelblue', edgecolor='black',
            label=f'Below Median {label}\n(n={len(below_median)}, mean={mean_below:.2f})')
    ax.hist(above_median, bins=20, range=tau_range, alpha=0.6, color='red', edgecolor='black',
            label=f'Above Median {label}\n(n={len(above_median)}, mean={mean_above:.2f})')

    # Add vertical lines for means
    ax.axvline(mean_below, color='steelblue', linestyle='--', linewidth=2)
    ax.axvline(mean_above, color='red', linestyle='--', linewidth=2)

    # Add zero reference line
    ax.axvline(0, color='black', linestyle='--', linewidth=1)

    ax.set_xlabel("Mean Kendall τ (within Occupation)", fontsize=16)
    ax.set_ylabel("Number of Occupations", fontsize=16)
    ax.set_xlim(-1.02, 1.02)
    ax.legend(loc="upper left", fontsize=15)
    plt.tight_layout()
    plt.savefig(f"{output_plot_path}/GPT_task_sequence_robustness_by_{var[:-len(indicator_suffix)]}_distribution.png", dpi=300)
    plt.close()