#### By: Peyman Shahidi
#### Created: Jan 30, 2026
#### Last Edit: Jan 30, 2026

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Display floats comma-separated with two decimals, e.g. 1000 is shown as 1,000.00
pd.options.display.float_format = "{:,.2f}".format

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings

# Blanket filter: every warning category is ignored from here on
warnings.simplefilter('ignore')

# Allow up to 200 rows to be printed before pandas truncates a frame
pd.options.display.max_rows = 200

In [2]:
# Project-relative locations: inputs are read from <main>/data, and the merged
# tables produced by this notebook are written to a sub-folder of computed_objects
main_folder_path = ".."
input_data_path = main_folder_path + "/data"
output_data_path = input_data_path + '/computed_objects/ONET_Eloundou_Anthropic_GPT'

In [3]:
# Create output directories if they don't exist
import os

for path in [output_data_path]:
    # exist_ok=True builds any missing parent folders and does nothing when the
    # folder is already present, so no separate os.path.exists() check (and no
    # check-then-create race) is needed
    os.makedirs(path, exist_ok=True)

# 1) Read O*NET Dataset

In [4]:
# Load the cleaned O*NET task table
onet_tasks_file = f'{input_data_path}/computed_objects/ONET_cleaned_tasks.csv'
ONET = pd.read_csv(onet_tasks_file)

# The DWA columns are dropped to avoid double counting:
# in ~4k instances the same task is mapped to multiple DWAs, so without these
# columns such rows become exact duplicates and can be collapsed below
dwa_columns = ['DWA ID', 'DWA Title']
ONET = ONET.drop(columns=dwa_columns)

# Collapse exact duplicate rows and report how many were removed
rows_before = len(ONET)
print(f"Number of rows before removing duplicates: {rows_before:,}")
ONET = ONET.drop_duplicates(ignore_index=True)
rows_after = len(ONET)
print(f"Number of rows after removing duplicates: {rows_after:,}")
print(f"Duplicates removed: {rows_before - rows_after}")

# Final size of the O*NET table
print(f"Number of rows in ONET dataset: {len(ONET):,}")

Number of rows before removing duplicates: 22,310
Number of rows after removing duplicates: 17,953
Duplicates removed: 4357
Number of rows in ONET dataset: 17,953


# 2) Read and Merge with Eloundou et al.'s AI Exposure Dataset

In [5]:
# Load the full label set of the "GPTs are GPTs" dataset (tab-separated file)
gpts_full_labels = pd.read_csv(f'{input_data_path}/GPTs-are-GPTs-main/data/full_labelset.tsv', sep="\t")

# Keep relevant columns only. The explicit .copy() makes the subset an independent
# frame, so the column assignment below cannot raise a SettingWithCopyWarning
# (which would be invisible here if warnings are globally silenced)
gpts_full_labels = gpts_full_labels[['O*NET-SOC Code', 'Task ID', 'Task', 'Task Type', 'Title', 'gpt4_exposure', 'human_labels']].copy()

# Convert Task ID to integer
gpts_full_labels['Task ID'] = gpts_full_labels['Task ID'].astype(int)

# Remove apostrophes from every string cell; non-string cells (numbers, NaN) pass
# through unchanged. Presumably this aligns the text with the cleaned O*NET titles
# used as merge keys later -- TODO confirm.
# Column-wise Series.map is used instead of DataFrame.applymap, which has been
# deprecated since pandas 2.1 and is not available under that name going forward
gpts_full_labels = gpts_full_labels.apply(
    lambda column: column.map(lambda x: x.replace("'", "") if isinstance(x, str) else x)
)

# Rename columns to the names used in the O*NET table
gpts_full_labels = gpts_full_labels.rename(columns={
    'Task': 'Task Title',
    'Title': 'Occupation Title'
})

# Print length of dataset
print(f"Number of rows in GPTs full labels dataset: {len(gpts_full_labels):,}")

Number of rows in GPTs full labels dataset: 19,265


In [6]:
# Attach the exposure labels (gpt4_exposure, human_labels) to every O*NET task with a left join.
# Because this cell overwrites ONET, label columns left over from a previous run of the
# cell are dropped first; otherwise a re-run would create suffixed duplicates
# (gpt4_exposure_x / gpt4_exposure_y) and the lookup below would fail.
# On a first run the drop is a no-op.
merge_keys = ['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'Task Title', 'Task Type']
label_columns = ['gpt4_exposure', 'human_labels']
ONET = (
    ONET
    .drop(columns=label_columns, errors='ignore')
    .merge(gpts_full_labels, on=merge_keys, how='left')
)

# Check how many tasks were not matched (no gpt4_exposure value after the join)
unmatched_tasks = ONET[ONET['gpt4_exposure'].isna()]
print(f"Number of unmatched tasks: {len(unmatched_tasks):,}")

Number of unmatched tasks: 791


# 3) Read and Merge with Anthropic's AI Use Dataset

In [7]:
# Load Anthropic's automation-vs-augmentation table (one row per task)
anthropic_exposure = pd.read_csv(f'{input_data_path}/Anthropic_EconomicIndex/automation_vs_augmentation_by_task.csv')

# Drop rows whose 'filtered' flag equals 1 and report the row count before/after
print(f"Number of rows before filtering: {len(anthropic_exposure):,}")
anthropic_exposure = anthropic_exposure[anthropic_exposure['filtered'] != 1].reset_index(drop=True)
print(f"Number of rows after filtering: {len(anthropic_exposure):,}")

# Create new columns:
# Sum feedback loop and directive into Automation
# Sum validation, iteration, and learning into Augmentation
# Plain column arithmetic gives the same values as a row-wise apply, avoids a
# Python-level call per row, and also behaves correctly on an empty frame
anthropic_exposure['automation'] = anthropic_exposure['feedback_loop'] + anthropic_exposure['directive']
anthropic_exposure['augmentation'] = (
    anthropic_exposure['validation']
    + anthropic_exposure['task_iteration']
    + anthropic_exposure['learning']
)

# Label a task by whichever of its two shares (automation, augmentation) is larger
def assign_label(row):
    """Return 'Automation' or 'Augmentation' for one task row.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys 'automation' and 'augmentation'.

    Returns
    -------
    str or None
        'Automation' when the automation share equals the larger of the two
        values (this includes exact ties), 'Augmentation' when the augmentation
        share does, and None when neither equality holds (for example when the
        automation value is NaN, since NaN never compares equal).
    """
    automation_share = row['automation']
    augmentation_share = row['augmentation']
    dominant_share = max(automation_share, augmentation_share)

    # Automation is checked first, so ties resolve to 'Automation'
    candidates = (
        ('Automation', automation_share),
        ('Augmentation', augmentation_share),
    )
    for label, share in candidates:
        if dominant_share == share:
            return label
    return None

# Classify every task row, then keep only the task name, the two shares and the label
task_labels = anthropic_exposure.apply(assign_label, axis=1)
anthropic_exposure = anthropic_exposure.assign(label=task_labels)

kept_columns = ['task_name', 'automation', 'augmentation', 'label']
anthropic_exposure = anthropic_exposure[kept_columns]

Number of rows before filtering: 3,364
Number of rows after filtering: 2,298


In [8]:
# Report how many distinct task descriptions each source contains
onet_unique_tasks = ONET['Task Title'].nunique()
print(f"Number of unique tasks in ONET dataset: {onet_unique_tasks:,}")

anthropic_unique_tasks = anthropic_exposure['task_name'].nunique()
print(f"Number of unique tasks in Anthropic exposure dataset: {anthropic_unique_tasks:,}")


# Join key: the O*NET task title, lower-cased and stripped of surrounding whitespace
ONET["task_normalized"] = ONET["Task Title"].str.lower().str.strip()


# Left-join the Anthropic shares and labels onto every O*NET row via the normalized title
anthropic_columns = ['task_name', 'automation', 'augmentation', 'label']
merged_data = ONET.merge(
    anthropic_exposure[anthropic_columns],
    left_on="task_normalized",
    right_on="task_name",
    how="left",
)

# Rows that end up without a label after the join are labelled 'Manual'
merged_data['label'] = merged_data['label'].fillna('Manual')

# Show the resulting label distribution and the total number of rows
print("\nDistribution of labels after filling NaN values with 'Manual':")
print(merged_data['label'].value_counts())
print(f"Total tasks: {len(merged_data):,}")

# The two join-key helper columns are not needed beyond this point
merge_helper_columns = ['task_normalized', 'task_name']
merged_data = merged_data.drop(columns=merge_helper_columns)

Number of unique tasks in ONET dataset: 16,913
Number of unique tasks in Anthropic exposure dataset: 2,298

Distribution of labels after filling NaN values with 'Manual':
label
Manual          15605
Augmentation     1626
Automation        722
Name: count, dtype: int64
Total tasks: 17,953


# 4) Read and Merge with GPT 5-o Mini's Task Sequence Data *from robustness prompts*

In [9]:
# For every robustness prompt folder (prompt_0 ... prompt_10): read its task-sequence
# CSVs, attach "Task Position" to the merged table, and save one CSV per prompt
for x in range(11):
    prompt_dir = f"{input_data_path}/computed_objects/tasks_sequences_robustness_restructured/prompt_{x}"
    # os.listdir returns entries in arbitrary order; sorting the file names makes the
    # concatenation order below reproducible from run to run
    task_sequence_files = sorted(f for f in os.listdir(prompt_dir) if f.endswith(".csv"))

    if not task_sequence_files:
        print(f"[prompt_{x}] No CSV files found in {prompt_dir}; skipping.")
        continue

    # Stack all CSVs of this prompt into a single frame
    task_sequence = pd.concat(
        [pd.read_csv(os.path.join(prompt_dir, file)) for file in task_sequence_files],
        ignore_index=True
    )

    print(f"[prompt_{x}] Task sequence data shape: {task_sequence.shape}")

    # Merge task sequence data with merged_data.
    # pd.merge returns a new frame and leaves merged_data untouched, so every prompt
    # starts from the same base table without needing an explicit copy
    md = pd.merge(
        merged_data,
        task_sequence[["O*NET-SOC Code", "Task ID", "Task Position"]],
        on=["O*NET-SOC Code", "Task ID"],
        how="left"
    )
    # Keep only the rows that received a Task Position for this prompt
    md = md[md["Task Position"].notna()].reset_index(drop=True)
    # NOTE(review): coercion runs after the not-null filter, so a position that cannot
    # be parsed as a number would remain in the output as <NA> -- confirm this is intended
    md["Task Position"] = pd.to_numeric(md["Task Position"], errors="coerce").astype("Int64")

    # Move the "Task Position" column so it sits directly after "Task Title"
    task_position_col = md.pop("Task Position")
    md.insert(md.columns.get_loc("Task Title") + 1, "Task Position", task_position_col)

    # Sort values before saving
    md = md.sort_values(by=["O*NET-SOC Code", "Task Position"])

    # Save merged dataset
    out_file = os.path.join(output_data_path, f"ONET_Eloundou_Anthropic_GPT_{x}.csv")
    md.to_csv(out_file, index=False)
    print(f"[prompt_{x}] Saved: {out_file}\n")

[prompt_0] Task sequence data shape: (17868, 5)
[prompt_0] Saved: ../data/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT_0.csv

[prompt_1] Task sequence data shape: (17828, 5)
[prompt_1] Saved: ../data/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT_1.csv

[prompt_2] Task sequence data shape: (17866, 5)
[prompt_2] Saved: ../data/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT_2.csv

[prompt_3] Task sequence data shape: (17826, 5)
[prompt_3] Saved: ../data/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT_3.csv

[prompt_4] Task sequence data shape: (17857, 5)
[prompt_4] Saved: ../data/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT_4.csv

[prompt_5] Task sequence data shape: (17847, 5)
[prompt_5] Saved: ../data/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT_5.csv

[prompt_6] Task sequence data shape: (17810, 5)
[prompt_6] Saved: ../data/co