#### By: Peyman Shahidi
#### Created: Oct 9, 2025

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [2]:
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
output_data_path = f'{input_data_path}/computed_objects'
output_plot_path = f"{main_folder_path}/writeup/plots/taskcounts_aiexposure"

In [3]:
# Create directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

In [4]:
# Read O*NET data
ONET = pd.read_csv(f'{input_data_path}/computed_objects/ONET_cleaned_tasks.csv')

# Remove apostrophes for consistency
ONET = ONET.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)

# Drop DWA columns to avoid double counting
# Note: In ~4k instances, the same task is mapped to multiple DWAs
ONET = ONET.drop(columns=['DWA ID', 'DWA Title'])

# Remove duplicate rows
rows_before = len(ONET)
print(f"Number of rows before removing duplicates: {rows_before:,}")
ONET = ONET.drop_duplicates().reset_index(drop=True)
rows_after = len(ONET)
print(f"Number of rows after removing duplicates: {rows_after:,}")
print(f"Duplicates removed: {rows_before - rows_after}")

# Print length of dataset
print(f"Number of rows in ONET dataset: {len(ONET):,}")

ONET.head(5)

Number of rows before removing duplicates: 22,310
Number of rows after removing duplicates: 17,953
Duplicates removed: 4357
Number of rows in ONET dataset: 17,953


Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Type,Job Zone,FT_Daily,FT_Hourly or more,FT_More than monthly,FT_More than weekly,...,Importance,Relevance,Major_Group_Code,Major_Group_Title,Minor_Group_Code,Minor_Group_Title,Broad_Occupation_Code,Broad_Occupation_Title,Detailed_Occupation_Code,Detailed_Occupation_Title
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organizations financia...,Core,5,46.67,5.26,11.04,16.19,...,4.54,94.19,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",Core,5,25.27,4.81,27.41,15.58,...,4.15,98.79,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,5,35.11,3.73,12.61,18.96,...,4.4,100.0,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,5,38.47,6.38,10.18,23.83,...,4.39,95.84,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,5,7.87,0.65,18.43,10.23,...,4.17,90.47,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives


In [5]:
# Load GPTs are GPTs full label dataset
gpts_full_labels = pd.read_csv(f'{input_data_path}/GPTs-are-GPTs-main/data/full_labelset.tsv', sep="\t")
# Drop the first column
gpts_full_labels = gpts_full_labels.drop(gpts_full_labels.columns[:1], axis=1)

# Convert Task ID to integer
gpts_full_labels['Task ID'] = gpts_full_labels['Task ID'].astype(int)

# Remove apostrophes for consistency
gpts_full_labels = gpts_full_labels.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)

# Rename columns
gpts_full_labels = gpts_full_labels.rename(columns={
    'Task': 'Task Title',
    'Title': 'Occupation Title'
})

# Print length of dataset
print(f"Number of rows in GPTs full labels dataset: {len(gpts_full_labels):,}")


gpts_full_labels.head(5)

Number of rows in GPTs full labels dataset: 19,265


Unnamed: 0,O*NET-SOC Code,Task ID,Task Title,Task Type,Occupation Title,human_exposure_agg,gpt4_exposure,gpt4_exposure_alt_rubric,gpt_3_relevant,gpt4_automation,alpha,beta,gamma,automation,human_labels
0,11-1011.00,8823,Direct or coordinate an organizations financia...,Core,Chief Executives,E0,E2,E2,False,T2,0.0,0.5,1.0,0.5,E0
1,11-1011.00,8831,Appoint department heads or managers and assig...,Core,Chief Executives,E0,E0,E0,False,T1,0.0,0.0,0.0,0.25,E0
2,11-1011.00,8825,Analyze operations to evaluate performance of ...,Core,Chief Executives,E2,E2,E2,False,T2,0.0,0.5,1.0,0.5,E2
3,11-1011.00,8826,"Direct, plan, or implement policies, objective...",Core,Chief Executives,E0,E2,E0,False,T1,0.0,0.5,1.0,0.25,E0
4,11-1011.00,8827,"Prepare budgets for approval, including those ...",Core,Chief Executives,E2,E2,E2,False,T2,0.0,0.5,1.0,0.5,E2


In [6]:
# Merge with ONET dataset to get hierarchical codes and titles
ONET = ONET.merge(gpts_full_labels, on=['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'Task Title', 'Task Type'], how='left')
ONET

# Check how many tasks were not matched
unmatched_tasks = ONET[ONET['gpt4_exposure'].isna()]
print(f"Number of unmatched tasks: {len(unmatched_tasks):,}")

Number of unmatched tasks: 0


In [7]:
# Reconstruct the occ_level exposure dataset from the original paper
def compute_exposure_shares(df, exposure_col, prefix):
    # Weight core vs. supplemental tasks (if column exists)
    df["task_weight"] = df["Task Type"].map({"Core": 1.0, "Supplemental": 0.5}).fillna(1.0)

    # One-hot exposure indicators
    df[f"{prefix}_E1"] = (df[exposure_col] == "E1").astype(int)
    df[f"{prefix}_E2"] = (df[exposure_col] == "E2").astype(int)

    # Weighted shares per occupation
    occ_level = (
        df.groupby(["O*NET-SOC Code", "Occupation Title"])
        .apply(lambda x: pd.Series({
            f"{prefix}_E1_share": (x[f"{prefix}_E1"] * x["task_weight"]).sum() / x["task_weight"].sum(),
            f"{prefix}_E2_share": (x[f"{prefix}_E2"] * x["task_weight"]).sum() / x["task_weight"].sum(),
        }))
        .reset_index()
    )

    # Composite measures
    occ_level[f"{prefix}_E1_plus_halfE2"] = occ_level[f"{prefix}_E1_share"] + 0.5 * occ_level[f"{prefix}_E2_share"]
    occ_level[f"{prefix}_E1_plus_E2"]     = occ_level[f"{prefix}_E1_share"] + occ_level[f"{prefix}_E2_share"]

    occ_level = occ_level.drop(columns=[f'{prefix}_E2_share'])

    occ_level = occ_level.rename(columns={'gpt4_E1_share': 'gpt4_alpha',
                                          'gpt4_E1_plus_halfE2': 'gpt4_beta',
                                          'gpt4_E1_plus_E2': 'gpt4_gamma',
                                          'human_E1_share': 'human_alpha',
                                          'human_E1_plus_halfE2': 'human_beta',
                                          'human_E1_plus_E2': 'human_gamma'})

    return occ_level


# Compute exposure shares for GPT-4 and human ratings
gpt4_exposure_scores = compute_exposure_shares(ONET, "gpt4_exposure", "gpt4")
human_exposure_scores = compute_exposure_shares(ONET, "human_labels", "human")

# Merge GPT-4 and human exposure scores on occupation code and title
exposure_scores_occupation = pd.merge(gpt4_exposure_scores, human_exposure_scores, 
                                     on=["O*NET-SOC Code", "Occupation Title"])

# Save the combined exposure scores to CSV
# exposure_scores_occupation.to_csv(f'{output_data_path}/exposure_scores_occupation.csv', index=False)
# print(f"Saved combined exposure scores to: {output_data_path}/exposure_scores_occupation.csv")
print(f"Number of occupations: {len(exposure_scores_occupation)}")

# Display the results
exposure_scores_occupation

Number of occupations: 873


Unnamed: 0,O*NET-SOC Code,Occupation Title,gpt4_alpha,gpt4_beta,gpt4_gamma,human_alpha,human_beta,human_gamma
0,11-1011.00,Chief Executives,0.10,0.46,0.82,0.18,0.35,0.52
1,11-1011.03,Chief Sustainability Officers,0.17,0.56,0.94,0.06,0.39,0.72
2,11-1021.00,General and Operations Managers,0.00,0.48,0.96,0.12,0.38,0.65
3,11-2011.00,Advertising and Promotions Managers,0.00,0.49,0.97,0.32,0.57,0.82
4,11-2021.00,Marketing Managers,0.06,0.50,0.94,0.22,0.58,0.94
...,...,...,...,...,...,...,...,...
868,53-7071.00,Gas Compressor and Gas Pumping Station Operators,0.23,0.27,0.31,0.00,0.08,0.15
869,53-7072.00,"Pump Operators, Except Wellhead Pumpers",0.17,0.25,0.33,0.00,0.17,0.33
870,53-7073.00,Wellhead Pumpers,0.00,0.00,0.00,0.00,0.00,0.00
871,53-7081.00,Refuse and Recyclable Material Collectors,0.18,0.20,0.21,0.11,0.23,0.36


In [8]:
# Function to compute hierarchical exposure scores
def compute_hierarchical_exposure(df, code_col, title_col, exposure_col, prefix):
    """
    Compute exposure shares at a hierarchical level (e.g., Major Group, Minor Group).
    
    Parameters:
    - df: DataFrame with task-level data
    - code_col: Column name for the hierarchical code
    - title_col: Column name for the hierarchical title
    - exposure_col: Column with exposure ratings (e.g., 'gpt4_exposure')
    - prefix: Prefix for output columns (e.g., 'gpt4')
    
    Returns:
    - DataFrame with exposure measures at the hierarchical level
    """
    # Create a copy to avoid modifying original
    df_copy = df.copy()
    
    # Weight core vs. supplemental tasks
    df_copy["task_weight"] = df_copy["Task Type"].map({"Core": 1.0, "Supplemental": 0.5}).fillna(1.0)
    
    # One-hot exposure indicators
    df_copy[f"{prefix}_E1"] = (df_copy[exposure_col] == "E1").astype(int)
    df_copy[f"{prefix}_E2"] = (df_copy[exposure_col] == "E2").astype(int)
    
    # Filter out rows with NaN exposure values
    df_copy = df_copy[df_copy[exposure_col].notna()]
    
    # Weighted shares per hierarchical level
    level_exposure = (
        df_copy.groupby([code_col, title_col], as_index=False)
        .apply(lambda x: pd.Series({
            f"{prefix}_E1_share": (x[f"{prefix}_E1"] * x["task_weight"]).sum() / x["task_weight"].sum(),
            f"{prefix}_E2_share": (x[f"{prefix}_E2"] * x["task_weight"]).sum() / x["task_weight"].sum(),
        }), include_groups=False)
        .reset_index()
    )
    
    # Composite measures
    level_exposure[f"{prefix}_alpha"] = level_exposure[f"{prefix}_E1_share"]
    level_exposure[f"{prefix}_beta"] = level_exposure[f"{prefix}_E1_share"] + 0.5 * level_exposure[f"{prefix}_E2_share"]
    level_exposure[f"{prefix}_gamma"] = level_exposure[f"{prefix}_E1_share"] + level_exposure[f"{prefix}_E2_share"]
    
    # Drop E2_share and E1_share as they're not needed in final output
    level_exposure = level_exposure.drop(columns=[f'{prefix}_E2_share', f'{prefix}_E1_share'])
    
    # Rename to standard column names
    level_exposure = level_exposure.rename(columns={
        code_col: 'Code',
        title_col: 'Title'
    })
    
    return level_exposure


# Define hierarchical levels
hierarchical_levels = [
    ('Major_Group_Code', 'Major_Group_Title', 'Major Group'),
    ('Minor_Group_Code', 'Minor_Group_Title', 'Minor Group'),
    ('Broad_Occupation_Code', 'Broad_Occupation_Title', 'Broad Occupation'),
    ('Detailed_Occupation_Code', 'Detailed_Occupation_Title', 'Detailed Occupation')
]

# Compute exposure scores for each hierarchical level
for code_col, title_col, level_name in hierarchical_levels:
    # Compute GPT-4 exposure
    gpt4_exp = compute_hierarchical_exposure(ONET, code_col, title_col, 'gpt4_exposure', 'gpt4')
    
    # Compute human exposure
    human_exp = compute_hierarchical_exposure(ONET, code_col, title_col, 'human_labels', 'human')

    # Merge GPT-4 and human exposure scores
    combined_exp = pd.merge(gpt4_exp, human_exp, on=['Code', 'Title'])
    combined_exp['Level'] = level_name
    
    # Reorder columns
    combined_exp = combined_exp[['Level', 'Code', 'Title', 
                                  'gpt4_alpha', 'gpt4_beta', 'gpt4_gamma',
                                  'human_alpha', 'human_beta', 'human_gamma']]
    
    # Save to CSV
    filename = level_name.lower().replace(' ', '_')
    # combined_exp.to_csv(f'{output_data_path}/exposure_scores_{filename}.csv', index=False)
    # print(f"Saved {level_name} exposure scores: {len(combined_exp)} groups")


In [9]:
# Create a combined hierarchical exposure dataset (all 5 levels in one file)
all_exposure_levels = []

# Add hierarchical-level exposure scores in order from most aggregated to most granular
for code_col, title_col, level_name in hierarchical_levels:
    gpt4_exp = compute_hierarchical_exposure(ONET, code_col, title_col, 'gpt4_exposure', 'gpt4')
    human_exp = compute_hierarchical_exposure(ONET, code_col, title_col, 'human_labels', 'human')
    
    combined_exp = pd.merge(gpt4_exp, human_exp, on=['Code', 'Title'])
    combined_exp['Level'] = level_name
    combined_exp = combined_exp[['Level', 'Code', 'Title',
                                  'gpt4_alpha', 'gpt4_beta', 'gpt4_gamma',
                                  'human_alpha', 'human_beta', 'human_gamma']]
    
    all_exposure_levels.append(combined_exp)

# Add occupation-level exposure scores (most granular)
occupation_exposure = exposure_scores_occupation.copy()
occupation_exposure['Level'] = 'Occupation'
occupation_exposure = occupation_exposure.rename(columns={'O*NET-SOC Code': 'Code', 'Occupation Title': 'Title'})
occupation_exposure = occupation_exposure[['Level', 'Code', 'Title',
                                           'gpt4_alpha', 'gpt4_beta', 'gpt4_gamma',
                                           'human_alpha', 'human_beta', 'human_gamma']]
all_exposure_levels.append(occupation_exposure)

# Combine all 5 levels
exposure_scores_all_levels = pd.concat(all_exposure_levels, ignore_index=True)

# Save combined dataset
# exposure_scores_all_levels.to_csv(f'{output_data_path}/exposure_scores_all_levels.csv', index=False)

print(f"\nCombined Exposure Dataset (All 5 Levels):")
print(f"  Total rows: {len(exposure_scores_all_levels):,}")
print(f"\nBreakdown by level:")
for level_name in ['Major Group', 'Minor Group', 'Broad Occupation', 'Detailed Occupation', 'Occupation']:
    count = len(exposure_scores_all_levels[exposure_scores_all_levels['Level'] == level_name])
    print(f"  {level_name}: {count}")

exposure_scores_all_levels.head(5)


Combined Exposure Dataset (All 5 Levels):
  Total rows: 2,179

Breakdown by level:
  Major Group: 22
  Minor Group: 95
  Broad Occupation: 430
  Detailed Occupation: 759
  Occupation: 873


Unnamed: 0,Level,Code,Title,gpt4_alpha,gpt4_beta,gpt4_gamma,human_alpha,human_beta,human_gamma
0,Major Group,11-0000,Management Occupations,0.07,0.47,0.86,0.19,0.44,0.68
1,Major Group,13-0000,Business and Financial Operations Occupations,0.15,0.54,0.92,0.27,0.53,0.8
2,Major Group,15-0000,Computer and Mathematical Occupations,0.5,0.73,0.97,0.29,0.6,0.91
3,Major Group,17-0000,Architecture and Engineering Occupations,0.17,0.5,0.83,0.2,0.43,0.67
4,Major Group,19-0000,"Life, Physical, and Social Science Occupations",0.12,0.46,0.81,0.28,0.5,0.72


In [10]:
# Merge Task and DWA counts with exposure scores
hierarchical_task_counts = pd.read_csv(f'{output_data_path}/hierarchical_task_counts.csv')

# Merge on Level, Code, and Title
exposure_scores_with_counts = exposure_scores_all_levels.merge(
    hierarchical_task_counts[['Level', 'Code', 'Title', 'num_unique_tasks', 'num_unique_dwas']],
    on=['Level', 'Code', 'Title'], 
    how='left'
)

# Check merge statistics
print(f"\nMerge Statistics:")
print(f"  Rows before merge (exposure scores): {len(exposure_scores_all_levels):,}")
print(f"  Rows in task counts dataset: {len(hierarchical_task_counts):,}")
print(f"  Rows after merge: {len(exposure_scores_with_counts):,}")
print(f"  Rows with missing task counts: {exposure_scores_with_counts['num_unique_tasks'].isna().sum():,}")
print(f"  Rows with missing DWA counts: {exposure_scores_with_counts['num_unique_dwas'].isna().sum():,}")

# Show rows with missing values
if exposure_scores_with_counts['num_unique_tasks'].isna().sum() > 0:
    print(f"\nRows with missing task/DWA counts:")
    missing_rows = exposure_scores_with_counts[exposure_scores_with_counts['num_unique_tasks'].isna()]
    print(missing_rows[['Level', 'Code', 'Title']])
    
    # Drop rows with missing task counts
    exposure_scores_with_counts = exposure_scores_with_counts.dropna(subset=['num_unique_tasks', 'num_unique_dwas'])
    print(f"\nAfter dropping rows with missing counts: {len(exposure_scores_with_counts):,} rows remaining")
else:
    print(f"\n✓ All rows successfully merged!")

# Convert count columns to integers
exposure_scores_with_counts['num_unique_tasks'] = exposure_scores_with_counts['num_unique_tasks'].astype(int)
exposure_scores_with_counts['num_unique_dwas'] = exposure_scores_with_counts['num_unique_dwas'].astype(int)

# Save the combined dataset
exposure_scores_with_counts.to_csv(f'{output_data_path}/exposure_scores_with_counts.csv', index=False)

print(f"\nCombined Exposure + Task Counts Dataset:")
print(f"  Total rows: {len(exposure_scores_with_counts):,}")
print(f"\nFirst 10 rows:")
exposure_scores_with_counts.head(10)


Merge Statistics:
  Rows before merge (exposure scores): 2,179
  Rows in task counts dataset: 2,179
  Rows after merge: 2,179
  Rows with missing task counts: 2
  Rows with missing DWA counts: 2

Rows with missing task/DWA counts:
                    Level        Code                                Title
911   Detailed Occupation     33-3051  Police and Sheriffs Patrol Officers
1766           Occupation  33-3051.00  Police and Sheriffs Patrol Officers

After dropping rows with missing counts: 2,177 rows remaining

Combined Exposure + Task Counts Dataset:
  Total rows: 2,177

First 10 rows:


Unnamed: 0,Level,Code,Title,gpt4_alpha,gpt4_beta,gpt4_gamma,human_alpha,human_beta,human_gamma,num_unique_tasks,num_unique_dwas
0,Major Group,11-0000,Management Occupations,0.07,0.47,0.86,0.19,0.44,0.68,1063,188
1,Major Group,13-0000,Business and Financial Operations Occupations,0.15,0.54,0.92,0.27,0.53,0.8,871,167
2,Major Group,15-0000,Computer and Mathematical Occupations,0.5,0.73,0.97,0.29,0.6,0.91,606,118
3,Major Group,17-0000,Architecture and Engineering Occupations,0.17,0.5,0.83,0.2,0.43,0.67,1157,172
4,Major Group,19-0000,"Life, Physical, and Social Science Occupations",0.12,0.46,0.81,0.28,0.5,0.72,1168,227
5,Major Group,21-0000,Community and Social Service Occupations,0.11,0.32,0.53,0.22,0.35,0.49,296,69
6,Major Group,23-0000,Legal Occupations,0.1,0.49,0.87,0.1,0.41,0.72,125,26
7,Major Group,25-0000,Educational Instruction and Library Occupations,0.16,0.43,0.69,0.22,0.42,0.62,1494,124
8,Major Group,27-0000,"Arts, Design, Entertainment, Sports, and Media...",0.2,0.48,0.76,0.2,0.38,0.56,764,123
9,Major Group,29-0000,Healthcare Practitioners and Technical Occupat...,0.08,0.34,0.6,0.13,0.29,0.45,1541,198


In [11]:
# Combined plots: Create three figures (alpha, beta, gamma) with both Tasks and DWAs 
# in a single comprehensive layout for each exposure measure

# Define the levels in order from most aggregated to most granular
levels_to_plot = ['Major Group', 'Minor Group', 'Broad Occupation', 'Detailed Occupation', 'Occupation']

# Define exposure measures to use
exposure_measures = ['alpha', 'beta', 'gamma']

# Create one comprehensive plot for each exposure measure
for exposure_measure in exposure_measures:
    # Create figure with 5x2 subplots (5 levels × 2 exposure types)
    fig, axes = plt.subplots(5, 2, figsize=(18, 22))
    
    fig.suptitle(f'Tasks and DWAs vs {exposure_measure.capitalize()} Exposure Across Hierarchical Levels\n\n\n', 
                fontsize=18, fontweight='bold', y=0.995)
    
    # Calculate global x-axis limits for this exposure measure to ensure consistent scaling
    all_gpt4_values = []
    all_human_values = []
    
    for level_name in levels_to_plot:
        level_data = exposure_scores_with_counts[exposure_scores_with_counts['Level'] == level_name].copy()
        gpt4_vals = level_data[f'gpt4_{exposure_measure}'].values
        human_vals = level_data[f'human_{exposure_measure}'].values
        
        # Remove NaN values for range calculation
        gpt4_clean = gpt4_vals[~np.isnan(gpt4_vals)]
        human_clean = human_vals[~np.isnan(human_vals)]
        
        if len(gpt4_clean) > 0:
            all_gpt4_values.extend(gpt4_clean)
        if len(human_clean) > 0:
            all_human_values.extend(human_clean)
    
    # Set consistent x-axis limits with margins for a clean 0-1 range
    x_margin = 0.05  # 5% margin on each side
    gpt4_xlim = (-x_margin, 1 + x_margin)
    human_xlim = (-x_margin, 1 + x_margin)
    
    # Loop through each hierarchical level
    for level_idx, level_name in enumerate(levels_to_plot):
        # Filter data for current level
        level_data = exposure_scores_with_counts[exposure_scores_with_counts['Level'] == level_name].copy()
        n_obs = len(level_data)
        
        # Left column: GPT-4 exposure
        ax_gpt4 = axes[level_idx, 0]
        
        # Get data for both tasks and DWAs
        x_gpt4 = level_data[f'gpt4_{exposure_measure}'].values
        y_tasks = level_data['num_unique_tasks'].values
        y_dwas = level_data['num_unique_dwas'].values
        
        # Plot tasks (blue circles)
        ax_gpt4.scatter(x_gpt4, y_tasks, alpha=0.7, color='steelblue', edgecolors='white', 
                       linewidth=0.5, s=60, label='Tasks')
        
        # Plot DWAs (red triangles)
        ax_gpt4.scatter(x_gpt4, y_dwas, alpha=0.7, color='indianred', edgecolors='white', 
                       linewidth=0.5, s=60, marker='^', label='DWAs')
        
        # Fit and plot regression lines
        if len(x_gpt4) > 1 and not np.all(np.isnan(x_gpt4)):
            # Tasks regression line
            valid_idx_tasks = ~(np.isnan(x_gpt4) | np.isnan(y_tasks))
            if np.sum(valid_idx_tasks) > 1:
                x_clean = x_gpt4[valid_idx_tasks]
                y_clean = y_tasks[valid_idx_tasks]
                slope_tasks, intercept_tasks = np.polyfit(x_clean, y_clean, 1)
                x_fit = np.linspace(x_clean.min(), x_clean.max(), 100)
                y_fit_tasks = slope_tasks * x_fit + intercept_tasks
                ax_gpt4.plot(x_fit, y_fit_tasks, color='steelblue', linewidth=2, alpha=0.8, 
                           linestyle='-', label=f'Tasks slope: {slope_tasks:.2f}')
            
            # DWAs regression line
            valid_idx_dwas = ~(np.isnan(x_gpt4) | np.isnan(y_dwas))
            if np.sum(valid_idx_dwas) > 1:
                x_clean = x_gpt4[valid_idx_dwas]
                y_clean = y_dwas[valid_idx_dwas]
                slope_dwas, intercept_dwas = np.polyfit(x_clean, y_clean, 1)
                x_fit = np.linspace(x_clean.min(), x_clean.max(), 100)
                y_fit_dwas = slope_dwas * x_fit + intercept_dwas
                ax_gpt4.plot(x_fit, y_fit_dwas, color='indianred', linewidth=2, alpha=0.8, 
                           linestyle='--', label=f'DWAs slope: {slope_dwas:.2f}')
        
        ax_gpt4.set_xlabel(f'GPT-4 {exposure_measure.capitalize()} Exposure', fontsize=12)
        ax_gpt4.set_ylabel('Count', fontsize=12)
        ax_gpt4.set_title(f'{level_name} (n={n_obs}): GPT-4 Exposure', fontsize=13, fontweight='bold')
        ax_gpt4.grid(True, alpha=0.3)
        ax_gpt4.legend(fontsize=10, loc='upper right')
        ax_gpt4.set_xlim(gpt4_xlim)  # Set consistent x-axis limits (0-1 with margins)
        
        # Right column: Human exposure
        ax_human = axes[level_idx, 1]
        
        # Get data for both tasks and DWAs
        x_human = level_data[f'human_{exposure_measure}'].values
        
        # Plot tasks (blue circles)
        ax_human.scatter(x_human, y_tasks, alpha=0.7, color='steelblue', edgecolors='white', 
                        linewidth=0.5, s=60, label='Tasks')
        
        # Plot DWAs (red triangles)
        ax_human.scatter(x_human, y_dwas, alpha=0.7, color='indianred', edgecolors='white', 
                        linewidth=0.5, s=60, marker='^', label='DWAs')
        
        # Fit and plot regression lines
        if len(x_human) > 1 and not np.all(np.isnan(x_human)):
            # Tasks regression line
            valid_idx_tasks = ~(np.isnan(x_human) | np.isnan(y_tasks))
            if np.sum(valid_idx_tasks) > 1:
                x_clean = x_human[valid_idx_tasks]
                y_clean = y_tasks[valid_idx_tasks]
                slope_tasks_h, intercept_tasks_h = np.polyfit(x_clean, y_clean, 1)
                x_fit_h = np.linspace(x_clean.min(), x_clean.max(), 100)
                y_fit_tasks_h = slope_tasks_h * x_fit_h + intercept_tasks_h
                ax_human.plot(x_fit_h, y_fit_tasks_h, color='steelblue', linewidth=2, alpha=0.8, 
                            linestyle='-', label=f'Tasks slope: {slope_tasks_h:.2f}')
            
            # DWAs regression line
            valid_idx_dwas = ~(np.isnan(x_human) | np.isnan(y_dwas))
            if np.sum(valid_idx_dwas) > 1:
                x_clean = x_human[valid_idx_dwas]
                y_clean = y_dwas[valid_idx_dwas]
                slope_dwas_h, intercept_dwas_h = np.polyfit(x_clean, y_clean, 1)
                x_fit_h = np.linspace(x_clean.min(), x_clean.max(), 100)
                y_fit_dwas_h = slope_dwas_h * x_fit_h + intercept_dwas_h
                ax_human.plot(x_fit_h, y_fit_dwas_h, color='indianred', linewidth=2, alpha=0.8, 
                            linestyle='--', label=f'DWAs slope: {slope_dwas_h:.2f}')
        
        ax_human.set_xlabel(f'Human {exposure_measure.capitalize()} Exposure', fontsize=12)
        ax_human.set_ylabel('Count', fontsize=12)
        ax_human.set_title(f'{level_name} (n={n_obs}): Human Exposure', fontsize=13, fontweight='bold')
        ax_human.grid(True, alpha=0.3)
        ax_human.legend(fontsize=10, loc='upper right')
        ax_human.set_xlim(human_xlim)  # Set consistent x-axis limits (0-1 with margins)
    
    # Add main column headers
    axes[0, 0].text(0.5, 1.12, 'GPT-4 Exposure', transform=axes[0, 0].transAxes, 
                   ha='center', va='bottom', fontsize=16, fontweight='bold')
    axes[0, 1].text(0.5, 1.12, 'Human Exposure', transform=axes[0, 1].transAxes, 
                   ha='center', va='bottom', fontsize=16, fontweight='bold')
    
    plt.tight_layout()
    plt.subplots_adjust(top=0.96, hspace=0.35, wspace=0.25)
    
    # Save figure
    filename = f'tasks_and_dwas_vs_exposure_{exposure_measure}.png'
    plt.savefig(f'{output_plot_path}/{filename}', dpi=300, bbox_inches='tight')
    plt.close()

print(f"✓ Saved {len(exposure_measures)} comprehensive plots to: {output_plot_path}/")

✓ Saved 3 comprehensive plots to: ../writeup/plots/taskcounts_aiexposure/


In [12]:
# Regression analysis: Number of tasks/DWAs on GPT-4 and Human exposure measures
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Initialize list to store regression results
regression_results = []

# Define the levels and exposure measures (same as in plots)
levels_to_analyze = ['Major Group', 'Minor Group', 'Broad Occupation', 'Detailed Occupation', 'Occupation']
exposure_measures = ['alpha', 'beta', 'gamma']
dependent_vars = ['num_unique_tasks', 'num_unique_dwas']

# Function to perform regression and extract statistics
def run_regression(x, y, x_name, y_name, level_name, exposure_measure):
    """Run regression and return coefficient, std error, t-stat, p-value, R²"""
    # Remove any NaN values
    valid_idx = ~(np.isnan(x) | np.isnan(y))
    x_clean = x[valid_idx]
    y_clean = y[valid_idx]
    
    if len(x_clean) < 2:
        return None
    
    # Fit regression
    reg = LinearRegression().fit(x_clean.reshape(-1, 1), y_clean)
    y_pred = reg.predict(x_clean.reshape(-1, 1))
    
    # Calculate statistics
    n = len(x_clean)
    coeff = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = r2_score(y_clean, y_pred)
    
    # Calculate standard error and t-statistic
    residuals = y_clean - y_pred
    mse = np.sum(residuals**2) / (n - 2)  # degrees of freedom: n - 2 for simple regression
    
    # Standard error of coefficient
    x_mean = np.mean(x_clean)
    ss_x = np.sum((x_clean - x_mean)**2)
    se_coeff = np.sqrt(mse / ss_x)
    
    # t-statistic and p-value
    t_stat = coeff / se_coeff
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), n - 2))
    
    return {
        'Level': level_name,
        'Exposure_Type': x_name.split('_')[0].upper(),  # GPT4 or HUMAN
        'Exposure_Measure': exposure_measure.capitalize(),
        'Dependent_Variable': 'Tasks' if 'tasks' in y_name else 'DWAs',
        'N_Observations': n,
        'Coefficient': coeff,
        'Std_Error': se_coeff,
        'T_Statistic': t_stat,
        'P_Value': p_value,
        'R_Squared': r_squared,
        'Intercept': intercept
    }

# Run regressions for each combination
for level_name in levels_to_analyze:
    # Filter data for current level
    level_data = exposure_scores_with_counts[exposure_scores_with_counts['Level'] == level_name].copy()
    
    for exposure_measure in exposure_measures:
        for dep_var in dependent_vars:
            # GPT-4 regression
            x_gpt4 = level_data[f'gpt4_{exposure_measure}'].values
            y = level_data[dep_var].values
            
            result_gpt4 = run_regression(x_gpt4, y, f'gpt4_{exposure_measure}', dep_var, 
                                       level_name, exposure_measure)
            if result_gpt4:
                regression_results.append(result_gpt4)
            
            # Human regression
            x_human = level_data[f'human_{exposure_measure}'].values
            
            result_human = run_regression(x_human, y, f'human_{exposure_measure}', dep_var, 
                                        level_name, exposure_measure)
            if result_human:
                regression_results.append(result_human)

# Convert results to DataFrame
regression_df = pd.DataFrame(regression_results)

# Format the results table for better readability
regression_display = regression_df.copy()
regression_display['Coefficient'] = regression_display['Coefficient'].round(4)
regression_display['Std_Error'] = regression_display['Std_Error'].round(4)
regression_display['T_Statistic'] = regression_display['T_Statistic'].round(3)
regression_display['P_Value'] = regression_display['P_Value'].round(4)
regression_display['R_Squared'] = regression_display['R_Squared'].round(4)

# Create significance stars
def add_significance_stars(p_val):
    if p_val < 0.001:
        return "***"
    elif p_val < 0.01:
        return "**"
    elif p_val < 0.05:
        return "*"
    elif p_val < 0.1:
        return "."
    else:
        return ""

regression_display['Significance'] = regression_display['P_Value'].apply(add_significance_stars)

# Sort by level order for better presentation
level_order = ['Major Group', 'Minor Group', 'Broad Occupation', 'Detailed Occupation', 'Occupation']
regression_display['Level_Order'] = regression_display['Level'].map({level: i for i, level in enumerate(level_order)})
regression_display = regression_display.sort_values(['Level_Order', 'Dependent_Variable', 'Exposure_Type', 'Exposure_Measure'])

# Display key columns
display_cols = ['Level', 'Exposure_Type', 'Exposure_Measure', 'Dependent_Variable', 
                'N_Observations', 'Coefficient', 'Std_Error', 'T_Statistic', 'P_Value', 'Significance', 'R_Squared']

pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

print("REGRESSION RESULTS")
print("(Significance levels: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1)")
print(regression_display[display_cols].to_string(index=False))

# Save results to CSV
regression_df.to_csv(f'{output_data_path}/regression_tasks_dwas_exposure.csv', index=False)
print(f"\n✓ Regression results saved to: {output_data_path}/regression_tasks_dwas_exposure.csv")

# Reset pandas display options
pd.reset_option('display.max_rows')
pd.reset_option('display.width')
pd.reset_option('display.max_columns')

REGRESSION RESULTS
(Significance levels: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1)
              Level Exposure_Type Exposure_Measure Dependent_Variable  N_Observations  Coefficient  Std_Error  T_Statistic  P_Value Significance  R_Squared
        Major Group          GPT4            Alpha               DWAs              22       -24.79     132.70        -0.19     0.85                    0.00
        Major Group          GPT4             Beta               DWAs              22        -5.35      78.00        -0.07     0.95                    0.00
        Major Group          GPT4            Gamma               DWAs              22        -0.83      48.45        -0.02     0.99                    0.00
        Major Group         HUMAN            Alpha               DWAs              22        13.95     143.10         0.10     0.92                    0.00
        Major Group         HUMAN             Beta               DWAs              22        -3.67      82.87        -0.04     0.97     

<br>

<br>

<br>