#### By: Peyman Shahidi
#### Created: Oct 29, 2025
#### Last Edit: Nov 2, 2025

<br>

In [72]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [73]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
# Modify the output path accordingly
output_data_path = f'{input_data_path}/computed_objects/BLS_ONET_matchedEmpShares'
output_plot_path = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares"
output_plot_path_by_BLS_sector = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares/by_BLS_sector"
output_plot_path_by_ONET_level = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares/by_ONET_level"
output_plot_path_by_weighting_scheme = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares/by_weighting_scheme"
output_plot_path_by_dependent_var = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares/by_dependent_var"

# Toggle: if True, randomly reassign occ_totalEmpShare weights in the merged master_df
# during the merge_industry_employment_shares step. Set to False for default behavior.
randomize_occ_weights = True


In [74]:
import os
for path in [output_data_path, output_plot_path, 
             output_plot_path_by_BLS_sector, output_plot_path_by_ONET_level, 
             output_plot_path_by_weighting_scheme, output_plot_path_by_dependent_var]:
    if not os.path.exists(path):
        os.makedirs(path)

In [75]:
# Read O*NET data
ONET = pd.read_csv(f'{input_data_path}/computed_objects/ONET_cleaned_tasks.csv')

# Drop  columns to avoid double counting
# Note: In ~4k instances, the same task is mapped to multiple DWAs
ONET = ONET.drop(columns=['DWA ID', 'DWA Title'])

# Remove duplicate rows
rows_before = len(ONET)
print(f"Number of rows before removing duplicates: {rows_before:,}")
ONET = ONET.drop_duplicates().reset_index(drop=True)
rows_after = len(ONET)
print(f"Number of rows after removing duplicates: {rows_after:,}")
print(f"Duplicates removed: {rows_before - rows_after}")

# Print length of dataset
print(f"Number of rows in ONET dataset: {len(ONET):,}")

ONET.head(5)

Number of rows before removing duplicates: 22,310
Number of rows after removing duplicates: 17,953
Duplicates removed: 4357
Number of rows in ONET dataset: 17,953


Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Type,Job Zone,Task_Time_Percentage,Hourly_Mean_Wage,Hourly_P10_Wage,Hourly_P25_Wage,...,Relevance,Base_SOC_Code,Major_Group_Code,Major_Group_Title,Minor_Group_Code,Minor_Group_Title,Broad_Occupation_Code,Broad_Occupation_Title,Detailed_Occupation_Code,Detailed_Occupation_Title
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,5,9.62,124.47,38.46,62.9,...,94.19,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",Core,5,9.49,124.47,38.46,62.9,...,98.79,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,5,9.22,124.47,38.46,62.9,...,100.0,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,5,10.26,124.47,38.46,62.9,...,95.84,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,5,1.46,124.47,38.46,62.9,...,90.47,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives


In [76]:
# Load GPTs are GPTs full label dataset
gpts_full_labels = pd.read_csv(f'{input_data_path}/GPTs-are-GPTs-main/data/full_labelset.tsv', sep="\t")

# Keep relevant columns only
gpts_full_labels = gpts_full_labels[['O*NET-SOC Code', 'Task ID', 'Task', 'Task Type', 'Title', 'gpt4_exposure', 'human_labels']]

# Convert Task ID to integer
gpts_full_labels['Task ID'] = gpts_full_labels['Task ID'].astype(int)

# Remove apostrophes for consistency
gpts_full_labels = gpts_full_labels.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)

# Rename columns
gpts_full_labels = gpts_full_labels.rename(columns={
    'Task': 'Task Title',
    'Title': 'Occupation Title'
})

# Print length of dataset
print(f"Number of rows in GPTs full labels dataset: {len(gpts_full_labels):,}")


gpts_full_labels.head(5)

Number of rows in GPTs full labels dataset: 19,265


Unnamed: 0,O*NET-SOC Code,Task ID,Task Title,Task Type,Occupation Title,gpt4_exposure,human_labels
0,11-1011.00,8823,Direct or coordinate an organizations financia...,Core,Chief Executives,E2,E0
1,11-1011.00,8831,Appoint department heads or managers and assig...,Core,Chief Executives,E0,E0
2,11-1011.00,8825,Analyze operations to evaluate performance of ...,Core,Chief Executives,E2,E2
3,11-1011.00,8826,"Direct, plan, or implement policies, objective...",Core,Chief Executives,E2,E0
4,11-1011.00,8827,"Prepare budgets for approval, including those ...",Core,Chief Executives,E2,E2


In [77]:
# Merge with ONET dataset to get hierarchical codes and titles
ONET = ONET.merge(gpts_full_labels, on=['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'Task Title', 'Task Type'], how='left')

# Check how many tasks were not matched
unmatched_tasks = ONET[ONET['gpt4_exposure'].isna()]
print(f"Number of unmatched tasks: {len(unmatched_tasks):,}")

Number of unmatched tasks: 791


In [78]:
# Merge with Anthropic exposure data
anthropic_exposure = pd.read_csv(f'{input_data_path}/Anthropic_EconomicIndex/automation_vs_augmentation_by_task.csv')

# Remove if all entries are filtered:
anthropic_exposure = anthropic_exposure[anthropic_exposure['filtered'] != 1].reset_index(drop=True)

# Create new columns:
# Sum feedback loop and directive into Automation
# Sum validation, iteration, and learning into Augmentation
anthropic_exposure['automation'] = anthropic_exposure.apply(lambda row: row['feedback_loop'] + row['directive'], axis=1)
anthropic_exposure['augmentation'] = anthropic_exposure.apply(lambda row: row['validation'] + row['task_iteration'] + row['learning'], axis=1)

# Assign labels: take the max of automation, augmentation, manual and assign the corresponding label
def assign_label(row):
    max_value = max(row['automation'], row['augmentation'])
    if max_value == row['automation']:
        return 'Automation'
    elif max_value == row['augmentation']:
        return 'Augmentation'

anthropic_exposure['label'] = anthropic_exposure.apply(assign_label, axis=1)

# Filter to only keep the relevant columns
anthropic_exposure = anthropic_exposure[['task_name', 'automation', 'augmentation', 'label']]

In [79]:
# Print number of unique tasks in ONET dataset
print(f"Number of unique tasks in ONET dataset: {ONET['Task Title'].nunique():,}")

# Print number of unique tasks in Anthropic exposure dataset
print(f"Number of unique tasks in Anthropic exposure dataset: {anthropic_exposure['task_name'].nunique():,}")


# Add normalized task title to ONET for merging
ONET["task_normalized"] = ONET["Task Title"].str.lower().str.strip()


# Merge ONET and Anthropic exposure data on the normalized task title
merged_data = pd.merge(ONET, anthropic_exposure[['task_name', 'automation', 'augmentation', 'label']], left_on="task_normalized", right_on="task_name", how="left")

# Fill the NaN values of the label column in merged dataset as Manual
merged_data['label'] = merged_data['label'].fillna('Manual')

# Print distribution after filling NaN values
print(f"\nDistribution of labels after filling NaN values with 'Manual':")
print(merged_data['label'].value_counts())
print(f"Total tasks: {len(merged_data):,}")

merged_data.head()

Number of unique tasks in ONET dataset: 16,913
Number of unique tasks in Anthropic exposure dataset: 2,298

Distribution of labels after filling NaN values with 'Manual':
label
Manual          15605
Augmentation     1626
Automation        722
Name: count, dtype: int64
Total tasks: 17,953


Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Type,Job Zone,Task_Time_Percentage,Hourly_Mean_Wage,Hourly_P10_Wage,Hourly_P25_Wage,...,Broad_Occupation_Title,Detailed_Occupation_Code,Detailed_Occupation_Title,gpt4_exposure,human_labels,task_normalized,task_name,automation,augmentation,label
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,5,9.62,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,,,direct or coordinate an organization's financi...,direct or coordinate an organization's financi...,0.35,0.57,Augmentation
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",Core,5,9.49,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,E0,E0,"confer with board members, organization offici...","confer with board members, organization offici...",0.25,0.61,Augmentation
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,5,9.22,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,E2,E2,analyze operations to evaluate performance of ...,analyze operations to evaluate performance of ...,0.31,0.66,Augmentation
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,5,10.26,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,E2,E0,"direct, plan, or implement policies, objective...",,,,Manual
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,5,1.46,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,E2,E2,"prepare budgets for approval, including those ...",,,,Manual


## Functions

In [None]:
def create_occupation_analysis(my_sector, my_onet_level,
                               merged_data, onet_occupation_code_var, onet_occupation_title_var):
    # Create occupation-level analysis for scatter plots
    # Group by occupation and calculate label fractions and task counts
    occupation_stats = []

    for (soc_code, occ_title), group in merged_data.groupby([onet_occupation_code_var, onet_occupation_title_var]):
        num_tasks = group['Task ID'].nunique()
        # num_occupations = group[onet_occupation_code_var].nunique()
        total_tasks = len(group)
        
        manual_fraction = (group['label'] == 'Manual').sum() / total_tasks
        augmentation_fraction = (group['label'] == 'Augmentation').sum() / total_tasks  
        automation_fraction = (group['label'] == 'Automation').sum() / total_tasks
        ai_fraction = augmentation_fraction + automation_fraction
        gpt4_E0_fraction = (group['gpt4_exposure'] == 'E0').sum() / total_tasks
        gpt4_E1_fraction = (group['gpt4_exposure'] == 'E1').sum() / total_tasks
        gpt4_E2_fraction = (group['gpt4_exposure'] == 'E2').sum() / total_tasks
        gpt4_aiExposure_fraction = gpt4_E1_fraction + gpt4_E2_fraction
        human_E0_fraction = (group['human_labels'] == 'E0').sum() / total_tasks
        human_E1_fraction = (group['human_labels'] == 'E1').sum() / total_tasks
        human_E2_fraction = (group['human_labels'] == 'E2').sum() / total_tasks
        human_aiExposure_fraction = human_E1_fraction + human_E2_fraction

        
        occupation_stats.append({
            f'{onet_occupation_code_var}': soc_code,
            f'{onet_occupation_title_var}': occ_title,
            'num_tasks': num_tasks,
            # 'num_occupations': num_occupations,
            'manual_fraction': manual_fraction,
            'ai_fraction': ai_fraction,
            'augmentation_fraction': augmentation_fraction,
            'automation_fraction': automation_fraction,
            'gpt4_E0_fraction': gpt4_E0_fraction,
            'gpt4_E1_fraction': gpt4_E1_fraction,
            'gpt4_E2_fraction': gpt4_E2_fraction,
            'gpt4_aiExposure_fraction': gpt4_aiExposure_fraction,
            'human_E0_fraction': human_E0_fraction,
            'human_E1_fraction': human_E1_fraction,
            'human_E2_fraction': human_E2_fraction,
            'human_aiExposure_fraction': human_aiExposure_fraction
        })

    occupation_analysis = pd.DataFrame(occupation_stats)

    return occupation_analysis



def merge_industry_employment_shares(seed, 
                                     my_sector, my_onet_level,
                                     dependent_var,
                                     onet_occupation_code_var, onet_occupation_title_var,
                                     weight_col,
                                     occupation_analysis):

    # Merge industry employment share weights for all NAICS sectors and create a master dataset
    bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/bls_{my_sector}_ONET{my_onet_level}_empShares.csv')

    # bls_sector_shares = bls_sector_shares[bls_sector_shares.TOT_EMP >= 50].reset_index(drop=True)

    # Ensure NAICS is string for consistent merging
    bls_sector_shares['NAICS'] = bls_sector_shares['NAICS'].astype(str)

    unique_sectors = bls_sector_shares['NAICS'].unique()
    # print(f"Found {len(unique_sectors)} unique NAICS sectors to process")

    master_dfs = []
    processed_sectors = []

    for sector_code in unique_sectors:
        sector_weights_df = bls_sector_shares[bls_sector_shares.NAICS == sector_code]

        # Merge occupation-level analysis with this sector's weights
        sector_occupation_analysis = occupation_analysis.merge(
            sector_weights_df[['NAICS', 'NAICS_TITLE', 'OCC_CODE', weight_col]],
            left_on=onet_occupation_code_var,
            right_on='OCC_CODE',
            how='inner'
        )
        # Drop the OCC_CODE column after merge
        sector_occupation_analysis = sector_occupation_analysis.drop(columns=['OCC_CODE'])

        # Append to master list
        master_dfs.append(sector_occupation_analysis)
        processed_sectors.append(sector_code)

    # Concatenate all sector-specific records into a master dataframe
    master_df = pd.concat(master_dfs, ignore_index=True)

    # Calculate sum of weight_col
    master_df = (
        master_df.groupby([onet_occupation_code_var, onet_occupation_title_var, dependent_var, 'num_tasks'], as_index=False)[weight_col]
        .sum(numeric_only=True)
    )
    # master_df = master_df[~master_df[onet_occupation_title_var].isin(['Fast Food and Counter Workers'])]
    # master_df = master_df[~master_df[onet_occupation_title_var].isin(['Retail Salespersons', 'Fast Food and Counter Workers', 'General and Operations Managers'])]
    # print(f"Aggregated to {len(master_df):,} unique occupations (summed over selected columns).")

    # Sort by weight_col descending
    master_df = master_df.sort_values(by=weight_col, ascending=False).reset_index(drop=True)

    # OPTIONAL: randomize (shuffle) the existing values in the selected weight column
    # If enabled, we take the existing values in `weight_col` and randomly permute
    # them across rows. This preserves the original values (no normalization or new
    # values), only their assignment to rows changes. The RNG uses
    # `randomize_weights_seed` for reproducibility when set.
    if globals().get('randomize_occ_weights', False) and seed > 0:
        rng = np.random.RandomState(seed)

        # Extract current values (may contain NaNs). We'll permute the full array.
        vals = master_df[weight_col].to_numpy()
        if len(vals) > 0:
            perm = rng.permutation(len(vals))
            shuffled_vals = vals[perm]
            master_df[weight_col] = shuffled_vals

    # Save master dataframe to CSV
    out_dir = f"{output_data_path}/BLS{my_sector}_ONET{my_onet_level}/{dependent_var}/"
    os.makedirs(out_dir, exist_ok=True)
    master_out = f"{out_dir}/BLS{my_sector}_ONET{my_onet_level}_taskExposureAIability_{weight_col}_{dependent_var}.csv"
    master_df.to_csv(master_out, index=False)


    return master_df



def plot_industry_count_distribution(my_sector, my_onet_level,
                                     master_df, onet_occupation_code_var, onet_occupation_title_var):

    # Count how many industries each occupation appears in
    occupation_industry_counts = master_df.groupby([onet_occupation_code_var, onet_occupation_title_var])['NAICS'].nunique().reset_index()
    occupation_industry_counts = occupation_industry_counts.rename(columns={'NAICS': 'num_industries'})
    occupation_industry_counts = occupation_industry_counts.sort_values(by=['num_industries', onet_occupation_code_var], ascending=[False, True]).reset_index(drop=True)

    # Plot histogram of number of industries per occupation
    plt.figure(figsize=(10, 6))
    plt.hist(occupation_industry_counts['num_industries'], bins=range(1, occupation_industry_counts['num_industries'].max() + 2), align='left', color='skyblue', edgecolor='black')
    plt.title('Distribution of Number of Industries per Occupation')
    plt.xlabel('Number of Industries')
    plt.ylabel('Number of Occupations')
    plt.savefig(f'{output_plot_path}/BLS{my_sector}_ONET{my_onet_level}_countDist.png', dpi=300)
    plt.close()



# Regression and Plot different weighting schemes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import os

def plot_weighted_regression_and_binned_scatter(my_sector, my_onet_level,
                                                dependent_var, dependent_var_title, dependent_var_save_name_prefix,
                                                master_df, weight_col,
                                                plot_title_suffix, plot_save_name_prefix, q=20):

    # ================================
    # DATA PREP
    # ================================
    if 'NAICS' not in master_df.columns or master_df['NAICS'].isna().all():
        master_df['NAICS'] = 'All_Industries'
    df = master_df[[dependent_var, 'num_tasks', weight_col, 'NAICS']].copy()
    df = df[df[dependent_var].notna() & df['num_tasks'].notna()].copy()

    # Coerce numeric
    df['num_tasks'] = pd.to_numeric(df['num_tasks'], errors='coerce')
    df[dependent_var] = pd.to_numeric(df[dependent_var], errors='coerce')
    df[weight_col] = pd.to_numeric(df[weight_col], errors='coerce')

    # Drop rows with missing core vars
    df = df.dropna(subset=['num_tasks', dependent_var, weight_col]).reset_index(drop=True)

    # ================================
    # REGRESSIONS (WLS + WLS with FE)
    # ================================
    model_wls = smf.wls(f'{dependent_var} ~ num_tasks', data=df, weights=df[weight_col]).fit(cov_type='HC3')
    # print(model_wls.summary())

    model_wls_fe = smf.wls(f'{dependent_var} ~ num_tasks + C(NAICS)', data=df, weights=df[weight_col]).fit(cov_type='HC3')
    # print(model_wls_fe.summary())

    # Save regression coefficient, std, and p-value for num_tasks for later inspection
    try:
        reg_out_dir = f"{output_data_path}/regressions"
        os.makedirs(reg_out_dir, exist_ok=True)

        rows = []
        for mname, m in [('WLS', model_wls), ('WLS_FE', model_wls_fe)]:
            if 'num_tasks' in m.params.index:
                coef = float(m.params['num_tasks'])
            else:
                coef = float('nan')
            try:
                se = float(m.bse['num_tasks'])
            except Exception:
                se = float('nan')
            try:
                pval = float(m.pvalues['num_tasks'])
            except Exception:
                pval = float('nan')
            try:
                nobs = int(m.nobs)
            except Exception:
                nobs = len(df)

            rows.append({
                'BLS_sector_level': my_sector,
                'ONET_level': my_onet_level,
                'dependent_var': dependent_var,
                'weight_col': weight_col,
                'plot_prefix': plot_save_name_prefix,
                'model': mname,
                'coef_num_tasks': coef,
                'std_err': se,
                'pvalue': pval,
                'n_obs': nobs
            })

        reg_df = pd.DataFrame(rows)
        out_file = f"{reg_out_dir}/reg_BLS{my_sector}_ONET{my_onet_level}_{plot_save_name_prefix}_{dependent_var}.csv"
        reg_df.to_csv(out_file, index=False)
        # print(f"Saved regression results to {out_file}")
    except Exception as e:
        print(f"Failed to save regression results: {e}")

    # ================================
    # BINNED SCATTER (weighted means within unweighted num_tasks bins)
    # ================================
    unique_vals = df['num_tasks'].nunique()
    q_use = min(q, unique_vals) if unique_vals > 1 else 1

    try:
        bins = pd.qcut(df['num_tasks'], q=q_use, duplicates='drop')
    except Exception:
        bins = pd.cut(df['num_tasks'], bins=q_use)

    def weighted_stats(g):
        w = g[weight_col].to_numpy()
        x = g['num_tasks'].to_numpy()
        y = g[dependent_var].to_numpy()
        sum_w = np.nansum(w)
        x_wmean = np.average(x, weights=w) if sum_w > 0 else np.nan
        y_wmean = np.average(y, weights=w) if sum_w > 0 else np.nan
        return pd.Series({
            'num_tasks_wmean': x_wmean,
            f'{dependent_var}_wmean': y_wmean,
            'sum_w': sum_w,
            'n': len(g)
        })

    binned = df.groupby(bins, observed=True).apply(weighted_stats).reset_index(drop=True)

    # # ================================
    # # PLOT
    # # ================================
    # plt.figure(figsize=(12, 8))

    # # Bubble area scaling by summed weights
    # s_min, s_max = 40, 900
    # max_w = binned['sum_w'].max()
    # if pd.notna(max_w) and max_w > 0:
    #     sizes = s_min + (s_max - s_min) * (binned['sum_w'] / max_w)
    # else:
    #     sizes = np.full(len(binned), s_min)

    # # Single color, no colorbar
    # plt.scatter(
    #     binned['num_tasks_wmean'], binned[f'{dependent_var}_wmean'],
    #     s=sizes, c='steelblue', edgecolor='black', linewidth=0.8, alpha=0.9
    # )

    # plt.xlabel('Number of Tasks in Occupation')
    # plt.ylabel(f'{dependent_var_title} in Occupation')
    # if dependent_var == 'aiFraction':
    #     plt.ylim(-0.02, 0.5)
    # else:
    #     plt.ylim(-0.02, 1.02)
    # plt.title(f'Bin Scatter: {dependent_var_title} vs. Number of Tasks in Occupation\n\n({plot_title_suffix})')

    # # ================================
    # # OVERLAY WLS LINE + 95% CI
    # # ================================
    # x_line = np.linspace(df['num_tasks'].min(), df['num_tasks'].max(), 200)
    # preds = model_wls.get_prediction(pd.DataFrame({'num_tasks': x_line}))
    # pred_df = preds.summary_frame(alpha=0.05)

    # slope_wls = float(model_wls.params['num_tasks'])
    # plt.plot(x_line, pred_df['mean'], color='red', lw=2, label=f'WLS fit (slope={slope_wls:.4f})')
    # plt.fill_between(x_line, pred_df['mean_ci_lower'], pred_df['mean_ci_upper'], color='red', alpha=0.18, label='95% CI')

    # # ================================
    # # OVERLAY FE LINE + 95% CI (average over NAICS categories)
    # # ================================
    # cats = pd.unique(df['NAICS'])
    # mean_lines, low_lines, up_lines = [], [], []
    # for g in cats:
    #     Xg = pd.DataFrame({'num_tasks': x_line, 'NAICS': g})
    #     if pd.api.types.is_categorical_dtype(df['NAICS']):
    #         Xg['NAICS'] = pd.Categorical([g]*len(x_line), categories=df['NAICS'].cat.categories)
    #     sf = model_wls_fe.get_prediction(Xg).summary_frame(alpha=0.05)
    #     mean_lines.append(sf['mean'].to_numpy())
    #     low_lines.append(sf['mean_ci_lower'].to_numpy())
    #     up_lines.append(sf['mean_ci_upper'].to_numpy())

    # mean_line = np.vstack(mean_lines).mean(axis=0)
    # low_band  = np.vstack(low_lines).mean(axis=0)
    # up_band   = np.vstack(up_lines).mean(axis=0)
    # slope_fe = float(model_wls_fe.params['num_tasks'])

    # plt.plot(x_line, mean_line, color='navy', lw=2, ls='--', label=f'FE fit (Avg over NAICS, slope={slope_fe:.4f})')
    # plt.fill_between(x_line, low_band, up_band, color='navy', alpha=0.12)

    # plt.grid(alpha=0.3)
    # plt.legend(loc='upper right')

    # ================================
    # SAVE / SHOW
    # ================================
    # out_file = f"{output_plot_path}/{plot_save_name_prefix}_BLS{my_sector}_ONET{my_onet_level}_aiFraction.png"

    # # Save in three different folders:
    # out_file = f"{output_plot_path_by_BLS_sector}/BLS{my_sector}_{plot_save_name_prefix}_ONET{my_onet_level}_{dependent_var_save_name_prefix}.png"
    # plt.savefig(out_file, bbox_inches='tight', dpi=300)
    # out_file = f"{output_plot_path_by_ONET_level}/ONET{my_onet_level}_{plot_save_name_prefix}_BLS{my_sector}_{dependent_var_save_name_prefix}.png"
    # plt.savefig(out_file, bbox_inches='tight', dpi=300)
    # out_file = f"{output_plot_path_by_weighting_scheme}/{plot_save_name_prefix}_BLS{my_sector}_ONET{my_onet_level}_{dependent_var_save_name_prefix}.png"
    # plt.savefig(out_file, bbox_inches='tight', dpi=300)
    # out_file = f"{output_plot_path_by_dependent_var}/{dependent_var_save_name_prefix}_{plot_save_name_prefix}_BLS{my_sector}_ONET{my_onet_level}.png"
    # plt.savefig(out_file, bbox_inches='tight', dpi=300)
    # plt.close()



def regress_exposure_on_AIability(my_sector, my_onet_level, 
                                  onet_occupation_code_var, onet_occupation_title_var,
                                  weight_col):
    # Read data
    exposure_df = pd.read_csv(f'{output_data_path}/BLS{my_sector}_ONET{my_onet_level}/human_E1_fraction/BLS{my_sector}_ONET{my_onet_level}_taskExposureAIability_{weight_col}_human_E1_fraction.csv')
    ai_ability_df = pd.read_csv(f'{output_data_path}/BLS{my_sector}_ONET{my_onet_level}/ai_fraction/BLS{my_sector}_ONET{my_onet_level}_taskExposureAIability_{weight_col}_ai_fraction.csv')

    # Merge on occupation code and title
    # print length before merge
    # print(f"Exposure data records: {len(exposure_df):,}, AI ability data records: {len(ai_ability_df):,}")
    merged_df = pd.merge(exposure_df, ai_ability_df,
                             on=[onet_occupation_code_var, onet_occupation_title_var, 'num_tasks', weight_col],
                             how='inner')
    # print(f"Merged exposure and AI ability data: {len(merged_df):,} records")

    # Save merged dataframe
    merged_df.to_csv(f'{output_data_path}/BLS{my_sector}_ONET{my_onet_level}/BLS{my_sector}_ONET{my_onet_level}_exposure_vs_AIability.csv', index=False)

    # Perform regression analysis
    model = smf.wls(f'ai_fraction ~ human_E1_fraction + num_tasks', data=merged_df, weights=merged_df[weight_col]).fit(cov_type='HC3')
    # print(model.summary())

    # Save regression results
    try:
        reg_out_dir = f"{output_data_path}/regression_BLS{my_sector}_ONET{my_onet_level}"
        os.makedirs(reg_out_dir, exist_ok=True)

        rows = []
        coef = float(model.params['human_E1_fraction']) if 'human_E1_fraction' in model.params.index else float('nan')
        try:
            se = float(model.bse['human_E1_fraction'])
        except Exception:
            se = float('nan')
        try:
            pval = float(model.pvalues['human_E1_fraction'])
        except Exception:
            pval = float('nan')
        try:
            nobs = int(model.nobs)
        except Exception:
            nobs = len(merged_df)

        rows.append({
            'BLS_sector_level': my_sector,
            'ONET_level': my_onet_level,
            'weight_col': weight_col,
            'model': 'WLS_exposure_on_AIability',
            'coef_human_E1_fraction': coef,
            'std_err': se,
            'pvalue': pval,
            'n_obs': nobs
        })
        reg_df = pd.DataFrame(rows)
        out_file = f"{reg_out_dir}/reg_BLS{my_sector}_ONET{my_onet_level}_exposure_on_AIability_{weight_col}.csv"
        reg_df.to_csv(out_file, index=False)
        # print(f"Saved regression results to {out_file}")
    except Exception as e:
        print(f"Failed to save regression results: {e}")


In [83]:
seed = 42
my_sector, my_onet_level = ['sector', 'detailed']
dependent_var_list = ['ai_fraction', 'human_E1_fraction']
onet_occupation_code_var, onet_occupation_title_var = ['Detailed_Occupation_Code', 'Detailed_Occupation_Title']
weight_col = 'occ_totalEmpShare'
occupation_analysis = create_occupation_analysis(my_sector, my_onet_level, merged_data, onet_occupation_code_var, onet_occupation_title_var)

weight_cols = ['occ_totalEmpShare',         # Weight each occupation by occupation's share of total employment (ignoring sector shares)
               'sectorEmpShare',            # Weight each occupation by its sector's share of total employment
               'occ_sectorEmpShare']        # Weight each occupation by its share of employment within its sector and weight sectors equally  

# Merge industry employment share weights for all NAICS sectors and create a master dataset
bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/bls_{my_sector}_ONET{my_onet_level}_empShares.csv')

# Ensure NAICS is string for consistent merging
bls_sector_shares['NAICS'] = bls_sector_shares['NAICS'].astype(str)


# occupation_analysis_desired_columns = [onet_occupation_code_var, onet_occupation_title_var, 'num_tasks']
# output_df = occupation_analysis[occupation_analysis_desired_columns].copy()
output_df = occupation_analysis.copy()

if 'occ_totalEmpShare' in weight_cols:
    bls_occ_totalEmpShares = bls_sector_shares[['OCC_CODE', 'TOT_EMP']].groupby('OCC_CODE').sum().reset_index()

    # Convert to % and change variable name
    bls_occ_totalEmpShares['TOT_EMP'] = bls_occ_totalEmpShares['TOT_EMP'] / bls_occ_totalEmpShares['TOT_EMP'].sum()
    bls_occ_totalEmpShares = bls_occ_totalEmpShares.rename(columns={'TOT_EMP': 'occ_totalEmpShare'})

    # Add weight column to output_df
    output_df = output_df.merge(bls_occ_totalEmpShares, left_on=onet_occupation_code_var, right_on=['OCC_CODE'], how='left')

if 'sectorEmpShare' in weight_cols:
    sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'TOT_EMP']].groupby(['NAICS', 'NAICS_TITLE']).sum('TOT_EMP')

    # Convert to % and change variable name
    sector_weights_df['TOT_EMP'] = sector_weights_df['TOT_EMP'] / sector_weights_df['TOT_EMP'].sum()
    sector_weights_df = sector_weights_df.rename(columns={'TOT_EMP': 'sectorEmpShare'})

    # Merge back sector weights to bls dataset to get sector-by-sector occupation data with sector weights
    bls_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE']].merge(sector_weights_df, on='NAICS', how='left')

    
    # Aggregate sector weights across occupations
    bls_sector_weights_occupation_df = bls_sector_weights_df.groupby('OCC_CODE').sum('sectorEmpShare')

    # Add weight column to output_df
    output_df = output_df.merge(bls_sector_weights_occupation_df, left_on=onet_occupation_code_var, right_on=['OCC_CODE'], how='left')

if 'occ_sectorEmpShare' in weight_cols:
    within_sector_weights_df = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP']].copy()
    within_sector_weights_df['occ_sectorEmpShare'] = within_sector_weights_df['TOT_EMP'] / within_sector_weights_df.groupby(['NAICS', 'NAICS_TITLE'])['TOT_EMP'].transform('sum')

    # Calculate sum over all sectors
    within_sector_weights = within_sector_weights_df[['OCC_CODE', 'OCC_TITLE', 'occ_sectorEmpShare']].groupby(['OCC_CODE', 'OCC_TITLE']).sum()

    # Add weight column to output_df
    output_df = output_df.merge(within_sector_weights, left_on=onet_occupation_code_var, right_on=['OCC_CODE'], how='left')


# Drop the 'OCC_CODE' column
output_df = output_df.drop(columns='OCC_CODE')

# Save master dataframe to CSV
output_df.to_csv(f'{output_data_path}/BLS{my_sector}_ONET{my_onet_level}.csv', index=False)


In [18]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

In [29]:
bls_sector_levels = ['sector']#, '3-digit', '4-digit', '5-digit', '6-digit']

onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']

weight_cols = ['occ_totalEmpShare',         # Weight each occupation by occupation's share of total employment (ignoring sector shares)
               'sectorEmpShare',            # Weight each occupation by its sector's share of total employment
               'occ_sectorEmpShare']        # Weight each occupation by its share of employment within its sector and weight sectors equally  
plot_title_suffix_list = ['Weighted by Occupation Share of Total Employment',
                          'Weighted by Sector Share of Total Employment',
                          'Weighted by Occupation Employment Share of Sector']
plot_save_name_prefix_list = ['occupationEmpShareWeights',
                              'sectorEmpShareWeights',
                              'occupationEmpShareWithinSectorWeights']

dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']
dependent_var_title_list = ['Fraction of AI Tasks (Anthropic)', r'Fraction of Human $\alpha$ Exposure (Eloundou et al.)']#, r'Fraction of Human \gamma AI Exposure (Eloundou et al.)']#, r'Fraction of GPT-4 $\alpha$ Exposure (Eloundou et al.)']
dependent_var_save_name_prefix_list = ['aiFraction', 'humanAiExposureFraction']#, 'humanAiExposureFraction']#, 'gpt4AiExposureFraction']


master_df_all_weights = pd.DataFrame()
# Run the analysis for each BLS sector level and ONET occupation level
for my_sector in bls_sector_levels:
    for my_onet_level, onet_occupation_code_var, onet_occupation_title_var in zip(onet_levels, onet_occupation_code_vars, onet_occupation_title_vars):
        if my_onet_level != 'detailed':
                    continue
        
        # Get occupation data
        occupation_analysis = create_occupation_analysis(my_sector, my_onet_level,
                                                         merged_data, onet_occupation_code_var, onet_occupation_title_var)

        # Regression and Plot different weighting schemes
        for weight_col, plot_title_suffix, plot_save_name_prefix in zip(weight_cols, plot_title_suffix_list, plot_save_name_prefix_list):
            # if (weight_col == 'ONE') or (weight_col == 'occ_sectorEmpShare'):
            #     continue

            for dependent_var, dependent_var_title, dependent_var_save_name_prefix in zip(dependent_var_list, dependent_var_title_list, dependent_var_save_name_prefix_list):
                # Get master dataframe with industry employment shares merged to occupation analysis data
                master_df = merge_industry_employment_shares(0,
                                                            my_sector,
                                                            my_onet_level, 
                                                            dependent_var,
                                                            onet_occupation_code_var, onet_occupation_title_var,
                                                            weight_col,
                                                            occupation_analysis)

                # # Plot industry occupation count
                # plot_industry_count_distribution(my_sector, my_onet_level, 
                #                                 master_df, onet_occupation_code_var, onet_occupation_title_var)            

                master_df_all_weights = pd.concat([master_df_all_weights, master_df], ignore_index=True)
bls_sector_levels = ['sector']#, '3-digit', '4-digit', '5-digit', '6-digit']

onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']

weight_cols = ['occ_totalEmpShare',         # Weight each occupation by occupation's share of total employment (ignoring sector shares)
               'sectorEmpShare',            # Weight each occupation by its sector's share of total employment
               'occ_sectorEmpShare']        # Weight each occupation by its share of employment within its sector and weight sectors equally  
plot_title_suffix_list = ['Weighted by Occupation Share of Total Employment',
                          'Weighted by Sector Share of Total Employment',
                          'Weighted by Occupation Employment Share of Sector']
plot_save_name_prefix_list = ['occupationEmpShareWeights',
                              'sectorEmpShareWeights',
                              'occupationEmpShareWithinSectorWeights']

dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']
dependent_var_title_list = ['Fraction of AI Tasks (Anthropic)', r'Fraction of Human $\alpha$ Exposure (Eloundou et al.)']#, r'Fraction of Human \gamma AI Exposure (Eloundou et al.)']#, r'Fraction of GPT-4 $\alpha$ Exposure (Eloundou et al.)']
dependent_var_save_name_prefix_list = ['aiFraction', 'humanAiExposureFraction']#, 'humanAiExposureFraction']#, 'gpt4AiExposureFraction']

weight_cols_numericCounter_list = [i for i in range(1, len(weight_cols) + 1)]

master_df_all_weights = pd.DataFrame()
# Run the analysis for each BLS sector level and ONET occupation level
for my_sector in bls_sector_levels:
    for my_onet_level, onet_occupation_code_var, onet_occupation_title_var in zip(onet_levels, onet_occupation_code_vars, onet_occupation_title_vars):
        if my_onet_level != 'detailed':
                    continue
        
        # Get occupation data
        occupation_analysis = create_occupation_analysis(my_sector, my_onet_level,
                                                         merged_data, onet_occupation_code_var, onet_occupation_title_var)

        # Regression and Plot different weighting schemes
        for weight_col, plot_title_suffix, plot_save_name_prefix, counter in zip(weight_cols, plot_title_suffix_list, plot_save_name_prefix_list, weight_cols_numericCounter_list):

            for dependent_var, dependent_var_title, dependent_var_save_name_prefix in zip(dependent_var_list, dependent_var_title_list, dependent_var_save_name_prefix_list):
                # Get master dataframe with industry employment shares merged to occupation analysis data
                master_df = merge_industry_employment_shares(0,
                                                            my_sector,
                                                            my_onet_level, 
                                                            dependent_var,
                                                            onet_occupation_code_var, onet_occupation_title_var,
                                                            weight_col,
                                                            occupation_analysis)
                display(master_df.head())
                
                if len(master_df_all_weights) == 0:
                    master_df_all_weights = master_df.copy()

            if counter > 1:
                master_df_all_weights = master_df_all_weights.merge(master_df[[onet_occupation_code_var, onet_occupation_title_var, weight_col]],
                                                                    on=[onet_occupation_code_var, onet_occupation_title_var],
                                                                    how='inner') 

                # # Plot industry occupation count
                # plot_industry_count_distribution(my_sector, my_onet_level, 
                #                                 master_df, onet_occupation_code_var, onet_occupation_title_var)            


master_df_all_weights
# # Run the analysis for each BLS sector level and ONET occupation level
# for my_sector in bls_sector_levels:
#     for my_onet_level, onet_occupation_code_var, onet_occupation_title_var in zip(onet_levels, onet_occupation_code_vars, onet_occupation_title_vars):
#         if my_onet_level != 'detailed':
#                     continue
        
#         # Get occupation data
#         occupation_analysis = create_occupation_analysis(my_sector, my_onet_level,
#                                                          merged_data, onet_occupation_code_var, onet_occupation_title_var)

#         # Regression and Plot different weighting schemes
#         for weight_col, plot_title_suffix, plot_save_name_prefix in zip(weight_cols, plot_title_suffix_list, plot_save_name_prefix_list):
#             # if (weight_col == 'ONE') or (weight_col == 'occ_sectorEmpShare'):
#             #     continue

#             for dependent_var, dependent_var_title, dependent_var_save_name_prefix in zip(dependent_var_list, dependent_var_title_list, dependent_var_save_name_prefix_list):
#                 # Get master dataframe with industry employment shares merged to occupation analysis data
#                 master_df = merge_industry_employment_shares(0,
#                                                             my_sector,
#                                                             my_onet_level, 
#                                                             dependent_var,
#                                                             onet_occupation_code_var, onet_occupation_title_var,
#                                                             weight_col,
#                                                             occupation_analysis)

#                 # # Plot industry occupation count
#                 # plot_industry_count_distribution(my_sector, my_onet_level, 
#                 #                                 master_df, onet_occupation_code_var, onet_occupation_title_var)

#                 plot_weighted_regression_and_binned_scatter(my_sector, my_onet_level, 
#                                                             dependent_var, dependent_var_title, dependent_var_save_name_prefix,
#                                                             master_df, weight_col, 
#                                                             plot_title_suffix, plot_save_name_prefix)
                
#             regress_exposure_on_AIability(my_sector, my_onet_level,
#                                             onet_occupation_code_var, onet_occupation_title_var,
#                                             weight_col)

Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,ai_fraction,num_tasks,occ_totalEmpShare
0,41-2031,Retail Salespersons,0.17,24,0.0
1,35-3023,Fast Food and Counter Workers,0.04,47,0.0
2,11-1021,General and Operations Managers,0.06,17,0.0
3,41-2011,Cashiers,0.14,29,0.0
4,29-1141,Registered Nurses,0.09,137,0.0


Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,human_E1_fraction,num_tasks,occ_totalEmpShare
0,41-2031,Retail Salespersons,0.29,24,0.0
1,35-3023,Fast Food and Counter Workers,0.06,47,0.0
2,11-1021,General and Operations Managers,0.12,17,0.0
3,41-2011,Cashiers,0.14,29,0.0
4,29-1141,Registered Nurses,0.12,137,0.0


Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,ai_fraction,num_tasks,sectorEmpShare
0,11-1011,Chief Executives,0.27,49,0.18
1,13-1141,"Compensation, Benefits, and Job Analysis Speci...",0.17,24,0.18
2,15-1232,Computer User Support Specialists,0.44,16,0.18
3,43-3061,Procurement Clerks,0.05,19,0.18
4,43-4051,Customer Service Representatives,0.2,15,0.18


Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,human_E1_fraction,num_tasks,sectorEmpShare
0,11-1011,Chief Executives,0.12,49,0.18
1,13-1141,"Compensation, Benefits, and Job Analysis Speci...",0.29,24,0.18
2,15-1232,Computer User Support Specialists,0.44,16,0.18
3,43-3061,Procurement Clerks,0.26,19,0.18
4,43-4051,Customer Service Representatives,0.4,15,0.18


Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,ai_fraction,num_tasks,occ_sectorEmpShare
0,11-1021,General and Operations Managers,0.06,17,0.57
1,45-2092,"Farmworkers and Laborers, Crop, Nursery, and G...",0.07,29,0.47
2,43-4051,Customer Service Representatives,0.2,15,0.44
3,53-7062,"Laborers and Freight, Stock, and Material Move...",0.0,39,0.42
4,43-9061,"Office Clerks, General",0.38,21,0.37


Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,human_E1_fraction,num_tasks,occ_sectorEmpShare
0,11-1021,General and Operations Managers,0.12,17,0.57
1,45-2092,"Farmworkers and Laborers, Crop, Nursery, and G...",0.0,29,0.47
2,43-4051,Customer Service Representatives,0.4,15,0.44
3,53-7062,"Laborers and Freight, Stock, and Material Move...",0.0,39,0.42
4,43-9061,"Office Clerks, General",0.24,21,0.37


Unnamed: 0,Detailed_Occupation_Code,Detailed_Occupation_Title,ai_fraction,num_tasks,occ_totalEmpShare,sectorEmpShare,occ_sectorEmpShare
0,41-2031,Retail Salespersons,0.17,24,0.00,0.18,0.28
1,35-3023,Fast Food and Counter Workers,0.04,47,0.00,0.17,0.28
2,11-1021,General and Operations Managers,0.06,17,0.00,0.18,0.57
3,41-2011,Cashiers,0.14,29,0.00,0.18,0.25
4,29-1141,Registered Nurses,0.09,137,0.00,0.17,0.18
...,...,...,...,...,...,...,...
733,49-9045,"Refractory Materials Repairers, Except Brickma...",0.00,16,0.00,0.02,0.00
734,51-7031,"Model Makers, Wood",0.00,14,0.00,0.04,0.00
735,13-1074,Farm Labor Contractors,0.00,8,0.00,0.00,0.00
736,51-2061,Timing Device Assemblers and Adjusters,0.00,17,0.00,0.02,0.00


In [122]:
bls_sector_levels = ['sector']#, '3-digit', '4-digit', '5-digit', '6-digit']

onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']

weight_cols = ['occ_totalEmpShare',         # Weight each occupation by occupation's share of total employment (ignoring sector shares)
               'sectorEmpShare',            # Weight each occupation by its sector's share of total employment
               'occ_sectorEmpShare']        # Weight each occupation by its share of employment within its sector and weight sectors equally  
plot_title_suffix_list = ['Weighted by Occupation Share of Total Employment',
                          'Weighted by Sector Share of Total Employment',
                          'Weighted by Occupation Employment Share of Sector']
plot_save_name_prefix_list = ['occupationEmpShareWeights',
                              'sectorEmpShareWeights',
                              'occupationEmpShareWithinSectorWeights']

dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']
dependent_var_title_list = ['Fraction of AI Tasks (Anthropic)', r'Fraction of Human $\alpha$ Exposure (Eloundou et al.)']#, r'Fraction of Human \gamma AI Exposure (Eloundou et al.)']#, r'Fraction of GPT-4 $\alpha$ Exposure (Eloundou et al.)']
dependent_var_save_name_prefix_list = ['aiFraction', 'humanAiExposureFraction']#, 'humanAiExposureFraction']#, 'gpt4AiExposureFraction']

# Run the analysis for each BLS sector level and ONET occupation level
for my_sector in bls_sector_levels:
    for my_onet_level, onet_occupation_code_var, onet_occupation_title_var in zip(onet_levels, onet_occupation_code_vars, onet_occupation_title_vars):
        if my_onet_level != 'detailed':
                    continue
        
        # Get occupation data
        occupation_analysis = create_occupation_analysis(my_sector, my_onet_level,
                                                         merged_data, onet_occupation_code_var, onet_occupation_title_var)

        # Regression and Plot different weighting schemes
        for weight_col, plot_title_suffix, plot_save_name_prefix in zip(weight_cols, plot_title_suffix_list, plot_save_name_prefix_list):
            # if (weight_col == 'ONE') or (weight_col == 'occ_sectorEmpShare'):
            #     continue

            for dependent_var, dependent_var_title, dependent_var_save_name_prefix in zip(dependent_var_list, dependent_var_title_list, dependent_var_save_name_prefix_list):
                # Get master dataframe with industry employment shares merged to occupation analysis data
                master_df = merge_industry_employment_shares(0,
                                                            my_sector,
                                                            my_onet_level, 
                                                            dependent_var,
                                                            onet_occupation_code_var, onet_occupation_title_var,
                                                            weight_col,
                                                            occupation_analysis)

                # # Plot industry occupation count
                # plot_industry_count_distribution(my_sector, my_onet_level, 
                #                                 master_df, onet_occupation_code_var, onet_occupation_title_var)

                plot_weighted_regression_and_binned_scatter(my_sector, my_onet_level, 
                                                            dependent_var, dependent_var_title, dependent_var_save_name_prefix,
                                                            master_df, weight_col, 
                                                            plot_title_suffix, plot_save_name_prefix)
                
            regress_exposure_on_AIability(my_sector, my_onet_level,
                                            onet_occupation_code_var, onet_occupation_title_var,
                                            weight_col)

In [123]:
# After the loops: combine all per-iteration regression outputs into a master file
import os, glob
paths_list = ['regressions', f'regression_BLS{my_sector}_ONET{my_onet_level}']
for path in paths_list:
    reg_out_dir = f'{output_data_path}/{path}'
    os.makedirs(reg_out_dir, exist_ok=True)
    reg_files = glob.glob(os.path.join(reg_out_dir, 'reg_BLS*.csv'))

    combined = pd.concat([pd.read_csv(f) for f in reg_files], ignore_index=True)
    combined = combined.sort_values(by=['model', 'BLS_sector_level', 'ONET_level', 'weight_col'], ascending=True).reset_index(drop=True)

    combined = combined[(combined['model'] == 'WLS') | (combined['model'] == 'WLS_exposure_on_AIability')].reset_index(drop=True)
    combined = combined.drop(columns=['model'])

    try:
        # Drop unimportant entries
        combined = combined.drop(columns=['plot_prefix'])
        combined = combined[combined['ONET_level'] != 'major']
        # combined = combined[(combined['weight_col'] != 'ONE') & (combined['weight_col'] != 'occ_sectorEmpShare')].reset_index(drop=True)

        # Sort order of entries
        # Create mapping dicts
        dependent_var_map = {'ai_fraction': 0, 'gpt4_E1_fraction': 2, 'human_E1_fraction': 1}
        bls_map = {v: i for i, v in enumerate(bls_sector_levels)}
        onet_map = {v: i for i, v in enumerate(onet_levels)}
        weight_map = {v: i for i, v in enumerate(weight_cols)}

        # Sort by all three with different mappings
        combined = combined.sort_values(
            by=["weight_col", "dependent_var", "BLS_sector_level", "ONET_level"],
            key=lambda col: (
                col.map(dependent_var_map) if col.name == "dependent_var" else
                col.map(weight_map) if col.name == "weight_col" else
                col.map(bls_map) if col.name == "BLS_sector_level" else
                col.map(onet_map)
            )
        ).reset_index(drop=True)

        master_file = os.path.join(reg_out_dir, 'master_regressions.csv')
        combined.to_csv(master_file, index=False)
        print(f"Combined {len(reg_files)} regression files into {master_file}")
    except Exception as e:
        master_file = os.path.join(reg_out_dir, 'master_regressions.csv')
        combined.to_csv(master_file, index=False)
        print(f"Combined {len(reg_files)} regression files into {master_file}")

    

Combined 6 regression files into ../data/computed_objects/BLS_ONET_matchedEmpShares/regressions/master_regressions.csv
Combined 3 regression files into ../data/computed_objects/BLS_ONET_matchedEmpShares/regression_BLSsector_ONETdetailed/master_regressions.csv


## Placebo Test: Reshuffle Task-Occupation Assignment and Repeat the Same Analysis

In [124]:
# ---- Begin: 100-seed reshuffle + analysis ----
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# List of unit columns exactly as you specified
unit_cols = [
    "O*NET-SOC Code",
    "Occupation Title",
    "Base_SOC_Code",
    "Major_Group_Code",
    "Major_Group_Title",
    "Minor_Group_Code",
    "Minor_Group_Title",
    "Broad_Occupation_Code",
    "Broad_Occupation_Title",
    "Detailed_Occupation_Code",
    "Detailed_Occupation_Title"
]

def reshuffle_tasks_preserve_unit_counts(merged_df, seed, unit_cols):
    """
    Shuffle task records across units while preserving each unit's number of tasks.
    - unit_cols: list of columns that define a unit (will remain as unit identity).
    - All other columns (including 'Task ID' and 'Task Title' and other task properties)
      are considered task properties and move with the task to a new unit.
    Returns a new reshuffled DataFrame with same columns and same number of rows.
    """
    # Defensive copy
    df = merged_df.copy()

    # Determine task/property columns = all columns except unit columns
    task_columns = [c for c in df.columns if c not in unit_cols]

    # Compute unit-level counts (preserve order)
    unit_counts = df.groupby(unit_cols, sort=False).size().reset_index(name='n_tasks')

    # Extract the task pool (task properties only)
    task_pool = df[task_columns].sample(frac=1, random_state=seed).reset_index(drop=True)

    assigned_blocks = []
    cursor = 0
    for _, row in unit_counts.iterrows():
        n = int(row['n_tasks'])
        # slice of tasks to assign to this unit
        tasks_slice = task_pool.iloc[cursor:cursor + n].copy().reset_index(drop=True)
        cursor += n

        # create a block with unit columns repeated for each assigned task
        unit_block = pd.DataFrame([row[unit_cols].to_dict()] * n)
        block = pd.concat([unit_block.reset_index(drop=True), tasks_slice.reset_index(drop=True)], axis=1)
        assigned_blocks.append(block)

    reshuffled = pd.concat(assigned_blocks, ignore_index=True)

    # Keep original column order
    reshuffled = reshuffled[df.columns]
    return reshuffled


def reshuffle_tasks_random_assignments(merged_df, seed, unit_cols):
    """
    Randomly reassign tasks to units (occupations) without preserving the
    original number of tasks per unit. This draws unit identities at random
    (with replacement) for each task, so unit counts will vary across the
    reshuffle.

    - merged_df: DataFrame with task rows and unit-identifying columns in unit_cols
    - seed: integer random seed
    - unit_cols: list of columns that define a unit

    Returns a DataFrame with the same columns as merged_df but unit columns
    reassigned randomly.
    """
    df = merged_df.copy()
    rng = np.random.RandomState(seed)

    # Build list of unique unit identity rows (dicts)
    unit_id_rows = df[unit_cols].drop_duplicates().to_dict(orient='records')
    n_units = len(unit_id_rows)
    if n_units == 0:
        # nothing to do
        return df

    # For each task row, sample a unit index (with replacement) so counts can vary
    sampled_idxs = rng.randint(0, n_units, size=len(df))
    sampled_units = [unit_id_rows[i] for i in sampled_idxs]
    sampled_df = pd.DataFrame(sampled_units)

    # Overwrite the unit columns in the copy
    for col in unit_cols:
        if col in sampled_df.columns:
            df[col] = sampled_df[col].values

    # Keep original column order
    df = df[merged_df.columns]
    return df

# Where to save per-seed outputs (will create seed-specific subfolders under this)
base_output_dir = output_data_path  # uses your notebook's variable by default
seed_root = os.path.join(base_output_dir, "seed_shuffles")
os.makedirs(seed_root, exist_ok=True)

# Option: preserve the number of tasks per unit (occupation) or not.
# Set to True to preserve counts (original behavior). Set to False to allow
# arbitrary reassignment of tasks to units (unit counts will change).
preserve_unit_counts = True  # <-- change this to False to use non-preserving reshuffle
print(f"Reshuffle mode: preserve_unit_counts={preserve_unit_counts}")

# ------------------ Run the full pipeline on the ORIGINAL (observed) dataset first and save under seed_0 ------------------
print("Running full observed (original) pipeline and saving outputs under seed_shuffles/seed_0 ...")
seed0_output = os.path.join(seed_root, 'seed_0')
os.makedirs(seed0_output, exist_ok=True)
_saved_output_data_path = globals().get('output_data_path', None)
globals()['output_data_path'] = seed0_output

# Run the same loops used elsewhere in the notebook to produce observed regressions and plots
for my_sector in bls_sector_levels:
    for my_onet_level, onet_occupation_code_var, onet_occupation_title_var in zip(onet_levels, onet_occupation_code_vars, onet_occupation_title_vars):
        if my_onet_level != 'detailed':
            continue

        occupation_analysis = create_occupation_analysis(my_sector, my_onet_level, merged_data, onet_occupation_code_var, onet_occupation_title_var)

        for weight_col, plot_title_suffix, plot_save_name_prefix in zip(weight_cols, plot_title_suffix_list, plot_save_name_prefix_list):
            for dependent_var, dependent_var_title, dependent_var_save_name_prefix in zip(dependent_var_list, dependent_var_title_list, dependent_var_save_name_prefix_list):
                master_df = merge_industry_employment_shares(0, my_sector, my_onet_level, dependent_var, onet_occupation_code_var, onet_occupation_title_var, weight_col, occupation_analysis)
                plot_weighted_regression_and_binned_scatter(my_sector, my_onet_level, dependent_var, dependent_var_title, dependent_var_save_name_prefix, master_df, weight_col, plot_title_suffix, plot_save_name_prefix)

        # run exposure vs AIability regression per weight_col (once per sector/onet level)
        for weight_col in weight_cols:
            regress_exposure_on_AIability(my_sector, my_onet_level, onet_occupation_code_var, onet_occupation_title_var, weight_col)

# Combine observed regression outputs generated under seed_0 into a master_regressions.csv inside seed_0/regressions
reg_files_seed0 = glob.glob(os.path.join(seed0_output, '**', 'reg_BLS*.csv'), recursive=True)
if len(reg_files_seed0) > 0:
    combined_obs = pd.concat([pd.read_csv(f) for f in reg_files_seed0], ignore_index=True)
    combined_obs = combined_obs.sort_values(by=['model', 'BLS_sector_level', 'ONET_level', 'weight_col'], ascending=True).reset_index(drop=True)
    combined_obs = combined_obs[(combined_obs['model'] == 'WLS') | (combined_obs['model'] == 'WLS_exposure_on_AIability')].reset_index(drop=True)
    if 'plot_prefix' in combined_obs.columns:
        combined_obs = combined_obs.drop(columns=['plot_prefix'])
    out_dir_obs = os.path.join(seed0_output, 'regressions')
    os.makedirs(out_dir_obs, exist_ok=True)
    master_file_obs = os.path.join(out_dir_obs, 'master_regressions.csv')
    combined_obs.to_csv(master_file_obs, index=False)
    print(f"Saved observed master_regressions under seed_0 -> {master_file_obs} ({len(combined_obs)} rows)")
else:
    print("Warning: No reg_BLS*.csv files found under seed_0 outputs to combine for the observed run.")

# restore original output_data_path if it existed (seed loop will override it per-seed later)
if _saved_output_data_path is None:
    globals().pop('output_data_path', None)
else:
    globals()['output_data_path'] = _saved_output_data_path
# ------------------ End observed-to-seed0 pipeline ------------------

# Now run reshuffles starting from seed 1
n_seeds = 1000
np.random.seed(42)  # for reproducibility of seed list
seeds = list(range(1, n_seeds + 1))

# Keep track of per-seed master_regressions file paths
seed_master_files = []

# Run the pipeline for each seed
for seed in tqdm(seeds, desc="Seeds"):
    try:
        reshuffled = merged_data.copy()
        # # 1) Build reshuffled merged_data for this seed
        # if preserve_unit_counts:
        #     reshuffled = reshuffle_tasks_preserve_unit_counts(merged_data, seed=seed, unit_cols=unit_cols)
        # else:
        #     reshuffled = reshuffle_tasks_random_assignments(merged_data, seed=seed, unit_cols=unit_cols)

        # 2) Temporarily redirect outputs to seed-specific folder so each seed's regressions don't collide
        seed_output_data_path = os.path.join(seed_root, f"seed_{seed}")
        if not os.path.exists(seed_output_data_path):
            os.makedirs(seed_output_data_path, exist_ok=True)
        else:
            continue  # skip already-done seeds

        # Save and restore original output_data_path after seed run
        orig_output_data_path = globals().get('output_data_path', None)
        globals()['output_data_path'] = seed_output_data_path

        # 3) Run the same analysis you do in the notebook, but on `reshuffled`
        #    We replicate the part of your main loop that creates occupation_analysis and then merges and runs regressions/plots.
        #    Keep to the same ONET level(s) and sector levels you use in the notebook.
        #    We'll follow the same loops you have. Adjust if you want fewer runs.
        for my_sector in bls_sector_levels:
            for my_onet_level, onet_occupation_code_var, onet_occupation_title_var in zip(onet_levels, onet_occupation_code_vars, onet_occupation_title_vars):
                if my_onet_level != 'detailed':
                    continue

                # Use reshuffled for occupation analysis
                occupation_analysis = create_occupation_analysis(my_sector, my_onet_level,
                                                                 reshuffled, onet_occupation_code_var, onet_occupation_title_var)

                for weight_col, plot_title_suffix, plot_save_name_prefix in zip(weight_cols, plot_title_suffix_list, plot_save_name_prefix_list):
                    for dependent_var, dependent_var_title, dependent_var_save_name_prefix in zip(dependent_var_list, dependent_var_title_list, dependent_var_save_name_prefix_list):
                        master_df = merge_industry_employment_shares(seed,
                                                                     my_sector,
                                                                    my_onet_level,
                                                                    dependent_var,
                                                                    onet_occupation_code_var, onet_occupation_title_var,
                                                                    weight_col,
                                                                    occupation_analysis)

                        plot_weighted_regression_and_binned_scatter(my_sector, my_onet_level,
                                                                   dependent_var, dependent_var_title, dependent_var_save_name_prefix,
                                                                   master_df, weight_col,
                                                                   plot_title_suffix, plot_save_name_prefix)
                    # run exposure vs AIability regression per weight_col
                    regress_exposure_on_AIability(my_sector, my_onet_level,
                                                  onet_occupation_code_var, onet_occupation_title_var,
                                                  weight_col)

        # 4) After finishing seed runs, run the same combining code you have that creates master_regressions.csv
        #    (Your notebook's combining code expects variables my_onet_level & my_sector from the last loop; to be safe, we'll recompute and call it similarly)
        # We'll create combined master_regressions within the seed folder:
        # reuse your combining logic but pointing at this seed's output folder
        try:
            # try to find all reg files under this seed output folder
            reg_files = glob.glob(os.path.join(seed_output_data_path, '**', 'reg_BLS*.csv'), recursive=True)
            if len(reg_files) == 0:
                print(f"[seed {seed}] No reg files found under {seed_output_data_path}; skipping combine.")
            else:
                combined = pd.concat([pd.read_csv(f) for f in reg_files], ignore_index=True)
                # Apply the same cleaning/sorting you do in the notebook
                combined = combined.sort_values(by=['model', 'BLS_sector_level', 'ONET_level', 'weight_col'], ascending=True).reset_index(drop=True)
                combined = combined[(combined['model'] == 'WLS') | (combined['model'] == 'WLS_exposure_on_AIability')].reset_index(drop=True)
                # drop model and unneeded cols if present
                if 'plot_prefix' in combined.columns:
                    combined = combined.drop(columns=['plot_prefix'])
                combined = combined[combined['ONET_level'] != 'major'] if 'ONET_level' in combined.columns else combined

                out_dir = os.path.join(seed_output_data_path, 'regressions')
                os.makedirs(out_dir, exist_ok=True)
                master_file = os.path.join(out_dir, 'master_regressions.csv')
                combined.to_csv(master_file, index=False)
                seed_master_files.append(master_file)
                if seed % 50 == 0:
                    print(f"[seed {seed}] Combined regressions -> {master_file}")
        except Exception as e:
            print(f"[seed {seed}] Failed to combine regression files: {e}")

    except Exception as e:
        print(f"[seed {seed}] ERROR during seed processing: {e}")
    # finally:
    #     # restore original output_data_path global
    #     if orig_output_data_path is None:
    #         globals().pop('output_data_path', None)
    #     else:
    #         globals()['output_data_path'] = orig_output_data_path

# ---- Collect coefficients across seeds and plot histograms (one plot per regression id) ----

# Prefer the seed_0 observed master file if it exists, otherwise look for any observed master not in seed_shuffles
seed0_master = os.path.join(seed_root, 'seed_0', 'regressions', 'master_regressions.csv')
if os.path.exists(seed0_master):
    observed_master_file = seed0_master
else:
    observed_master_candidates = glob.glob(os.path.join(base_output_dir, '**', 'master_regressions.csv'), recursive=True)
    # prefer one that is not in the seed_shuffles folder
    observed_master_candidates = [p for p in observed_master_candidates if 'seed_shuffles' not in p]
    observed_master_file = observed_master_candidates[0] if len(observed_master_candidates) > 0 else None

if observed_master_file is None:
    print("WARNING: Could not find an observed master_regressions.csv. Observed value will not be plotted.")
else:
    observed_df = pd.read_csv(observed_master_file)
    print(f"Found observed master file: {observed_master_file} ({len(observed_df)} rows)")

# Load per-seed master files into a single DataFrame with 'seed' column
seed_dfs = []
for seed in seeds:
    f = os.path.join(seed_root, f"seed_{seed}", "regressions", "master_regressions.csv")
    if os.path.exists(f):
        try:
            d = pd.read_csv(f)
            d['seed'] = seed
            seed_dfs.append(d)
        except Exception as e:
            print(f"[seed {seed}] failed to read {f}: {e}")
    else:
        # don't spam if many missing - print only occasionally
        print(f"[seed {seed}] master_regressions.csv not found at expected path {f}")

if len(seed_dfs) == 0:
    raise RuntimeError("No per-seed master_regressions found; aborting histogram plotting. Check that the seed runs produced regressions.")

all_seeds_df = pd.concat(seed_dfs, ignore_index=True)

# Identify coefficient column (try the commonly used names first)
# look in the per-seed combined DataFrame first, then observed if needed
coef_col = None
candidates = []
if 'coef_num_tasks' in all_seeds_df.columns:
    candidates.append('coef_num_tasks')
if 'coef_human_E1_fraction' in all_seeds_df.columns:
    candidates.append('coef_human_E1_fraction')
if len(candidates) == 0:
    candidates = [c for c in all_seeds_df.columns if str(c).startswith('coef')]
if len(candidates) == 0 and 'observed_df' in locals():
    # try observed file as a last resort
    candidates = [c for c in observed_df.columns if str(c).startswith('coef')]
if len(candidates) == 0:
    raise RuntimeError("Could not find a coefficient column in per-seed master_regressions or observed master. Look for 'coef_num_tasks' or other 'coef_*' columns.")
coef_col = candidates[0]

# Determine keys that uniquely identify a regression entry (use string names)
candidate_keys = ['BLS_sector_level', 'ONET_level', 'dependent_var', 'weight_col']
# prefer keys present in the per-seed dataframe
key_cols = [c for c in candidate_keys if c in all_seeds_df.columns]
# if observed exists and provides a better set of keys, prefer intersection that is present in both
if 'observed_df' in locals():
    obs_keys = [c for c in candidate_keys if c in observed_df.columns]
    if len(obs_keys) > 0:
        inter = [c for c in candidate_keys if c in obs_keys and c in all_seeds_df.columns]
        if len(inter) > 0:
            key_cols = inter
# final fallback: keep any sensible keys present in all_seeds_df
if len(key_cols) == 0:
    key_cols = [c for c in ['dependent_var', 'weight_col'] if c in all_seeds_df.columns]

print(f"Using coef column '{coef_col}' and keys {key_cols} to identify regressions.")

Reshuffle mode: preserve_unit_counts=True
Running full observed (original) pipeline and saving outputs under seed_shuffles/seed_0 ...
Saved observed master_regressions under seed_0 -> ../data/computed_objects/BLS_ONET_matchedEmpShares/seed_shuffles/seed_0/regressions/master_regressions.csv (9 rows)


Seeds: 100%|██████████| 1000/1000 [00:07<00:00, 128.10it/s]


Found observed master file: ../data/computed_objects/BLS_ONET_matchedEmpShares/seed_shuffles/seed_0/regressions/master_regressions.csv (9 rows)
Using coef column 'coef_num_tasks' and keys ['BLS_sector_level', 'ONET_level', 'dependent_var', 'weight_col'] to identify regressions.


In [125]:
# Prepare output dir for combined histograms
hist_out_dir = os.path.join(base_output_dir, "shuffle_histograms")
os.makedirs(hist_out_dir, exist_ok=True)

# Prefer regressions present in observed master if available, otherwise use union across seeds
if len(key_cols) == 0:
    # No clear keys available: fall back to plotting one histogram aggregating all seed coefficients
    unique_keys_df = pd.DataFrame([{}])
else:
    if 'observed_df' in locals():
        # Use observed regressions if possible (safer to plot what was actually run)
        obs_keys_present = [c for c in key_cols if c in observed_df.columns]
        if len(obs_keys_present) == len(key_cols):
            unique_keys_df = observed_df[key_cols].drop_duplicates().reset_index(drop=True)
        else:
            # observed missing some key columns: fall back to union across seeds
            unique_keys_df = all_seeds_df[key_cols].drop_duplicates().reset_index(drop=True)
    else:
        unique_keys_df = all_seeds_df[key_cols].drop_duplicates().reset_index(drop=True)

plots_created = 0
for idx, key_row in unique_keys_df.iterrows():
    # build boolean mask across all seeds; if no key_cols, this stays all True
    mask = np.ones(len(all_seeds_df), dtype=bool)
    label_parts = []
    for col in key_cols:
        # key_row may be an empty dict when key_cols == []
        val = key_row[col]
        mask &= (all_seeds_df[col] == val)
        label_parts.append(f"{val}")

    if len(key_cols) == 0:
        regression_id = "all_regressions"
    else:
        regression_id = "__".join([f"{col}-{str(key_row[col])}" for col in key_cols])

    coef_vals = all_seeds_df.loc[mask, coef_col].dropna().astype(float).values

    if coef_vals.size == 0:
        print(f"Skipping {regression_id}: no coef values found across seeds.")
        continue

    # Plot single histogram that aggregates coefficients from all seeds for this regression
    plt.figure(figsize=(9, 5))
    plt.hist(coef_vals, bins=min(30, max(5, int(len(coef_vals) / 2))), color='orange', edgecolor='k', alpha=0.9)
    plt.xlabel(coef_col)
    plt.ylabel('Count')
    title = f"{len(coef_vals)} Reshuffles: {' | '.join(label_parts) if len(label_parts)>0 else 'all'}\n\n(Randomized Weights, Fixed Task Assignment)"
    plt.title(title)

    # If observed data exists, overlay the observed coefficient as a dashed red line
    if 'observed_df' in locals():
        # if we have key_cols, try to match observed rows; otherwise use any observed coef available
        if len(key_cols) > 0:
            mask_obs = np.ones(len(observed_df), dtype=bool)
            for col in key_cols:
                if col in observed_df.columns:
                    mask_obs &= (observed_df[col] == key_row[col])
                else:
                    # cannot match on this key in observed; set mask_obs all False to skip
                    mask_obs &= False
            obs_series = observed_df.loc[mask_obs, coef_col].dropna().astype(float) if mask_obs.any() else pd.Series(dtype=float)
        else:
            obs_series = observed_df[coef_col].dropna().astype(float) if coef_col in observed_df.columns else pd.Series(dtype=float)

        if len(obs_series) > 0:
            obs_val = float(obs_series.iloc[0])
            plt.axvline(obs_val, color='red', linestyle='--', lw=2, label='Observed')
            plt.axvline(0, color='black', linestyle='--', lw=1.5)
            # annotate percentile: how many seeds are below the observed value
            percentile = (coef_vals < obs_val).mean() * 100.0
            plt.legend(loc = 'upper right', title=f'Observed (pct below: {percentile:.1f}% )')
            plt.xlim(-0.004, 0.004)

    out_file = os.path.join(hist_out_dir, f"hist_{regression_id}.png")
    plt.tight_layout()
    plt.savefig(out_file, dpi=300)
    plt.close()
    plots_created += 1
    print(f"Saved histogram for {regression_id} -> {out_file}")

print(f"Done: created {plots_created} histogram(s). Per-seed master_regressions (if produced) were stored under:\n  {seed_root}")
print(f"Histograms saved under:\n  {hist_out_dir}")

# ---- End cell ----

Saved histogram for BLS_sector_level-sector__ONET_level-detailed__dependent_var-human_E1_fraction__weight_col-occ_sectorEmpShare -> ../data/computed_objects/BLS_ONET_matchedEmpShares/shuffle_histograms/hist_BLS_sector_level-sector__ONET_level-detailed__dependent_var-human_E1_fraction__weight_col-occ_sectorEmpShare.png
Saved histogram for BLS_sector_level-sector__ONET_level-detailed__dependent_var-ai_fraction__weight_col-occ_sectorEmpShare -> ../data/computed_objects/BLS_ONET_matchedEmpShares/shuffle_histograms/hist_BLS_sector_level-sector__ONET_level-detailed__dependent_var-ai_fraction__weight_col-occ_sectorEmpShare.png
Saved histogram for BLS_sector_level-sector__ONET_level-detailed__dependent_var-ai_fraction__weight_col-occ_totalEmpShare -> ../data/computed_objects/BLS_ONET_matchedEmpShares/shuffle_histograms/hist_BLS_sector_level-sector__ONET_level-detailed__dependent_var-ai_fraction__weight_col-occ_totalEmpShare.png
Saved histogram for BLS_sector_level-sector__ONET_level-detailed__