#### By: Peyman Shahidi
#### Created: Oct 29, 2025
#### Last Edit: Nov 2, 2025

<br>

In [158]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Format floats with thousands separators and two decimal places, e.g., 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [159]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Format floats with thousands separators and two decimal places, e.g., 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

# Folder layout: every path below is built relative to the parent of the current working directory.
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
# Modify the output path accordingly
output_data_path = f'{input_data_path}/computed_objects/BLS_ONET_matchedEmpShares'
# Plot folders: one root plus one subfolder per way of slicing the results.
output_plot_path = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares"
output_plot_path_by_BLS_sector = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares/by_BLS_sector"
output_plot_path_by_ONET_level = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares/by_ONET_level"
output_plot_path_by_weighting_scheme = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares/by_weighting_scheme"
output_plot_path_by_dependent_var = f"{main_folder_path}/writeup/plots/anthropic_AI_index/BLS_ONET_matchedEmpShares/by_dependent_var"

# Toggle: if True, randomly reassign occ_totalEmpShare weights in the merged master_df
# during the merge_industry_employment_shares step. Set to False for default behavior.
# NOTE(review): the flag is currently True (i.e. NOT the default described above), and no
# code in the visible part of this notebook reads it -- confirm where it is consumed.
randomize_occ_weights = True


In [160]:
import os

# Make sure every output folder exists before anything is written.
# exist_ok=True replaces the separate os.path.exists() check, so the call is
# idempotent and has no window between "check" and "create".
for path in [output_data_path, output_plot_path,
             output_plot_path_by_BLS_sector, output_plot_path_by_ONET_level,
             output_plot_path_by_weighting_scheme, output_plot_path_by_dependent_var]:
    os.makedirs(path, exist_ok=True)

In [161]:
# Read the merged data (ONET_Eloundou_Anthropic_GPT.csv under computed_objects).
# Every analysis below groups this frame by occupation code/title and reads its
# 'Task ID', 'label', 'gpt4_exposure' and 'human_labels' columns.
merged_data = pd.read_csv(f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv")

In [162]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

## Functions

In [163]:
def create_occupation_analysis(my_sector, my_onet_level,
                               merged_data, onet_occupation_code_var, onet_occupation_title_var):
    """Build one summary row per occupation from the task-level rows.

    Rows of ``merged_data`` are grouped by the (code, title) pair named by
    ``onet_occupation_code_var`` / ``onet_occupation_title_var``. For each group
    the result holds the number of distinct 'Task ID' values and, for the
    'label', 'gpt4_exposure' and 'human_labels' columns, the share of the
    group's rows carrying each category.

    ``my_sector`` and ``my_onet_level`` are accepted for call-site symmetry with
    the other pipeline steps; this function does not read them.

    Returns a DataFrame with one row per occupation group.
    """

    def _share(rows, column, category):
        # Rows whose `column` equals `category`, as a fraction of all rows in the group.
        return (rows[column] == category).sum() / len(rows)

    summary_rows = []
    by_occupation = merged_data.groupby([onet_occupation_code_var, onet_occupation_title_var])

    for (code, title), tasks in by_occupation:
        manual = _share(tasks, 'label', 'Manual')
        augmentation = _share(tasks, 'label', 'Augmentation')
        automation = _share(tasks, 'label', 'Automation')

        gpt4_e0 = _share(tasks, 'gpt4_exposure', 'E0')
        gpt4_e1 = _share(tasks, 'gpt4_exposure', 'E1')
        gpt4_e2 = _share(tasks, 'gpt4_exposure', 'E2')

        human_e0 = _share(tasks, 'human_labels', 'E0')
        human_e1 = _share(tasks, 'human_labels', 'E1')
        human_e2 = _share(tasks, 'human_labels', 'E2')

        summary_rows.append({
            f'{onet_occupation_code_var}': code,
            f'{onet_occupation_title_var}': title,
            'num_tasks': tasks['Task ID'].nunique(),
            'manual_fraction': manual,
            # "AI" = augmentation + automation; "aiExposure" = E1 + E2
            'ai_fraction': augmentation + automation,
            'augmentation_fraction': augmentation,
            'automation_fraction': automation,
            'gpt4_E0_fraction': gpt4_e0,
            'gpt4_E1_fraction': gpt4_e1,
            'gpt4_E2_fraction': gpt4_e2,
            'gpt4_aiExposure_fraction': gpt4_e1 + gpt4_e2,
            'human_E0_fraction': human_e0,
            'human_E1_fraction': human_e1,
            'human_E2_fraction': human_e2,
            'human_aiExposure_fraction': human_e1 + human_e2,
        })

    return pd.DataFrame(summary_rows)



def merge_industry_employment_shares_master(seed,
                                            my_sector, my_onet_level,
                                            onet_occupation_code_var, onet_occupation_title_var,
                                            weight_cols,
                                            occupation_analysis):
    """Attach BLS employment-share weights to the occupation-level table.

    Reads ``bls_{my_sector}_ONET{my_onet_level}_empShares.csv`` from
    ``{input_data_path}/computed_objects/BLS_ONET_empShares`` and, for every
    scheme named in ``weight_cols``, adds one weight column to a copy of
    ``occupation_analysis`` (left join on ``onet_occupation_code_var`` against
    the BLS ``OCC_CODE``):

    * ``occ_totalEmpShare``  - occupation TOT_EMP summed over sectors, divided
      by the grand total.
    * ``sectorEmpShare``     - each sector's share of the grand total, summed
      over the sector rows in which the occupation appears.
    * ``occ_sectorEmpShare`` - the occupation's share of TOT_EMP within each
      sector, summed over sectors.

    Occupations without a BLS match get NaN weights. The merged table is
    written to ``{output_data_path}/BLS{my_sector}_ONET{my_onet_level}.csv``
    and returned.

    ``seed`` and ``onet_occupation_title_var`` are not used by this function;
    they are kept so existing call sites keep working.
    """
    bls_sector_shares = pd.read_csv(f'{input_data_path}/computed_objects/BLS_ONET_empShares/bls_{my_sector}_ONET{my_onet_level}_empShares.csv')

    # Ensure NAICS is string for consistent merging
    bls_sector_shares['NAICS'] = bls_sector_shares['NAICS'].astype(str)

    def _attach(frame, occ_weights, weight_name):
        # Rename the BLS key to the occupation key so the join adds ONLY the
        # weight column. No helper 'OCC_CODE' column is created, hence nothing
        # has to be dropped afterwards (the old unconditional drop raised
        # KeyError whenever 'occ_totalEmpShare' was not requested).
        keyed = occ_weights[['OCC_CODE', weight_name]].rename(columns={'OCC_CODE': onet_occupation_code_var})
        return frame.merge(keyed, on=onet_occupation_code_var, how='left')

    # Start with occupation_analysis as output_df
    output_df = occupation_analysis.copy()

    # 1) occ_totalEmpShare
    if 'occ_totalEmpShare' in weight_cols:
        occ_totals = bls_sector_shares.groupby('OCC_CODE', as_index=False)['TOT_EMP'].sum()
        occ_totals['occ_totalEmpShare'] = occ_totals['TOT_EMP'] / occ_totals['TOT_EMP'].sum()
        output_df = _attach(output_df, occ_totals, 'occ_totalEmpShare')

    # 2) sectorEmpShare
    if 'sectorEmpShare' in weight_cols:
        sector_totals = bls_sector_shares.groupby(['NAICS', 'NAICS_TITLE'], as_index=False)['TOT_EMP'].sum()
        sector_totals['sectorEmpShare'] = sector_totals['TOT_EMP'] / sector_totals['TOT_EMP'].sum()

        # Give every (sector, occupation) row its sector's share, then add the
        # shares up per occupation.
        occ_by_sector = bls_sector_shares[['NAICS', 'OCC_CODE']].merge(
            sector_totals[['NAICS', 'sectorEmpShare']], on='NAICS', how='left')
        occ_sector_weight = occ_by_sector.groupby('OCC_CODE', as_index=False)['sectorEmpShare'].sum()
        output_df = _attach(output_df, occ_sector_weight, 'sectorEmpShare')

    # 3) occ_sectorEmpShare
    if 'occ_sectorEmpShare' in weight_cols:
        within_sector = bls_sector_shares[['NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP']].copy()
        sector_size = within_sector.groupby(['NAICS', 'NAICS_TITLE'])['TOT_EMP'].transform('sum')
        within_sector['occ_sectorEmpShare'] = within_sector['TOT_EMP'] / sector_size

        # Sum the within-sector shares over all sectors
        occ_within = within_sector.groupby(['OCC_CODE', 'OCC_TITLE'], as_index=False)['occ_sectorEmpShare'].sum()
        output_df = _attach(output_df, occ_within, 'occ_sectorEmpShare')

    # Save master dataframe to CSV
    output_df.to_csv(f'{output_data_path}/BLS{my_sector}_ONET{my_onet_level}.csv', index=False)

    return output_df




def run_weighted_regression(my_sector, my_onet_level,
                           master_df, 
                           dependent_vars,
                           weight_cols):
    """Run WLS of each dependent variable on ``num_tasks`` under each weighting scheme.

    For every (weight_col, dependent_var) pair the model
    ``dependent_var ~ num_tasks`` is fitted with weights ``master_df[weight_col]``
    and HC3 standard errors. The ``num_tasks`` coefficient, its standard error,
    p-value and the number of observations are collected, one row per model.

    The collected rows are written to
    ``{output_data_path}/regressions/reg_BLS{my_sector}_ONET{my_onet_level}.csv``
    and also returned as a DataFrame (empty if no model could be fitted).

    NOTE(review): ``smf`` is expected to be ``statsmodels.formula.api``; its
    import is not visible in this part of the notebook -- confirm it is in scope.
    """
    results = []

    for weight_col in weight_cols:
        for dependent_var in dependent_vars:
            model_cols = [dependent_var, 'num_tasks', weight_col]

            # Data prep: coerce to numeric FIRST so that values which cannot be
            # parsed (turned into NaN by errors='coerce') are removed by the
            # dropna below instead of leaking into the regression sample.
            df = master_df[model_cols].copy()
            for col in model_cols:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=model_cols)

            if df.empty:
                continue

            # Regression
            model_wls = smf.wls(f'{dependent_var} ~ num_tasks', data=df, weights=df[weight_col]).fit(cov_type='HC3')

            # Store results
            results.append({
                'BLS_sector_level': my_sector,
                'ONET_level': my_onet_level,
                'dependent_var': dependent_var,
                'weight_col': weight_col,
                'model': 'WLS',
                'coef_num_tasks': float(model_wls.params.get('num_tasks', np.nan)),
                'std_err': float(model_wls.bse.get('num_tasks', np.nan)),
                'pvalue': float(model_wls.pvalues.get('num_tasks', np.nan)),
                'n_obs': int(model_wls.nobs)
            })

    reg_df = pd.DataFrame(results)

    reg_out_dir = f"{output_data_path}/regressions"
    os.makedirs(reg_out_dir, exist_ok=True)
    out_file = f"{reg_out_dir}/reg_BLS{my_sector}_ONET{my_onet_level}.csv"
    reg_df.to_csv(out_file, index=False)

    return reg_df




def regress_exposure_on_AIability(my_sector, my_onet_level,
                                  master_df,
                                  weight_cols,
                                  dependent_var='ai_fraction',
                                  regressor='human_E1_fraction'):
    """
    For each weight column, run WLS: dependent_var ~ regressor + num_tasks
    (by default ai_fraction ~ human_E1_fraction + num_tasks) with HC3 standard
    errors, collect the coefficient on the regressor, and save all results in
    one CSV at
    {output_data_path}/reg_BLS{my_sector}_ONET{my_onet_level}_AIability_on_exposure.csv.

    Weight columns whose cleaned sample is empty are skipped.
    Returns the results DataFrame (one row per fitted weight column).

    NOTE(review): ``smf`` is expected to be ``statsmodels.formula.api``; its
    import is not visible in this part of the notebook -- confirm it is in scope.
    """

    results = []

    for weight_col in weight_cols:
        # Clean sample for this regression
        model_cols = [dependent_var, regressor, 'num_tasks', weight_col]
        df = master_df[model_cols].copy()
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.dropna(subset=model_cols)

        # Same guard as run_weighted_regression: nothing to fit on an empty sample
        if df.empty:
            continue

        model = smf.wls(
            f'{dependent_var} ~ {regressor} + num_tasks',
            data=df,
            weights=df[weight_col]
        ).fit(cov_type='HC3')

        results.append({
            'BLS_sector_level': my_sector,
            'ONET_level': my_onet_level,
            'weight_col': weight_col,
            # Label must match the one the combine steps filter on
            # ('WLS_exposure_on_AIability'); the previous spelling
            # 'WLS_AIability_on_exposure' was silently filtered out there.
            'model': 'WLS_exposure_on_AIability',
            'coef_human_E1_fraction': float(model.params.get(regressor, np.nan)),
            'std_err': float(model.bse.get(regressor, np.nan)),
            'pvalue': float(model.pvalues.get(regressor, np.nan)),
            'n_obs': int(model.nobs),
        })

    res_df = pd.DataFrame(results)

    # Save one CSV with all weight cols
    out_file = f"{output_data_path}/reg_BLS{my_sector}_ONET{my_onet_level}_AIability_on_exposure.csv"
    res_df.to_csv(out_file, index=False)

    return res_df

In [None]:
# BLS aggregation levels to run; only 'sector' is active, the finer levels are commented out.
bls_sector_levels = ['sector']#, '3-digit', '4-digit', '5-digit', '6-digit']

# Three parallel lists: entry i of each describes the same O*NET aggregation level
# (level name, code column, title column). They are zipped together in the loop below.
onet_levels = ['major', 'minor', 'broad', 'detailed']
onet_occupation_code_vars = ['Major_Group_Code', 'Minor_Group_Code', 'Broad_Occupation_Code', 'Detailed_Occupation_Code']
onet_occupation_title_vars = ['Major_Group_Title', 'Minor_Group_Title', 'Broad_Occupation_Title', 'Detailed_Occupation_Title']

weight_cols = ['occ_totalEmpShare',         # Weight each occupation by occupation's share of total employment (ignoring sector shares)
               'sectorEmpShare',            # Weight each occupation by its sector's share of total employment
               'occ_sectorEmpShare']        # Weight each occupation by its share of employment within its sector and weight sectors equally  
# plot_title_suffix_list = ['Weighted by Occupation Share of Total Employment',
#                           'Weighted by Sector Share of Total Employment',
#                           'Weighted by Occupation Employment Share of Sector']
# plot_save_name_prefix_list = ['occupationEmpShareWeights',
#                               'sectorEmpShareWeights',
#                               'occupationEmpShareWithinSectorWeights']

# Dependent variables regressed on num_tasks by run_weighted_regression.
dependent_var_list = ['ai_fraction', 'human_E1_fraction']#, 'human_aiExposure_fraction']#, 'gpt4_E1_fraction']
# dependent_var_title_list = ['Fraction of AI Tasks (Anthropic)', r'Fraction of Human $\alpha$ Exposure (Eloundou et al.)']#, r'Fraction of Human \gamma AI Exposure (Eloundou et al.)']#, r'Fraction of GPT-4 $\alpha$ Exposure (Eloundou et al.)']
# dependent_var_save_name_prefix_list = ['aiFraction', 'humanAiExposureFraction']#, 'humanAiExposureFraction']#, 'gpt4AiExposureFraction']


# Run the analysis for each BLS sector level and ONET occupation level
# NOTE: the loop variables my_sector / my_onet_level stay defined after the loop and
# are read by the combine cell further down to build a folder name.
for my_sector in bls_sector_levels:
    for my_onet_level, onet_occupation_code_var, onet_occupation_title_var in zip(onet_levels, onet_occupation_code_vars, onet_occupation_title_vars):
        # Only the 'detailed' O*NET level is analysed; every other level is skipped.
        if my_onet_level != 'detailed':
                    continue
        
        # Get occupation data
        occupation_analysis = create_occupation_analysis(my_sector, my_onet_level,
                                                         merged_data, onet_occupation_code_var, onet_occupation_title_var)

        # Get master dataframe with all weighting schemes merged
        # (first argument is the seed slot; 0 is passed for this observed-data run)
        master_df = merge_industry_employment_shares_master(0,
                                                            my_sector, my_onet_level,
                                                            onet_occupation_code_var, onet_occupation_title_var,
                                                            weight_cols,
                                                            occupation_analysis)
        
        # Regress for different dependent vars and weighting schemes
        run_weighted_regression(my_sector, my_onet_level,
                                master_df, 
                                dependent_var_list,
                                weight_cols)
        
        # Regress exposure on AI ability
        regress_exposure_on_AIability(my_sector, my_onet_level,
                                    master_df,
                                    weight_cols,
                                    dependent_var='ai_fraction',
                                    regressor='human_E1_fraction')

In [None]:
# NOTE(review): `aa` is not defined anywhere in view, so this cell raises NameError.
# Presumably a deliberate stop so "Run All" halts here -- confirm, or remove the cell.
aa

In [None]:
# After the loops: combine all per-iteration regression outputs into a master file
import glob
import os

# `my_sector` / `my_onet_level` are the values left over from the last iteration
# of the analysis loop above; they only select the second folder to combine.
paths_list = ['regressions', f'regression_BLS{my_sector}_ONET{my_onet_level}']

# Model labels kept in the master file. Both spellings of the exposure/AI-ability
# label are accepted so result files written under either label are picked up.
kept_models = ['WLS', 'WLS_exposure_on_AIability', 'WLS_AIability_on_exposure']

# Display order of the entries: each column is ranked by its position in the
# corresponding configuration list (values without a rank sort last).
dependent_var_map = {'ai_fraction': 0, 'gpt4_E1_fraction': 2, 'human_E1_fraction': 1}
bls_map = {v: i for i, v in enumerate(bls_sector_levels)}
onet_map = {v: i for i, v in enumerate(onet_levels)}
weight_map = {v: i for i, v in enumerate(weight_cols)}
rank_maps = {
    'weight_col': weight_map,
    'dependent_var': dependent_var_map,
    'BLS_sector_level': bls_map,
    'ONET_level': onet_map,
}

for path in paths_list:
    reg_out_dir = f'{output_data_path}/{path}'
    os.makedirs(reg_out_dir, exist_ok=True)
    reg_files = sorted(glob.glob(os.path.join(reg_out_dir, 'reg_BLS*.csv')))

    # pd.concat raises on an empty list, so skip folders without regression files
    if not reg_files:
        print(f"No reg_BLS*.csv files found in {reg_out_dir}; nothing to combine.")
        continue

    combined = pd.concat([pd.read_csv(f) for f in reg_files], ignore_index=True)
    combined = combined.sort_values(by=['model', 'BLS_sector_level', 'ONET_level', 'weight_col'], ascending=True).reset_index(drop=True)

    combined = combined[combined['model'].isin(kept_models)].reset_index(drop=True)
    combined = combined.drop(columns=['model'])

    # Drop unimportant entries. 'plot_prefix' only exists in some result files;
    # dropping it unconditionally used to raise KeyError, which sent execution to
    # a fallback that skipped the 'major' filter and the ordering below.
    combined = combined.drop(columns=['plot_prefix'], errors='ignore')
    combined = combined[combined['ONET_level'] != 'major']
    # combined = combined[(combined['weight_col'] != 'ONE') & (combined['weight_col'] != 'occ_sectorEmpShare')].reset_index(drop=True)

    # Sort by the configured order, using only the ranking columns that are present
    sort_cols = [c for c in ["weight_col", "dependent_var", "BLS_sector_level", "ONET_level"]
                 if c in combined.columns]
    if sort_cols:
        combined = combined.sort_values(
            by=sort_cols,
            key=lambda col: col.map(rank_maps[col.name])
        )
    combined = combined.reset_index(drop=True)

    master_file = os.path.join(reg_out_dir, 'master_regressions.csv')
    combined.to_csv(master_file, index=False)
    print(f"Combined {len(reg_files)} regression files into {master_file}")

    

Combined 6 regression files into ../data/computed_objects/BLS_ONET_matchedEmpShares/regressions/master_regressions.csv
Combined 3 regression files into ../data/computed_objects/BLS_ONET_matchedEmpShares/regression_BLSsector_ONETdetailed/master_regressions.csv


## Placebo Test: Reshuffle Task-Occupation Assignment and Repeat the Same Analysis

In [None]:
# ---- Begin: multi-seed reshuffle + analysis (the number of seeds is set by n_seeds below) ----
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Columns that identify a unit (occupation). The reshuffle helpers below treat these
# columns as the unit's identity and every other column as a property of the task.
unit_cols = [
    "O*NET-SOC Code",
    "Occupation Title",
    "Base_SOC_Code",
    "Major_Group_Code",
    "Major_Group_Title",
    "Minor_Group_Code",
    "Minor_Group_Title",
    "Broad_Occupation_Code",
    "Broad_Occupation_Title",
    "Detailed_Occupation_Code",
    "Detailed_Occupation_Title"
]

def reshuffle_tasks_preserve_unit_counts(merged_df, seed, unit_cols):
    """
    Shuffle task records across units while preserving each unit's number of tasks.

    - merged_df: DataFrame with one row per task; it is not modified.
    - seed: random state passed to ``DataFrame.sample`` for the shuffle.
    - unit_cols: list of columns that define a unit (they remain the unit identity).
    - All other columns (including 'Task ID' and 'Task Title' and other task
      properties) are considered task properties and move with the task to a
      new unit.

    Units are laid out in order of first appearance, each repeated as many times
    as it has rows in ``merged_df``; the shuffled task rows are then placed next
    to them position by position.

    Returns a new DataFrame with the same columns, the same number of rows and a
    fresh 0..n-1 index. Rows whose unit columns contain missing values form
    their own unit (``dropna=False``) instead of being dropped, and an empty
    input yields an empty frame rather than an error.
    """
    # Task/property columns = all columns except unit columns
    task_columns = [c for c in merged_df.columns if c not in unit_cols]

    # Unit-level counts in order of appearance. dropna=False keeps units with
    # missing identifiers so the counts always add up to len(merged_df).
    unit_counts = (
        merged_df.groupby(unit_cols, sort=False, dropna=False)
        .size()
        .reset_index(name='n_tasks')
    )

    # Shuffled pool of task properties
    task_pool = merged_df[task_columns].sample(frac=1, random_state=seed).reset_index(drop=True)

    # Repeat each unit's identity row n_tasks times (one slot per task). This
    # replaces the per-unit Python loop and keeps the unit columns' dtypes.
    slot_labels = unit_counts.index.repeat(unit_counts['n_tasks'])
    unit_slots = unit_counts.loc[slot_labels, unit_cols].reset_index(drop=True)

    reshuffled = pd.concat([unit_slots, task_pool], axis=1)

    # Keep original column order
    return reshuffled[merged_df.columns]


def reshuffle_tasks_random_assignments(merged_df, seed, unit_cols):
    """
    Give every task row a unit drawn uniformly at random, with replacement, from
    the distinct units present in the data. Unlike the count-preserving
    reshuffle, the number of tasks per unit is free to change.

    - merged_df: DataFrame with task rows and unit-identifying columns in unit_cols
    - seed: integer seed for the NumPy random state
    - unit_cols: list of columns that define a unit

    Returns a copy of merged_df (same columns, same order) in which only the
    unit columns have been overwritten. If there are no units, the copy is
    returned unchanged.
    """
    result = merged_df.copy()

    # Distinct unit identities, in order of first appearance
    distinct_units = merged_df[unit_cols].drop_duplicates()
    unit_total = len(distinct_units)
    if unit_total == 0:
        return result

    # One draw per task row; position k in `draws` picks the unit for row k
    draws = np.random.RandomState(seed).randint(0, unit_total, size=len(result))

    # Overwrite each unit column with the values of the drawn units
    for col in unit_cols:
        result[col] = distinct_units[col].to_numpy()[draws]

    return result[merged_df.columns]

# Where to save per-seed outputs (a seed_<k> subfolder is created under this for every run)
base_output_dir = output_data_path  # uses the notebook's output path by default
seed_root = os.path.join(base_output_dir, "seed_shuffles")
os.makedirs(seed_root, exist_ok=True)

# Option: preserve the number of tasks per unit (occupation) or not.
# Set to True to preserve counts (original behavior). Set to False to allow
# arbitrary reassignment of tasks to units (unit counts will change).
preserve_unit_counts = True  # <-- change this to False to use non-preserving reshuffle
print(f"Reshuffle mode: preserve_unit_counts={preserve_unit_counts}")

# Model labels kept in each master file. Both spellings of the exposure/AI-ability
# label are accepted so result files written under either label are picked up.
_KEPT_MODELS = ['WLS', 'WLS_exposure_on_AIability', 'WLS_AIability_on_exposure']


def _run_pipeline_into(task_data, seed, out_dir):
    """Run the notebook's analysis steps on `task_data`, writing all outputs under `out_dir`.

    Mirrors the main analysis loop of this notebook: occupation summary ->
    merged weights -> weighted regressions -> exposure/AI-ability regression,
    for the 'detailed' O*NET level only. The pipeline functions write to the
    module-level `output_data_path`, so it is pointed at `out_dir` for the
    duration of the run and always restored afterwards, even on failure.
    """
    global output_data_path
    previous_output_data_path = output_data_path
    output_data_path = out_dir
    try:
        for my_sector in bls_sector_levels:
            for my_onet_level, code_var, title_var in zip(onet_levels, onet_occupation_code_vars, onet_occupation_title_vars):
                if my_onet_level != 'detailed':
                    continue

                occupation_analysis = create_occupation_analysis(my_sector, my_onet_level,
                                                                 task_data, code_var, title_var)
                master_df = merge_industry_employment_shares_master(seed,
                                                                    my_sector, my_onet_level,
                                                                    code_var, title_var,
                                                                    weight_cols,
                                                                    occupation_analysis)
                run_weighted_regression(my_sector, my_onet_level,
                                        master_df,
                                        dependent_var_list,
                                        weight_cols)
                regress_exposure_on_AIability(my_sector, my_onet_level,
                                              master_df,
                                              weight_cols,
                                              dependent_var='ai_fraction',
                                              regressor='human_E1_fraction')
    finally:
        output_data_path = previous_output_data_path


def _combine_seed_regressions(out_dir, drop_major):
    """Merge every reg_BLS*.csv found under `out_dir` into regressions/master_regressions.csv.

    Returns (master_file_path, n_rows), or None when no regression file exists.
    """
    reg_files = sorted(glob.glob(os.path.join(out_dir, '**', 'reg_BLS*.csv'), recursive=True))
    if not reg_files:
        return None

    combined = pd.concat([pd.read_csv(f) for f in reg_files], ignore_index=True)
    combined = combined.sort_values(by=['model', 'BLS_sector_level', 'ONET_level', 'weight_col'], ascending=True).reset_index(drop=True)
    combined = combined[combined['model'].isin(_KEPT_MODELS)].reset_index(drop=True)
    if 'plot_prefix' in combined.columns:
        combined = combined.drop(columns=['plot_prefix'])
    if drop_major and 'ONET_level' in combined.columns:
        combined = combined[combined['ONET_level'] != 'major']

    reg_dir = os.path.join(out_dir, 'regressions')
    os.makedirs(reg_dir, exist_ok=True)
    master_file = os.path.join(reg_dir, 'master_regressions.csv')
    combined.to_csv(master_file, index=False)
    return master_file, len(combined)


# ------------------ Run the full pipeline on the ORIGINAL (observed) dataset first and save under seed_0 ------------------
# The previous version of this section called merge_industry_employment_shares(...) and
# plot_weighted_regression_and_binned_scatter(...), zipped over plot/title lists that are
# commented out above, and passed regress_exposure_on_AIability arguments that do not match
# its signature. It now goes through the same functions as the main analysis loop.
print("Running full observed (original) pipeline and saving outputs under seed_shuffles/seed_0 ...")
seed0_output = os.path.join(seed_root, 'seed_0')
os.makedirs(seed0_output, exist_ok=True)
_run_pipeline_into(merged_data, 0, seed0_output)

# Combine observed regression outputs generated under seed_0 into seed_0/regressions/master_regressions.csv
observed_summary = _combine_seed_regressions(seed0_output, drop_major=False)
if observed_summary is None:
    print("Warning: No reg_BLS*.csv files found under seed_0 outputs to combine for the observed run.")
else:
    master_file_obs, n_rows_obs = observed_summary
    print(f"Saved observed master_regressions under seed_0 -> {master_file_obs} ({n_rows_obs} rows)")
# ------------------ End observed-to-seed0 pipeline ------------------

# Now run reshuffles starting from seed 1
n_seeds = 1000
np.random.seed(42)  # fixes NumPy's global random state; the seed list itself is deterministic
seeds = list(range(1, n_seeds + 1))

# Keep track of per-seed master_regressions file paths produced in this run
seed_master_files = []

# Run the pipeline for each seed
for seed in tqdm(seeds, desc="Seeds"):
    seed_output_data_path = os.path.join(seed_root, f"seed_{seed}")
    seed_master_file = os.path.join(seed_output_data_path, 'regressions', 'master_regressions.csv')

    # A seed counts as done only once its master file exists. Checking for the
    # folder alone meant a seed that failed half-way was skipped on every rerun.
    if os.path.exists(seed_master_file):
        continue  # skip already-done seeds
    os.makedirs(seed_output_data_path, exist_ok=True)

    try:
        # 1) Build the task data for this seed.
        # NOTE(review): the task reshuffle is currently switched off, so every seed
        # runs on an unmodified copy of merged_data. To re-enable it:
        #   if preserve_unit_counts:
        #       reshuffled = reshuffle_tasks_preserve_unit_counts(merged_data, seed=seed, unit_cols=unit_cols)
        #   else:
        #       reshuffled = reshuffle_tasks_random_assignments(merged_data, seed=seed, unit_cols=unit_cols)
        reshuffled = merged_data.copy()

        # 2) + 3) Run the analysis with outputs redirected to this seed's folder
        _run_pipeline_into(reshuffled, seed, seed_output_data_path)
    except Exception as e:
        print(f"[seed {seed}] ERROR during seed processing: {e}")
        continue

    # 4) Combine this seed's regression files into its own master_regressions.csv
    try:
        seed_summary = _combine_seed_regressions(seed_output_data_path, drop_major=True)
        if seed_summary is None:
            print(f"[seed {seed}] No reg files found under {seed_output_data_path}; skipping combine.")
        else:
            master_file, _ = seed_summary
            seed_master_files.append(master_file)
            if seed % 50 == 0:
                print(f"[seed {seed}] Combined regressions -> {master_file}")
    except Exception as e:
        print(f"[seed {seed}] Failed to combine regression files: {e}")

# ---- Collect coefficients across seeds and plot histograms (one plot per regression id) ----

# Locate the observed master_regressions.csv and load it into `observed_df`.
# Preference order:
#   1. the seed_0 run saved under seed_root;
#   2. any master_regressions.csv below base_output_dir whose path does not
#      contain 'seed_shuffles'.
seed0_master = os.path.join(seed_root, 'seed_0', 'regressions', 'master_regressions.csv')
if os.path.exists(seed0_master):
    observed_master_file = seed0_master
else:
    # glob makes no ordering guarantee, so sort the matches to make the
    # fallback pick reproducible from run to run.
    observed_master_candidates = sorted(
        p
        for p in glob.glob(os.path.join(base_output_dir, '**', 'master_regressions.csv'), recursive=True)
        if 'seed_shuffles' not in p
    )
    observed_master_file = observed_master_candidates[0] if observed_master_candidates else None

if observed_master_file is None:
    # Later code decides whether to overlay the observed value by testing
    # `'observed_df' in locals()`. Drop any copy left over from an earlier
    # run of this cell so that check agrees with the warning below.
    globals().pop('observed_df', None)
    print("WARNING: Could not find an observed master_regressions.csv. Observed value will not be plotted.")
else:
    observed_df = pd.read_csv(observed_master_file)
    print(f"Found observed master file: {observed_master_file} ({len(observed_df)} rows)")

# Load each seed's master_regressions.csv, tag its rows with a 'seed' column,
# and stack everything into `all_seeds_df`.
seed_dfs = []
missing_seeds = []  # seeds whose master file does not exist; reported once below
for seed in seeds:
    f = os.path.join(seed_root, f"seed_{seed}", "regressions", "master_regressions.csv")
    if not os.path.exists(f):
        missing_seeds.append(seed)
        continue
    try:
        d = pd.read_csv(f)
    except Exception as e:
        # Best effort: an unreadable file is reported and skipped.
        print(f"[seed {seed}] failed to read {f}: {e}")
    else:
        d['seed'] = seed
        seed_dfs.append(d)

# One summary line instead of one line per missing seed, so a run with many
# missing files does not flood the output.
if missing_seeds:
    preview = ", ".join(str(s) for s in missing_seeds[:10])
    suffix = ", ..." if len(missing_seeds) > 10 else ""
    print(
        f"master_regressions.csv not found for {len(missing_seeds)} seed(s) "
        f"under {seed_root}: {preview}{suffix}"
    )

if not seed_dfs:
    raise RuntimeError("No per-seed master_regressions found; aborting histogram plotting. Check that the seed runs produced regressions.")

all_seeds_df = pd.concat(seed_dfs, ignore_index=True)

# Choose the coefficient column (`coef_col`) whose values will be histogrammed.
# Known names are tried first, then any column whose name starts with 'coef'.
# The column has to exist in all_seeds_df: a name found only in the observed
# file cannot be used to read per-seed values, so that case is reported as an
# error here instead of surfacing later as a KeyError.
preferred_coef_cols = ('coef_num_tasks', 'coef_human_E1_fraction')
candidates = [c for c in preferred_coef_cols if c in all_seeds_df.columns]
if not candidates:
    candidates = [c for c in all_seeds_df.columns if str(c).startswith('coef')]
if not candidates:
    error_msg = "Could not find a coefficient column in per-seed master_regressions or observed master. Look for 'coef_num_tasks' or other 'coef_*' columns."
    if 'observed_df' in locals():
        observed_only = [c for c in observed_df.columns if str(c).startswith('coef')]
        if observed_only:
            error_msg = (
                f"Observed master has coefficient column(s) {observed_only}, but the per-seed "
                "master_regressions contain no 'coef*' column, so there are no per-seed values to plot."
            )
    raise RuntimeError(error_msg)
coef_col = candidates[0]

# Work out which columns (`key_cols`) identify a single regression.
candidate_keys = ['BLS_sector_level', 'ONET_level', 'dependent_var', 'weight_col']
seed_columns = set(all_seeds_df.columns)

# Start from the candidate keys that the per-seed table actually carries.
key_cols = [name for name in candidate_keys if name in seed_columns]

# When an observed table was loaded, narrow to the keys it also carries,
# unless that would leave nothing to match on.
if 'observed_df' in locals():
    shared_keys = [name for name in key_cols if name in observed_df.columns]
    if shared_keys:
        key_cols = shared_keys

# Last fallback kept from the original flow; both names are already among
# candidate_keys, so this can only re-select columns checked above.
if not key_cols:
    key_cols = [name for name in ('dependent_var', 'weight_col') if name in seed_columns]

print(f"Using coef column '{coef_col}' and keys {key_cols} to identify regressions.")

Reshuffle mode: preserve_unit_counts=True
Running full observed (original) pipeline and saving outputs under seed_shuffles/seed_0 ...
Saved observed master_regressions under seed_0 -> ../data/computed_objects/BLS_ONET_matchedEmpShares/seed_shuffles/seed_0/regressions/master_regressions.csv (9 rows)


Seeds: 100%|██████████| 1000/1000 [00:07<00:00, 128.10it/s]


Found observed master file: ../data/computed_objects/BLS_ONET_matchedEmpShares/seed_shuffles/seed_0/regressions/master_regressions.csv (9 rows)
Using coef column 'coef_num_tasks' and keys ['BLS_sector_level', 'ONET_level', 'dependent_var', 'weight_col'] to identify regressions.


In [None]:
# Plot one histogram of the per-seed coefficients for each regression and,
# when an observed master file was loaded, overlay the observed coefficient.
# Reads all_seeds_df, coef_col and key_cols (plus observed_df if it exists);
# writes one PNG per regression under <base_output_dir>/shuffle_histograms.

# Fixed x-axis window applied to every plot that shows an observed value.
HIST_XLIM = (-0.004, 0.004)


def _rows_matching(frame, key_row, cols):
    """Return a boolean Series marking rows of *frame* that equal *key_row* on every column in *cols*.

    With an empty *cols* every row matches. If *frame* lacks one of the
    columns, no row matches.
    """
    selected = pd.Series(True, index=frame.index)
    for col in cols:
        if col not in frame.columns:
            return pd.Series(False, index=frame.index)
        selected &= frame[col] == key_row[col]
    return selected


# Prepare output dir for combined histograms
hist_out_dir = os.path.join(base_output_dir, "shuffle_histograms")
os.makedirs(hist_out_dir, exist_ok=True)

# `observed_df` exists only when an observed master file was found earlier.
has_observed = 'observed_df' in locals()

# Decide which regressions to plot: those listed in the observed table when it
# carries every key column, otherwise the union seen across seeds.
if not key_cols:
    # No key columns: a single key-less row yields one histogram over everything.
    unique_keys_df = pd.DataFrame([{}])
else:
    observed_has_keys = has_observed and all(c in observed_df.columns for c in key_cols)
    keys_source = observed_df if observed_has_keys else all_seeds_df
    unique_keys_df = keys_source[key_cols].drop_duplicates().reset_index(drop=True)

plots_created = 0
for _, key_row in unique_keys_df.iterrows():
    label_parts = [f"{key_row[col]}" for col in key_cols]
    if key_cols:
        regression_id = "__".join(f"{col}-{str(key_row[col])}" for col in key_cols)
    else:
        regression_id = "all_regressions"

    seed_rows = _rows_matching(all_seeds_df, key_row, key_cols)
    coef_vals = all_seeds_df.loc[seed_rows, coef_col].dropna().astype(float).values

    if coef_vals.size == 0:
        print(f"Skipping {regression_id}: no coef values found across seeds.")
        continue

    out_file = os.path.join(hist_out_dir, f"hist_{regression_id}.png")

    # Plot single histogram that aggregates coefficients from all seeds for this regression
    fig = plt.figure(figsize=(9, 5))
    try:
        plt.hist(coef_vals, bins=min(30, max(5, int(len(coef_vals) / 2))), color='orange', edgecolor='k', alpha=0.9)
        plt.xlabel(coef_col)
        plt.ylabel('Count')
        title = f"{len(coef_vals)} Reshuffles: {' | '.join(label_parts) if len(label_parts)>0 else 'all'}\n\n(Randomized Weights, Fixed Task Assignment)"
        plt.title(title)

        # Overlay the observed coefficient as a dashed red line. The column
        # check covers both the keyed and the key-less case, so an observed
        # table without coef_col is skipped instead of raising KeyError.
        if has_observed and coef_col in observed_df.columns:
            obs_rows = _rows_matching(observed_df, key_row, key_cols)
            obs_series = observed_df.loc[obs_rows, coef_col].dropna().astype(float)
            if len(obs_series) > 0:
                # If several observed rows match, the first one is used.
                obs_val = float(obs_series.iloc[0])
                plt.axvline(obs_val, color='red', linestyle='--', lw=2, label='Observed')
                plt.axvline(0, color='black', linestyle='--', lw=1.5)
                # annotate percentile: share of seed coefficients strictly below the observed value
                percentile = (coef_vals < obs_val).mean() * 100.0
                plt.legend(loc = 'upper right', title=f'Observed (pct below: {percentile:.1f}% )')
                plt.xlim(*HIST_XLIM)

        plt.tight_layout()
        plt.savefig(out_file, dpi=300)
    finally:
        # Close the figure even if plotting or saving failed, so figures do
        # not accumulate across regressions.
        plt.close(fig)

    plots_created += 1
    print(f"Saved histogram for {regression_id} -> {out_file}")

print(f"Done: created {plots_created} histogram(s). Per-seed master_regressions (if produced) were stored under:\n  {seed_root}")
print(f"Histograms saved under:\n  {hist_out_dir}")

# ---- End cell ----

Saved histogram for BLS_sector_level-sector__ONET_level-detailed__dependent_var-human_E1_fraction__weight_col-occ_sectorEmpShare -> ../data/computed_objects/BLS_ONET_matchedEmpShares/shuffle_histograms/hist_BLS_sector_level-sector__ONET_level-detailed__dependent_var-human_E1_fraction__weight_col-occ_sectorEmpShare.png
Saved histogram for BLS_sector_level-sector__ONET_level-detailed__dependent_var-ai_fraction__weight_col-occ_sectorEmpShare -> ../data/computed_objects/BLS_ONET_matchedEmpShares/shuffle_histograms/hist_BLS_sector_level-sector__ONET_level-detailed__dependent_var-ai_fraction__weight_col-occ_sectorEmpShare.png
Saved histogram for BLS_sector_level-sector__ONET_level-detailed__dependent_var-ai_fraction__weight_col-occ_totalEmpShare -> ../data/computed_objects/BLS_ONET_matchedEmpShares/shuffle_histograms/hist_BLS_sector_level-sector__ONET_level-detailed__dependent_var-ai_fraction__weight_col-occ_totalEmpShare.png
Saved histogram for BLS_sector_level-sector__ONET_level-detailed__