#### By: Peyman Shahidi
#### Created: Nov 7, 2025
#### Last Edit: Nov 10, 2025

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

# Display floats comma-separated with two digits after the decimal point, e.g. 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# NOTE(review): this silences every warning for the rest of the notebook, which presumably
# includes any warnings raised by the model fits further down -- consider narrowing the filter.
warnings.filterwarnings('ignore')
# Show up to 200 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 200)

In [2]:
# Base folder (the parent of the working directory); every other path hangs off it
main_folder_path = ".."
# Folder the input CSVs are read from
input_data_path = f"{main_folder_path}/data"
# Folder this notebook writes its CSV outputs to
output_data_path = f'{input_data_path}/computed_objects/contiguityMeasure'
# Folder this notebook writes its figures to
output_plot_path = f"{main_folder_path}/writeup/plots/contiguityMeasure"

In [3]:
# Create the output directories if they don't exist yet
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True makes this a no-op for folders that are already there and avoids
    # the check-then-create race of a separate os.path.exists() test.
    os.makedirs(path, exist_ok=True)

In [4]:
# One-sided window length (number of steps to look left/right of the focal task).
# The full window used below spans 2 * window_length + 1 tasks: the neighbours plus the task itself.
# The value is also embedded in the names of the result files and plot folders written further down.
window_length = 1

In [5]:
def create_occupation_analysis(df, onet_occupation_code_var, onet_occupation_title_var):
    """Summarise task labels at the occupation level.

    Rows of `df` are grouped by the (code, title) pair named by the two
    `*_var` arguments. For every group the function reports the number of
    distinct 'Task ID' values and the fraction of rows carrying each value of
    'label' (Manual / Augmentation / Automation), 'gpt4_exposure' (E0-E2) and
    'human_labels' (E0-E2). Fractions are taken over all rows of the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the two grouping columns plus 'Task ID', 'label',
        'gpt4_exposure' and 'human_labels'.
    onet_occupation_code_var, onet_occupation_title_var : str
        Names of the grouping columns; they are reused as output column names.

    Returns
    -------
    pandas.DataFrame
        One row per (code, title) group.
    """
    exposure_levels = ('E0', 'E1', 'E2')

    def share_of(tasks, column, value):
        # Fraction of the group's rows whose `column` equals `value`.
        return (tasks[column] == value).sum() / len(tasks)

    records = []
    for (code, title), tasks in df.groupby([onet_occupation_code_var, onet_occupation_title_var]):
        manual = share_of(tasks, 'label', 'Manual')
        augmentation = share_of(tasks, 'label', 'Augmentation')
        automation = share_of(tasks, 'label', 'Automation')
        gpt4 = {level: share_of(tasks, 'gpt4_exposure', level) for level in exposure_levels}
        human = {level: share_of(tasks, 'human_labels', level) for level in exposure_levels}

        record = {
            f'{onet_occupation_code_var}': code,
            f'{onet_occupation_title_var}': title,
            'num_tasks': tasks['Task ID'].nunique(),
            'manual_fraction': manual,
            # "AI" pools augmentation and automation
            'ai_fraction': augmentation + automation,
            'augmentation_fraction': augmentation,
            'automation_fraction': automation,
        }
        # "aiExposure" pools the E1 and E2 levels of each rater
        for rater, shares in (('gpt4', gpt4), ('human', human)):
            for level in exposure_levels:
                record[f'{rater}_{level}_fraction'] = shares[level]
            record[f'{rater}_aiExposure_fraction'] = shares['E1'] + shares['E2']
        records.append(record)

    return pd.DataFrame(records)

In [6]:
# Read the merged task-level data. The cells below rely on its 'O*NET-SOC Code', 'Occupation Title',
# 'Task Position', 'Task ID', 'human_labels' and 'label' columns.
merged_data = pd.read_csv(f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv")

In [7]:
# # Drop the supplemental tasks
# merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# # Drop rows whose Occupation Title includes 'Teachers, Postsecondary'
# merged_data = merged_data[~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary')].reset_index(drop=True)

## Calculate the contiguity measure, defined from whether the ±X tasks next to a task are AI-exposed (exposure measured via the E1 AI exposure label)

In [8]:
# Build the task-level frame used for the contiguity measure
# For the sake of this exercise we treat all AI tasks as a single category

# Occupation identifiers, and the within-occupation ordering used for neighbour look-ups
group_cols = ['O*NET-SOC Code', 'Occupation Title']
order_cols = group_cols + ['Task Position']

# is_exposed = 1 only for tasks labelled E1; tasks without a human label are treated as E0
cm_df = merged_data[order_cols + ['Task ID', 'human_labels']].copy()
cm_df['human_labels'] = cm_df['human_labels'].fillna('E0')
cm_df['is_exposed'] = cm_df['human_labels'].isin(['E1']).astype(int)

# Neighbours are defined by Task Position within each occupation
cm_df = cm_df.sort_values(order_cols)
g = cm_df.groupby(group_cols, sort=False)

# --- Exposure of the task i steps before / after, within the same occupation ---
# Steps that fall outside the occupation are filled with 0 (not exposed)
shifts = {}
for i in range(1, window_length + 1):
    shifts.update({
        f'previous_is_exposed_{i}': g['is_exposed'].shift(i).fillna(0).astype('int8'),
        f'next_is_exposed_{i}': g['is_exposed'].shift(-i).fillna(0).astype('int8'),
    })

cm_df = cm_df.assign(**shifts)

# Element-wise reducers over an arbitrary list of Series
def max_reduce(series_list, default=0):
    """Return the element-wise maximum of the Series in `series_list` as int8.

    With an empty list, a constant Series holding `default` is returned,
    indexed like the notebook-level `cm_df`.
    """
    if series_list:
        first, *others = series_list
        running = first.copy()
        for other in others:
            running = np.maximum(running, other)
        return running.astype('int8')
    return pd.Series(default, index=cm_df.index, dtype='int8')

def min_reduce(series_list, default=1):
    """Return the element-wise minimum of the Series in `series_list` as int8.

    With an empty list, a constant Series holding `default` is returned,
    indexed like the notebook-level `cm_df`.
    """
    if series_list:
        first, *others = series_list
        running = first.copy()
        for other in others:
            running = np.minimum(running, other)
        return running.astype('int8')
    return pd.Series(default, index=cm_df.index, dtype='int8')

# Collect the shifted neighbour flags: one Series per step on each side of the focal task
prev_cols = [cm_df[f'previous_is_exposed_{i}'] for i in range(1, window_length + 1)]
next_cols = [cm_df[f'next_is_exposed_{i}']     for i in range(1, window_length + 1)]

# 1) STRICT contiguity: entire block of size (2*W+1) is exposed
strict_block = min_reduce([cm_df['is_exposed']] + prev_cols + next_cols, default=1)
cm_df['is_contiguous_strict'] = strict_block  # 1 iff the focal task and every neighbor within the window are exposed

# 2) ANY-ON-BOTH-SIDES contiguity: at least one exposed on the left AND one on the right (within window)
#    The focal task itself must be exposed as well (its flag is AND-ed in below).
left_any  = max_reduce(prev_cols, default=0)
right_any = max_reduce(next_cols, default=0)
cm_df['is_contiguous_any_sides'] = (cm_df['is_exposed'].values & left_any.values & right_any.values).astype('int8')

# 3) Useful summaries: counts/shares in the window (incl. self)
#    The share divides by the full window size 2*window_length + 1.
neighbors_sum = sum(prev_cols + next_cols) if window_length > 0 else 0
cm_df['num_exposed_in_window']   = (neighbors_sum + cm_df['is_exposed']).astype('int16')
cm_df['share_exposed_in_window'] = cm_df['num_exposed_in_window'] / (2*window_length + 1)

# 4) Optional: length of the contiguous exposed block that includes the focal task
def exposed_block_len(s):
    """Return, per element of `s`, the length of the run of 1s it belongs to.

    Elements that are not equal to 1 get 0. The result is an int16 NumPy
    array with one entry per element of `s`.
    """
    # A new run starts wherever the value differs from the previous one
    starts_new_run = s.ne(s.shift())
    run_id = starts_new_run.cumsum()
    run_length = s.groupby(run_id).transform('size')
    return np.where(s == 1, run_length, 0).astype('int16')

# Length of the run of exposed tasks each task sits in (0 for non-exposed tasks), per occupation
cm_df['exposed_block_len'] = g['is_exposed'].transform(exposed_block_len)

# A single flag choosing one of the options:
cm_df['is_contiguous'] = cm_df['is_contiguous_strict']

# Sort within groups by Task Position (ascending)
cm_df = cm_df.sort_values(['O*NET-SOC Code', 'Occupation Title', 'Task Position']).reset_index(drop=True)

# Index within group (0-based) and group sizes
g = cm_df.groupby(['O*NET-SOC Code', 'Occupation Title'], sort=False)
cm_df['_i'] = g.cumcount()
cm_df['_n'] = g['Task Position'].transform('size')

# Keep only interior rows: window_length ... n-1-window_length
# (rows closer to either end had their missing neighbours filled with 0 when the shifts were built)
keep = (cm_df['_i'] >= window_length) & (cm_df['_i'] < cm_df['_n'] - window_length)
cm_df = cm_df.loc[keep].drop(columns=['_i','_n']).reset_index(drop=True)

# Keep only relevant columns
# (the contiguity flags, window count and block length computed above are dropped here)
cm_df = cm_df[['O*NET-SOC Code', 'Occupation Title', 'Task Position', 'Task ID', 'human_labels', 'share_exposed_in_window']]


# Create is_ai column: 1 for tasks labelled Automation or Augmentation, 0 otherwise
ai_df = merged_data[['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'label']].copy()
ai_df['is_ai'] = ai_df['label'].isin(['Automation', 'Augmentation']).astype(int)

# Merge with cm_df 
# NOTE(review): assumes (O*NET-SOC Code, Occupation Title, Task ID) is unique in merged_data;
# duplicated keys would multiply rows in this left merge -- TODO confirm.
cm_df = cm_df.merge(ai_df, on=['O*NET-SOC Code', 'Occupation Title', 'Task ID'], how='left')

# Save contiguity measure data
cm_df.to_csv(f"{output_data_path}/contiguityMeasure.csv", index=False)
# Display one occupation as a spot check
cm_df[cm_df['O*NET-SOC Code']=='15-1251.00']

Unnamed: 0,O*NET-SOC Code,Occupation Title,Task Position,Task ID,human_labels,share_exposed_in_window,label,is_ai
1959,15-1251.00,Computer Programmers,2,1277,E2,0.0,Manual,0
1960,15-1251.00,Computer Programmers,3,1276,E2,0.33,Automation,1
1961,15-1251.00,Computer Programmers,4,1273,E1,0.67,Augmentation,1
1962,15-1251.00,Computer Programmers,5,1270,E1,1.0,Automation,1
1963,15-1251.00,Computer Programmers,6,21049,E1,0.67,Manual,0
1964,15-1251.00,Computer Programmers,7,1269,E2,0.67,Augmentation,1
1965,15-1251.00,Computer Programmers,8,1268,E1,0.33,Manual,0
1966,15-1251.00,Computer Programmers,9,1275,E0,0.67,Augmentation,1
1967,15-1251.00,Computer Programmers,10,1267,E1,0.33,Automation,1
1968,15-1251.00,Computer Programmers,11,1272,E2,0.33,Automation,1


In [9]:
# Add back SOC levels for fixed effects in regressions
my_onet_level = 'detailed'
onet_occupation_code_var = 'Detailed_Occupation_Code'
onet_occupation_title_var = 'Detailed_Occupation_Title'

# Read the cleaned O*NET task file, which carries the SOC hierarchy columns for each occupation
ONET = pd.read_csv(f"{input_data_path}/computed_objects/ONET_cleaned_tasks.csv")

soc_merge_keys = ['O*NET-SOC Code', 'Occupation Title']
soc_level_cols = ['Major_Group_Code', 'Major_Group_Title',
                  'Minor_Group_Code', 'Minor_Group_Title',
                  'Broad_Occupation_Code', 'Broad_Occupation_Title',
                  'Detailed_Occupation_Code', 'Detailed_Occupation_Title']

# Keep exactly one mapping row per merge key. De-duplicating on the same columns that are
# used for the merge guarantees the left merge below cannot multiply task rows.
SOC_mappings = (ONET[soc_merge_keys + soc_level_cols]
                .drop_duplicates(subset=soc_merge_keys)
                .copy())

# Merge SOC levels with the task-level contiguity data.
# SOC columns left over from an earlier run of this cell are dropped first, so re-running
# the cell does not create suffixed (_x/_y) duplicates; validate= makes the
# many-tasks-to-one-mapping assumption explicit.
cm_df = (cm_df.drop(columns=soc_level_cols, errors='ignore')
         .merge(SOC_mappings, on=soc_merge_keys, how='left', validate='many_to_one'))

In [10]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Helper: compact coefficient table for selected variables
def coef_table(res, vars_):
    """Tabulate estimates for the terms listed in `vars_`.

    `res` is a fitted-results object exposing `params`, `bse`, `tvalues`,
    `pvalues` and `conf_int()`. The returned DataFrame has one row per entry
    of `vars_` and the columns coef, se, t, p, ci_low, ci_high, where 't'
    holds `res.tvalues` and the interval bounds come from `res.conf_int()`.
    """
    estimates = pd.DataFrame({
        'coef': res.params,
        'se': res.bse,
        't': res.tvalues,
        'p': res.pvalues,
    })
    bounds = res.conf_int().rename(columns={0: 'ci_low', 1: 'ci_high'})
    return estimates.join(bounds).loc[vars_]

# Terms whose estimates are printed for every model below
vars_of_interest = ['share_exposed_in_window']

# --- Model A: no FE; standard errors clustered by O*NET-SOC Code ---
mod_a = smf.logit(
    formula='is_ai ~ share_exposed_in_window',
    data=cm_df,
).fit(
    cov_type="cluster",
    cov_kwds={"groups": cm_df['O*NET-SOC Code'], "use_correction": True, "df_correction": True},
)
print("\n\n=== Model A — selected coefficients ===")
print(coef_table(mod_a, vars_of_interest))


# For fixed effects, some major and minor groups have no variation in is_ai and thus lead to singular matrices
# Remove those from the analysis
def sanitize_for_fe_logit(df, fe, y, xcols, min_obs=3):
    """Return the rows of `df` that are usable for a logit with `fe` fixed effects.

    Rows with a missing value in `fe`, `y` or any column of `xcols` are
    dropped first. Whole fixed-effect levels are then removed, in this order,
    when they have fewer than `min_obs` rows, when `y` takes a single value
    within the level, or when a regressor has zero (or undefined) variance
    within the level. The input frame is not modified.
    """
    def keep_levels(frame, level_ok):
        # level_ok: boolean Series indexed by FE level; keep rows of the levels flagged True
        return frame[frame[fe].isin(level_ok[level_ok].index)]

    d = df.dropna(subset=[fe, y] + xcols).copy()

    # 1) enough observations per FE level
    d = keep_levels(d, d.groupby(fe).size() >= min_obs)

    # 2) outcome must vary within the FE level (avoid complete separation by intercept)
    d = keep_levels(d, d.groupby(fe)[y].nunique() > 1)

    # 3) every regressor must vary within the FE level
    for x in xcols:
        d = keep_levels(d, d.groupby(fe)[x].var() > 0)

    return d

# Shared recipe for models B-E: L-BFGS logit with standard errors clustered on `cluster_col`
def _fit_clustered_logit(formula, data, cluster_col, maxiter):
    return smf.logit(formula, data=data).fit(
        method='lbfgs', maxiter=maxiter, disp=0,
        cov_type='cluster',
        cov_kwds={'groups': data[cluster_col],
                  'use_correction': True, 'df_correction': True})

# —— Major FE ——
df_major = sanitize_for_fe_logit(cm_df, 'Major_Group_Code', 'is_ai', ['share_exposed_in_window'])

mod_b = _fit_clustered_logit('is_ai ~ share_exposed_in_window + C(Major_Group_Code)',
                             df_major, 'Major_Group_Code', maxiter=200)
print("\n\n=== Model B — selected coefficients ===")
print(coef_table(mod_b, vars_of_interest))

# —— Minor FE ——
df_minor = sanitize_for_fe_logit(cm_df, 'Minor_Group_Code', 'is_ai', ['share_exposed_in_window'])

mod_c = _fit_clustered_logit('is_ai ~ share_exposed_in_window + C(Minor_Group_Code)',
                             df_minor, 'Minor_Group_Code', maxiter=300)
print("\n\n=== Model C — selected coefficients ===")
print(coef_table(mod_c, vars_of_interest))


# Sanity check: re-run the no-FE and Major-FE specifications on the Minor-FE sample (df_minor),
# still clustering by Minor_Group_Code
mod_d = _fit_clustered_logit('is_ai ~ share_exposed_in_window',
                             df_minor, 'Minor_Group_Code', maxiter=300)
print("\n\n=== Model A with minor_df — selected coefficients ===")
print(coef_table(mod_d, vars_of_interest))

mod_e = _fit_clustered_logit('is_ai ~ share_exposed_in_window + C(Major_Group_Code)',
                             df_minor, 'Minor_Group_Code', maxiter=300)
print("\n\n=== Model B with minor_df — selected coefficients ===")
print(coef_table(mod_e, vars_of_interest))

Optimization terminated successfully.
         Current function value: 0.370499
         Iterations 6


=== Model A — selected coefficients ===
                         coef   se     t    p  ci_low  ci_high
share_exposed_in_window  1.94 0.11 17.83 0.00    1.73     2.15


=== Model B — selected coefficients ===
                         coef   se    t    p  ci_low  ci_high
share_exposed_in_window  0.81 0.14 5.81 0.00    0.53     1.08


=== Model C — selected coefficients ===
                         coef   se    t    p  ci_low  ci_high
share_exposed_in_window  0.68 0.16 4.23 0.00    0.37     1.00


=== Model A with minor_df — selected coefficients ===
                         coef   se    t    p  ci_low  ci_high
share_exposed_in_window  1.84 0.22 8.28 0.00    1.41     2.28


=== Model B with minor_df — selected coefficients ===
                         coef   se    t    p  ci_low  ci_high
share_exposed_in_window  0.81 0.17 4.90 0.00    0.49     1.13


In [11]:
# Batch-run regressions over many reshuffled datasets and save coefficients
import os
results = []

# Reuse the same contiguity calculation as above but as a function
def compute_cm_df(merged, window_length, soc_mappings=None):
    """Build the task-level regression frame for one (observed or reshuffled) dataset.

    For every task, `share_exposed_in_window` is the share of E1-labelled
    tasks among the task itself and its `window_length` neighbours on each
    side, where neighbours are defined by 'Task Position' within an
    occupation. Tasks closer than `window_length` steps to either end of
    their occupation are dropped, so every kept task has a complete window.

    Only the columns that are returned are computed; intermediate contiguity
    flags that would be discarded anyway are not built.

    Parameters
    ----------
    merged : pandas.DataFrame
        Needs 'O*NET-SOC Code', 'Occupation Title', 'Task Position',
        'Task ID', 'human_labels' and 'label'.
    window_length : int
        One-sided window length; the full window has 2*window_length + 1 tasks.
    soc_mappings : pandas.DataFrame, optional
        Occupation-level columns to left-merge on ('O*NET-SOC Code',
        'Occupation Title'). When omitted, a notebook-level `SOC_mappings`
        is used if one exists; otherwise no mapping columns are added.

    Returns
    -------
    pandas.DataFrame
        Identifier columns, 'human_labels' (missing filled with 'E0'),
        'share_exposed_in_window', 'label', 'is_ai' (1 for Automation or
        Augmentation), plus any mapping columns.
    """
    key_cols = ['O*NET-SOC Code', 'Occupation Title']
    base_cols = key_cols + ['Task Position', 'Task ID', 'human_labels']

    tasks = merged[base_cols].copy()
    tasks['human_labels'] = tasks['human_labels'].fillna('E0')
    tasks['is_exposed'] = tasks['human_labels'].isin(['E1']).astype(int)
    tasks = tasks.sort_values(key_cols + ['Task Position']).reset_index(drop=True)

    by_occupation = tasks.groupby(key_cols, sort=False)

    # Number of exposed tasks in the window: the task itself plus each neighbour.
    # Neighbours that fall outside the occupation count as 0; those rows are dropped below.
    num_exposed = tasks['is_exposed'].copy()
    for step in range(1, window_length + 1):
        for offset in (step, -step):
            num_exposed = num_exposed + by_occupation['is_exposed'].shift(offset).fillna(0).astype(int)
    share = num_exposed / (2 * window_length + 1)

    # Keep only interior tasks: 0-based position in window_length ... n-1-window_length
    position = by_occupation.cumcount()
    group_size = by_occupation['Task Position'].transform('size')
    interior = (position >= window_length) & (position < group_size - window_length)

    cm_df = (tasks.loc[interior, base_cols]
             .assign(share_exposed_in_window=share[interior])
             .reset_index(drop=True))

    # Attach the task label and the binary AI indicator
    ai_df = merged[key_cols + ['Task ID', 'label']].copy()
    ai_df['is_ai'] = ai_df['label'].isin(['Automation', 'Augmentation']).astype(int)
    cm_df = cm_df.merge(ai_df, on=key_cols + ['Task ID'], how='left')

    # Backward-compatible fallback: use the notebook-level SOC_mappings when none is passed
    if soc_mappings is None:
        soc_mappings = globals().get('SOC_mappings')
    if soc_mappings is not None:
        cm_df = cm_df.merge(soc_mappings, on=key_cols, how='left')
    return cm_df

# Fit models and extract coefficients (robust to errors)
def fit_models_and_extract(cm_df):
    """Fit the five logit specifications and return their estimates for
    `share_exposed_in_window` as a flat dict.

    Specifications (prefix: formula, sample, cluster variable):
      mod_a: no FE, full sample, clustered by O*NET-SOC Code
      mod_b: Major FE, Major-sanitized sample, clustered by Major_Group_Code
      mod_c: Minor FE, Minor-sanitized sample, clustered by Minor_Group_Code
      mod_d: no FE, Minor-sanitized sample, clustered by Minor_Group_Code
      mod_e: Major FE, Minor-sanitized sample, clustered by Minor_Group_Code

    For each prefix the dict holds `<prefix>_coef`, `<prefix>_se` and
    `<prefix>_p`. A model that raises never aborts the batch: its three
    values are set to NaN and the message is stored under `<prefix>_error`.
    """
    term = 'share_exposed_in_window'
    out = {}
    minor_cache = {}

    def full_sample():
        return cm_df

    def major_sample():
        return sanitize_for_fe_logit(cm_df, 'Major_Group_Code', 'is_ai', [term])

    def minor_sample():
        # Built on first use and shared by models C, D and E; if building it raises,
        # nothing is cached and the next model retries (and records its own error).
        if 'df' not in minor_cache:
            minor_cache['df'] = sanitize_for_fe_logit(cm_df, 'Minor_Group_Code', 'is_ai', [term])
        return minor_cache['df']

    # (prefix, formula, sample getter, cluster column, extra fit() arguments)
    specs = [
        ('mod_a', 'is_ai ~ share_exposed_in_window',
         full_sample, 'O*NET-SOC Code', {}),
        ('mod_b', 'is_ai ~ share_exposed_in_window + C(Major_Group_Code)',
         major_sample, 'Major_Group_Code', {'method': 'lbfgs', 'maxiter': 200}),
        ('mod_c', 'is_ai ~ share_exposed_in_window + C(Minor_Group_Code)',
         minor_sample, 'Minor_Group_Code', {'method': 'lbfgs', 'maxiter': 300}),
        ('mod_d', 'is_ai ~ share_exposed_in_window',
         minor_sample, 'Minor_Group_Code', {'method': 'lbfgs', 'maxiter': 300}),
        ('mod_e', 'is_ai ~ share_exposed_in_window + C(Major_Group_Code)',
         minor_sample, 'Minor_Group_Code', {'method': 'lbfgs', 'maxiter': 300}),
    ]

    for prefix, formula, get_sample, cluster_col, fit_kwargs in specs:
        try:
            sample = get_sample()
            res = smf.logit(formula=formula, data=sample).fit(
                disp=0,
                cov_type='cluster',
                cov_kwds={'groups': sample[cluster_col],
                          'use_correction': True, 'df_correction': True},
                **fit_kwargs
            )
            out[f'{prefix}_coef'] = res.params.get(term, np.nan)
            out[f'{prefix}_se'] = res.bse.get(term, np.nan)
            out[f'{prefix}_p'] = res.pvalues.get(term, np.nan)
        except Exception as e:
            # Deliberate best-effort: keep the batch running and record what went wrong
            out[f'{prefix}_error'] = str(e)
            out[f'{prefix}_coef'] = np.nan
            out[f'{prefix}_se'] = np.nan
            out[f'{prefix}_p'] = np.nan

    return out



# Run the reshuffle analysis if the output file does not already exist
out_file = f"{output_data_path}/reshuffle_regression_results_win{window_length}.csv"
if os.path.exists(out_file):
    print('Reshuffle regression results file already exists at', out_file)
    print('Skipping reshuffle analysis to avoid overwriting existing results.')
    print('If you want to rerun the analysis, please delete the existing file first.')

else:
    # 1) Add the observed/original dataset results (merged_data exists earlier in the notebook)
    try:
        cm_obs = compute_cm_df(merged_data, window_length)
        row_obs = {'seed': 'observed'}
        row_obs.update(fit_models_and_extract(cm_obs))
        results.append(row_obs)
    except Exception as e:
        print('Failed to compute observed dataset results:', e)

    # 2) Loop over reshuffled files (attempt up to 1000) and collect results
    n_rep = 1000
    base_dir = f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/taskReshuffled_preserveCounts"
    n_missing = 0
    for seed in range(n_rep):
        if seed % 50 == 0:
            print(f'Iteration {seed+1} of {n_rep}')

        # Files are numbered from 1 (iter1, iter2, ...) while the stored 'seed' is the 0-based loop index
        fpath = os.path.join(base_dir, f'ONET_Eloundou_Anthropic_GPT_iter{seed+1}.csv')
        if not os.path.exists(fpath):
            # Skip missing files; only the first few are printed, the total is reported after the loop
            n_missing += 1
            if seed < 5:
                print(f'Missing {fpath} (skipping)')
            continue
        try:
            resh = pd.read_csv(fpath)
            cm = compute_cm_df(resh, window_length)
            row = {'seed': int(seed)}
            row.update(fit_models_and_extract(cm))
            results.append(row)
        except Exception as e:
            print(f'Failed seed {seed}: {e}')

    if n_missing:
        print(f'{n_missing} of {n_rep} reshuffled files were missing and skipped')

    # Save results to CSV for later plotting/comparison.
    # Nothing is written when no result was produced: an empty file would make the
    # exists-check above skip every later run.
    res_df = pd.DataFrame(results)
    if results:
        res_df.to_csv(out_file, index=False)
        print('Wrote regression results to', out_file)
        # A bare `res_df.head()` inside this branch would display nothing, so print it
        print(res_df.head())
    else:
        print('No regression results were produced; nothing written to', out_file)

Reshuffle regression results file already exists at ../data/computed_objects/contiguityMeasure/reshuffle_regression_results_win1.csv
Skipping reshuffle analysis to avoid overwriting existing results.
If you want to rerun the analysis, please delete the existing file first.


In [12]:
# Plot histogram of reshuffled coefficients and highlight observed value
import matplotlib.pyplot as plt

# Define models and their explanations
models = ['mod_a_coef', 'mod_b_coef', 'mod_c_coef', 'mod_d_coef', 'mod_e_coef']
model_explanation = ['No FE', 'Major FE', 'Minor FE', 'No FE (minor df)', 'Major FE (minor df)']
plot_colors = ['steelblue', 'orange', 'green', 'steelblue', 'orange']

# Read the saved results once (the file does not change between models) and split
# the observed row from the reshuffled draws
plot_df = pd.read_csv(f"{output_data_path}/reshuffle_regression_results_win{window_length}.csv")
if 'seed' in plot_df.columns:
    is_observed = plot_df['seed'].astype(str) == 'observed'
else:
    is_observed = pd.Series(False, index=plot_df.index)
reshuffled = plot_df[~is_observed]
obs_row = plot_df[is_observed]

save_path = f"{output_plot_path}/win{window_length}"
os.makedirs(save_path, exist_ok=True)

# Loop over models to create histograms
for model, explanation, plot_color in zip(models, model_explanation, plot_colors):
    resh_vals = pd.to_numeric(reshuffled[model], errors='coerce').dropna()
    # The observed estimate may be absent (no 'observed' row) or NaN (that model failed)
    obs_vals = pd.to_numeric(obs_row[model], errors='coerce').dropna()
    obs_val = float(obs_vals.iloc[0]) if len(obs_vals) else None

    fig, ax = plt.subplots(figsize=(8,5))
    ax.hist(resh_vals, bins=50, color=plot_color, edgecolor='black')
    if obs_val is not None:
        # Share of reshuffled coefficients at or below the observed one
        pct = (resh_vals <= obs_val).mean() * 100
        ax.axvline(obs_val, color='red', linestyle='--', linewidth=2, label=f'Observed (pctl={pct:.1f}%)')
        ax.legend()
    else:
        print(f'No observed value available for {model}; plotting the reshuffled distribution only')
    ax.set_xlabel('Regression Coefficient')
    ax.set_ylabel('Count')
    ax.set_title(f'Histogram of Reshuffled Task Assignments - {explanation} Regression (n={len(resh_vals)})\n\nWindow Length = {window_length*2 +1}, Dependent Var: is_ai')
    fig.tight_layout()
    # save plot
    fig.savefig(f'{save_path}/hist_win{window_length}_{model}.png', dpi=300)
    print('Saved plot to', save_path)
    plt.close(fig)

Saved plot to ../writeup/plots/contiguityMeasure/win1
Saved plot to ../writeup/plots/contiguityMeasure/win1
Saved plot to ../writeup/plots/contiguityMeasure/win1
Saved plot to ../writeup/plots/contiguityMeasure/win1
Saved plot to ../writeup/plots/contiguityMeasure/win1
