#### By: Peyman Shahidi
#### Created: Oct 19, 2025
#### Last Edit: Dec 13, 2025

<br>

In [395]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [396]:
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
output_data_path = f'{input_data_path}/computed_objects/execTypeVaryingDWA_anthropicIndex'
output_plot_path = f"{main_folder_path}/writeup/plots/execTypeVaryingDWA"

In [397]:
# Create directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

## Set variables

In [398]:
# Number of reshuffles
n_shuffles = 1000


dependent_var = 'is_ai'
plot_title_variable = 'Task is AI'
# dependent_var = 'is_automated'
# plot_title_variable = 'Task is Automated'


TARGET_REGS = ['prev2_is_ai', 'prev_is_ai', 'next_is_ai', 'next2_is_ai']
SPECS = ['no_fe', 'fe_MajorGroup', 'fe_MinorGroup']

PLOT_TITLES = ['Task Before Previous Task', 'Previous Task', 'Next Task', 'Task After Next Task']

### Main Code

In [399]:
# Get list of DWAs with tasks in multiple occupations
dwa_list_path = f"{input_data_path}/computed_objects/similar_dwa_tasks/similarTasks"

# Read all CSV files
import glob
dwa_csv_files = glob.glob(os.path.join(dwa_list_path, "*.csv"))
print(f"Found {len(dwa_csv_files)} DWA CSV files.")

# Load them into DataFrames, skipping 1-row files
dwa_dfs = []
skipped_files_count = 0
for f in dwa_csv_files:
    df = pd.read_csv(f)
    if len(df) > 1: # Skip if DWA contains only one task
        dwa_dfs.append(df)
    else:
        skipped_files_count += 1
print(f"Skipped {skipped_files_count} DWA files with only one task.")
    

# Combine into one DataFrame
df_all = pd.concat(dwa_dfs, ignore_index=True)
repetitive_dwa_task_ids = df_all['Task ID'].unique().tolist()
repetitive_dwa_task_titles = df_all['Task Title'].unique().tolist()
print(f"Found {len(repetitive_dwa_task_ids)} tasks related to these DWAs.")

Found 2047 DWA CSV files.
Skipped 47 DWA files with only one task.
Found 13535 tasks related to these DWAs.


In [400]:
# Create dataframe with all tasks that have survived the DWA task similarity procedure
survived_tasks_count_df = df_all.groupby('DWA ID')['Task ID'].nunique().reset_index(name='num_tasks_survived')
survived_tasks_count_df

Unnamed: 0,DWA ID,num_tasks_survived
0,4.A.1.a.1.I01.D01,3
1,4.A.1.a.1.I01.D02,2
2,4.A.1.a.1.I01.D03,2
3,4.A.1.a.1.I01.D04,7
4,4.A.1.a.1.I02.D01,4
...,...,...
1995,4.A.4.c.3.I07.D01,9
1996,4.A.4.c.3.I07.D02,4
1997,4.A.4.c.3.I07.D03,10
1998,4.A.4.c.3.I07.D04,19


In [401]:
# Create a DWA-level dataset with number of tasks and occupations per DWA, as well as fraction of manual, automation, and augmentation tasks per DWA
merged_data = pd.read_csv(f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv")
merged_data['is_manual'] = merged_data['label'] == 'Manual'
merged_data['is_automation'] = merged_data['label'] == 'Automation'
merged_data['is_augmentation'] = merged_data['label'] == 'Augmentation'


# Merge back DWA ID and DWA Titles to the merged_data
dwa_task_mapping = pd.read_csv(f"{input_data_path}/computed_objects/similar_dwa_tasks/dwa_task_mapping.csv")
print(f'Length of merged_data before merging DWA info: {merged_data.shape[0]}')
merged_data = merged_data.merge(dwa_task_mapping, on=['Task ID', 'Task Title', 'O*NET-SOC Code', 'Occupation Title'], how='left')
print(f'Length of merged_data after merging DWA info: {merged_data.shape[0]}')


# Aggregate to get fractions
dwa_grouped = merged_data.groupby(['DWA ID', 'DWA Title']).agg(
    num_tasks = ('Task ID', 'nunique'),
    num_occupations = ('O*NET-SOC Code', 'nunique'),
    fraction_manual = ('is_manual', 'mean'),
    fraction_automation = ('is_automation', 'mean'),
    fraction_augmentation = ('is_augmentation', 'mean'),
).reset_index()
print(f"Created DWA-level dataset with {dwa_grouped.shape[0]} DWAs.")

# Keep only DWAs with variation in terms of execution type across occupations
dwa_grouped_filtered = dwa_grouped[
     (dwa_grouped['num_occupations'] > 1) & (dwa_grouped['fraction_manual'] > 0) & (dwa_grouped['fraction_manual'] < 1)
].copy()
display(dwa_grouped_filtered)

# Create list of DWAs with varying execution types
dwas_varying_exec_types_ids = dwa_grouped_filtered['DWA ID'].unique().tolist()
dwas_varying_exec_types_titles = dwa_grouped_filtered['DWA Title'].unique().tolist()
print(f"Identified {len(dwas_varying_exec_types_ids)} DWAs with varying execution types across occupations.")

# Merge back the number of tasks survived info
dwa_grouped_filtered = dwa_grouped_filtered.merge(survived_tasks_count_df, left_on='DWA ID', right_on='DWA ID', how='left')

# Save output
dwa_grouped_filtered.to_csv(f"{output_data_path}/dwas_varying_execution_types.csv", index=False)

Length of merged_data before merging DWA info: 17925
Length of merged_data after merging DWA info: 22267
Created DWA-level dataset with 2081 DWAs.


Unnamed: 0,DWA ID,DWA Title,num_tasks,num_occupations,fraction_manual,fraction_automation,fraction_augmentation
0,4.A.1.a.1.I01.D01,Review art or design materials.,6,4,0.83,0.00,0.17
2,4.A.1.a.1.I01.D03,Review production information to determine cos...,6,2,0.83,0.00,0.17
3,4.A.1.a.1.I01.D04,Study scripts to determine project requirements.,9,8,0.33,0.56,0.11
4,4.A.1.a.1.I02.D01,Read materials to determine needed actions.,4,4,0.75,0.25,0.00
5,4.A.1.a.1.I02.D02,Read maps to determine routes.,4,4,0.75,0.00,0.25
...,...,...,...,...,...,...,...
2066,4.A.4.c.3.I05.D03,"Purchase materials, equipment, or other resour...",22,20,0.95,0.00,0.05
2069,4.A.4.c.3.I05.D06,Purchase products or services.,15,10,0.93,0.07,0.00
2072,4.A.4.c.3.I06.D01,Prescribe treatments or therapies.,19,16,0.89,0.00,0.11
2074,4.A.4.c.3.I06.D03,Prescribe medications.,31,26,0.97,0.00,0.03


Identified 831 DWAs with varying execution types across occupations.


In [402]:
# Read the merged data
merged_data = pd.read_csv(f"{input_data_path}/computed_objects/ONET_Eloundou_Anthropic_GPT/ONET_Eloundou_Anthropic_GPT.csv")
merged_data = merged_data[['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'Task Title',
       'Task Position', 'Task Type', 
       'Major_Group_Code', 'Major_Group_Title', 
       'Minor_Group_Code', 'Minor_Group_Title',
       'Broad_Occupation_Code', 'Broad_Occupation_Title',
       'Detailed_Occupation_Code', 'Detailed_Occupation_Title',
       'gpt4_exposure', 'human_labels', 
       'automation', 'augmentation', 'label']]


# Create is_ai and is_automated flags in merged_data
merged_data['is_ai'] = merged_data['label'].isin(['Augmentation','Automation']).astype(int)
merged_data['is_automated'] = merged_data['label'].isin(['Automation']).astype(int)
merged_data['is_exposed'] = merged_data['human_labels'].isin(['E1']).astype(int)


# Step 1: Add occupation's number of tasks info
num_tasks_per_occupation = merged_data.groupby('O*NET-SOC Code')['Task ID'].nunique().reset_index()
num_tasks_per_occupation = num_tasks_per_occupation.rename(columns={'Task ID': 'num_tasks'})
merged_data = merged_data.merge(num_tasks_per_occupation, on='O*NET-SOC Code', how='left')


# Step 2: Create flags for previous/next tasks is AI within occupation groups
# Sort by occupation and position when possible
merged_data['Task Position'] = pd.to_numeric(merged_data['Task Position'], errors='coerce')
merged_data = merged_data.sort_values(['O*NET-SOC Code', 'Task Position']).reset_index(drop=True)
group_col = 'O*NET-SOC Code'

# Compute neighbor flags (prev/next) within occupation groups when possible
merged_data['prev_is_ai'] = 0
merged_data['prev2_is_ai'] = 0
merged_data['next_is_ai'] = 0
merged_data['next2_is_ai'] = 0
pos_col = 'Task Position'

def add_neighbor_flags(df):
    df = df.copy()
    df['Task Position'] = pd.to_numeric(df['Task Position'], errors='coerce')
    df = df.sort_values(['O*NET-SOC Code','Task Position']).reset_index(drop=True)
    
    def _add_flags(g):
        g = g.sort_values('Task Position')
        # Don't fill NAs - leave them as NaN
        g['prev_is_ai'] = g['is_ai'].shift(1)
        g['prev2_is_ai'] = g['is_ai'].shift(2)
        g['next_is_ai'] = g['is_ai'].shift(-1)
        g['next2_is_ai'] = g['is_ai'].shift(-2)
        return g
    
    return df.groupby('O*NET-SOC Code', group_keys=False).apply(_add_flags).reset_index(drop=True)

# Apply the function
merged_data = merged_data.groupby(group_col, group_keys=False).apply(add_neighbor_flags).reset_index(drop=True)

# Drop rows where ANY neighbor flag is NA
neighbor_cols = ['prev_is_ai', 'prev2_is_ai', 'next_is_ai', 'next2_is_ai']
merged_data = merged_data.dropna(subset=neighbor_cols).reset_index(drop=True)

# Convert to int after dropping NAs
for col in neighbor_cols:
    merged_data[col] = merged_data[col].astype(int)



# Step 3: Add back DWA info
# Merge back DWA ID and DWA Titles to the merged_data
dwa_task_mapping = pd.read_csv(f"{input_data_path}/computed_objects/similar_dwa_tasks/dwa_task_mapping.csv")
merged_data = merged_data.merge(dwa_task_mapping, on=['Task ID', 'Task Title', 'O*NET-SOC Code', 'Occupation Title'], how='left')
# Note that the merge might map multiple DWAs to the same task


# Step 4: Flag "similar" tasks across occupations
merged_data['dwa_execType_varying'] = (
    (merged_data['DWA ID'].isin(dwas_varying_exec_types_ids)
    & 
    merged_data['Task ID'].isin(repetitive_dwa_task_ids)
    )
    & ~(merged_data['DWA ID'].isna())
).astype(int)



################################################
################################################
################################################
merged_data['dwa_execType_varying'] = (~(merged_data['DWA ID'].isna())).astype(int)
################################################
################################################
################################################



# Remove duplicates in terms of (O*NET-SOC Code, Task ID) if any
print(f'Length of merged_data before dropping duplicates: {merged_data.shape[0]}')
merged_data = merged_data.drop_duplicates(subset=['O*NET-SOC Code', 'Task ID'])
print(f'Length of merged_data after dropping duplicates: {merged_data.shape[0]}')
# Save the updated merged_data with flags
merged_data[merged_data['dwa_execType_varying'] == 1].to_csv(f"{output_data_path}/merged_data_DWAexecVaryingTypes.csv", index=False)


# Summary for flagged DWA rows
mask = merged_data['dwa_execType_varying'] == 1
n_flagged = int(mask.sum())
print(f'\nNumber of dwa_execType_varying rows: {n_flagged}')
if n_flagged > 0:
    for c in ['prev2_is_ai', 'prev_is_ai', 'next_is_ai', 'next2_is_ai']:
        s = int(merged_data.loc[mask, c].sum())
        frac = merged_data.loc[mask, c].mean()
        print(f'{c}: {s} of {n_flagged} flagged rows (fraction={frac:.3f})')
    try:
        display(merged_data.loc[mask].head())
    except Exception:
        print(merged_data.loc[mask].head().to_string(index=False))
else:
    print('No flagged rows to summarize.')


Length of merged_data before dropping duplicates: 17849
Length of merged_data after dropping duplicates: 14437

Number of dwa_execType_varying rows: 14112
prev2_is_ai: 1864 of 14112 flagged rows (fraction=0.132)
prev_is_ai: 1822 of 14112 flagged rows (fraction=0.129)
next_is_ai: 1784 of 14112 flagged rows (fraction=0.126)
next2_is_ai: 1783 of 14112 flagged rows (fraction=0.126)


Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Position,Task Type,Major_Group_Code,Major_Group_Title,Minor_Group_Code,Minor_Group_Title,...,is_automated,is_exposed,num_tasks,prev_is_ai,prev2_is_ai,next_is_ai,next2_is_ai,DWA ID,DWA Title,dwa_execType_varying
0,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,3,Core,11-0000,Management Occupations,11-1000,Top Executives,...,0,0,31,0,1,1,0,4.A.2.a.4.I07.D09,Analyze data to assess operational or project ...,1
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",4,Core,11-0000,Management Occupations,11-1000,Top Executives,...,0,0,31,1,0,0,0,4.A.4.a.2.I03.D14,Confer with organizational members to accompli...,1
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",6,Core,11-0000,Management Occupations,11-1000,Top Executives,...,0,0,31,0,1,1,0,4.A.2.b.1.I09.D01,Implement organizational process or policy cha...,1
6,11-1011.00,Chief Executives,8843,"Interpret and explain policies, rules, regulat...",7,Core,11-0000,Management Occupations,11-1000,Top Executives,...,0,1,31,0,0,0,0,4.A.4.a.1.I02.D03,Communicate organizational policies and proced...,1
8,11-1011.00,Chief Executives,8853,"Prepare bylaws approved by elected officials, ...",9,Supplemental,11-0000,Management Occupations,11-1000,Top Executives,...,0,0,31,0,1,0,0,4.A.2.b.4.I02.D01,Draft legislation or regulations.,1


In [403]:
## Remove rows with no DWA ID, and tasks which appear in only one DWA ID
print(f'Length of merged_data before filtering DWAs with tasks in single occupation: {merged_data.shape[0]}')
merged_data = merged_data[~merged_data['DWA ID'].isna()].reset_index(drop=True)
dwa_occupation_counts = merged_data.groupby('DWA ID')['O*NET-SOC Code'].nunique().reset_index()
dwa_occupation_counts = dwa_occupation_counts.rename(columns={'O*NET-SOC Code': 'num_occupations'})
dwas_to_keep = dwa_occupation_counts[dwa_occupation_counts['num_occupations'] > 1]['DWA ID'].unique().tolist()
merged_data = merged_data[merged_data['DWA ID'].isin(dwas_to_keep)].reset_index(drop=True)
print(f'Length of merged_data after filtering DWAs with tasks in single occupation: {merged_data.shape[0]}')


# Count what fraction of DWAs contain multiple tasks in an occupation
print(f"Length before filtering: {len(merged_data):,}")

# Group by DWA ID and O*NET-SOC Code and count unique Task IDs
flagged_dwa_occupations_ids = merged_data.groupby(['DWA ID', 'O*NET-SOC Code'])['Task ID'].nunique().reset_index()
flagged_dwa_occupations_ids.columns = ['DWA ID', 'O*NET-SOC Code', 'num_unique_task_ids']

# Display some statistics
print(f"\nDWA-Occupation pairs with >1 Task: {(flagged_dwa_occupations_ids['num_unique_task_ids'] > 1).sum():,}")
print(f"DWA-Occupation pairs with exactly 1 Task: {(flagged_dwa_occupations_ids['num_unique_task_ids'] == 1).sum():,}")

# Identify DWA-Occupation pairs to KEEP (those with only 1 unique Task ID)
single_task_pairs = flagged_dwa_occupations_ids[flagged_dwa_occupations_ids['num_unique_task_ids'] == 1][['DWA ID', 'O*NET-SOC Code']]

# Merge to filter - keep only rows that match the single-task pairs
merged_data_filtered = merged_data.merge(
    single_task_pairs,
    on=['DWA ID', 'O*NET-SOC Code'],
    how='inner'
)

print(f"\nLength after filtering: {len(merged_data_filtered):,}")
print(f"Rows removed: {len(merged_data) - len(merged_data_filtered):,} ({100*(len(merged_data) - len(merged_data_filtered))/len(merged_data):.1f}%)")

# Update merged_data
merged_data = merged_data_filtered

Length of merged_data before filtering DWAs with tasks in single occupation: 14437
Length of merged_data after filtering DWAs with tasks in single occupation: 13786
Length before filtering: 13,786

DWA-Occupation pairs with >1 Task: 1,956
DWA-Occupation pairs with exactly 1 Task: 8,769

Length after filtering: 8,769
Rows removed: 5,017 (36.4%)


## Run regression of multiple-execution-type DWA tasks against execution type of neighboring tasks

In [404]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy.stats import norm
from pathlib import Path
import os

# --- Configuration ---
TARGET_REGS = ['prev2_is_ai', 'prev_is_ai', 'next_is_ai', 'next2_is_ai']

# Labels to match your desired output format
VAR_LABELS = {
    'prev2_is_ai': '($t-2$) Task AI',
    'prev_is_ai': '($t-1$) Task AI',
    'next_is_ai': '($t+1$) Task AI',
    'next2_is_ai': '($t+2$) Task AI'
}

# ==========================================
# 1. Robust AME Extractor
# ==========================================
def build_ame_df(res, dataset_name, model_name, target_regs, fe_label, dwa_fe):
    try:
        # --- Calculate Model Statistics ---
        pr2 = res.prsquared
        k = res.params.shape[0]
        adj_pr2 = 1 - (res.llf - k) / res.llnull
        nobs = res.nobs

        # --- Calculate AME ---
        margeff = res.get_margeff(at='overall', method='dydx', dummy=True)
        summary = margeff.summary_frame()
        
        summary = summary.reset_index().rename(columns={'index': 'term'})
        
        rename_map = {
            'dy/dx': 'ame_coef',
            'std err': 'ame_se', 'Std. Err.': 'ame_se',
            'P>|z|': 'p_value', 'z': 'z_score'
        }
        summary = summary.rename(columns=rename_map)
        
        if 'ame_se' not in summary.columns and summary.shape[1] >= 2:
             summary['ame_se'] = summary.iloc[:, 1]

        summary = summary[summary['term'].isin(target_regs)].copy()
        if summary.empty: return pd.DataFrame()

        # --- Manual P-Value Calculation ---
        summary['ame_coef'] = pd.to_numeric(summary['ame_coef'], errors='coerce')
        summary['ame_se'] = pd.to_numeric(summary['ame_se'], errors='coerce')
        
        if 'p_value' not in summary.columns or summary['p_value'].isnull().any():
            z_stat = summary['ame_coef'] / summary['ame_se']
            summary['p_value'] = 2 * (1 - norm.cdf(np.abs(z_stat)))

        df = pd.DataFrame({
            'dataset': dataset_name,
            'model': model_name,
            'fe_label': fe_label,
            'dwa_fe': dwa_fe,
            'nobs': nobs,
            'r2_pseudo': pr2,
            'r2_adj_pseudo': adj_pr2,
            'term': summary['term'],
            'ame_coef': summary['ame_coef'],
            'ame_se': summary['ame_se'],
            'p_value': summary['p_value']
        })
        return df

    except Exception as e:
        print(f"Error calculating AME for {model_name}: {e}")
        return pd.DataFrame()

# ==========================================
# 2. Regression Runner (MODIFIED FOR DWA FE)
# ==========================================
def run_regressions_on(df, dataset_name, dependent_var, regressors):
    df = df.copy()
    all_cols = regressors + [dependent_var, 'is_exposed', 'num_tasks', 'DWA ID']
    existing_cols = [c for c in all_cols if c in df.columns]
    
    # Convert numeric columns but NOT DWA ID (keep it as categorical)
    numeric_cols = [c for c in existing_cols if c != 'DWA ID']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

    base_formula = f'{dependent_var} ~ ' + ' + '.join(regressors)
    ame_list = []
    models = {}

    # ==== WITHOUT DWA Fixed Effects ====
    
    # 1) No SOC FE, No DWA FE
    try:
        formula = base_formula + ' + is_exposed + num_tasks'
        res = smf.logit(formula, data=df).fit(
            disp=False, 
            cov_type='cluster',
            cov_kwds={'groups': df['DWA ID'],
                      'use_correction': True}
        )
        models['no_fe_no_dwa'] = res
        ame_list.append(build_ame_df(res, dataset_name, 'no_fe_no_dwa', regressors, 
                                     fe_label="None", dwa_fe=False))
        print(f"[{dataset_name}] No-FE, No-DWA model converged.")
    except Exception as e: 
        print(f"[{dataset_name}] No-FE, No-DWA failed: {e}")

    # 2) Major FE, No DWA FE
    if 'Major_Group_Code' in df.columns:
        try:
            formula = base_formula + ' + C(Major_Group_Code) + is_exposed + num_tasks'
            df_fe = df.groupby('Major_Group_Code').filter(
                lambda g: g[dependent_var].nunique() == 2# and len(g) >= 10
            )
            res = smf.logit(formula, data=df_fe).fit(
                disp=False, 
                cov_type='cluster',
                cov_kwds={'groups': df_fe['DWA ID'],
                          'use_correction': True}
            )
            models['major_fe_no_dwa'] = res
            ame_list.append(build_ame_df(res, dataset_name, 'major_fe_no_dwa', regressors, 
                                         fe_label="Major Group", dwa_fe=False))
            print(f"[{dataset_name}] Major FE, No-DWA model converged.")
        except Exception as e:
            print(f"[{dataset_name}] Major FE, No-DWA failed: {e}")

    # 3) Minor FE, No DWA FE
    if 'Minor_Group_Code' in df.columns:
        try:
            formula = base_formula + ' + C(Minor_Group_Code) + is_exposed + num_tasks'
            df_fe = df.groupby('Minor_Group_Code').filter(
                lambda g: g[dependent_var].nunique() == 2# and len(g) >= 10
            )
            res = smf.logit(formula, data=df_fe).fit(
                disp=False, 
                cov_type='cluster',
                cov_kwds={'groups': df_fe['DWA ID'],
                          'use_correction': True}
            )
            models['minor_fe_no_dwa'] = res
            ame_list.append(build_ame_df(res, dataset_name, 'minor_fe_no_dwa', regressors, 
                                         fe_label="Minor Group", dwa_fe=False))
            print(f"[{dataset_name}] Minor FE, No-DWA model converged.")
        except Exception as e:
            print(f"[{dataset_name}] Minor FE, No-DWA failed: {e}")

    # ==== WITH DWA Fixed Effects ====
    
    # 4) No SOC FE, With DWA FE
    try:
        formula = base_formula + ' + C(DWA_ID) + is_exposed + num_tasks'
        # Rename column to avoid spaces in formula
        df['DWA_ID'] = df['DWA ID']
        # Filter for DWAs with variation in dependent variable
        df_dwa = df.groupby('DWA_ID').filter(
            lambda g: g[dependent_var].nunique() == 2# and len(g) >= 5
        )
        res = smf.logit(formula, data=df_dwa).fit(
            disp=False, 
            cov_type='cluster',
            cov_kwds={'groups': df_dwa['DWA_ID'],
                      'use_correction': True}
        )
        models['no_fe_with_dwa'] = res
        ame_list.append(build_ame_df(res, dataset_name, 'no_fe_with_dwa', regressors, 
                                     fe_label="None", dwa_fe=True))
        print(f"[{dataset_name}] No-FE, With-DWA model converged.")
    except Exception as e: 
        print(f"[{dataset_name}] No-FE, With-DWA failed: {e}")

    # 5) Major FE, With DWA FE
    if 'Major_Group_Code' in df.columns:
        try:
            formula = base_formula + ' + C(Major_Group_Code) + C(DWA_ID) + is_exposed + num_tasks'
            df['DWA_ID'] = df['DWA ID']
            # Filter for both Major Group and DWA variation
            df_fe = df.groupby(['Major_Group_Code', 'DWA_ID']).filter(
                lambda g: len(g) >= 2
            ).groupby('Major_Group_Code').filter(
                lambda g: g[dependent_var].nunique() == 2# and len(g) >= 10
            ).groupby('DWA_ID').filter(
                lambda g: g[dependent_var].nunique() == 2# and len(g) >= 5
            )
            res = smf.logit(formula, data=df_fe).fit(
                disp=False, 
                cov_type='cluster',
                cov_kwds={'groups': df_fe['DWA_ID'],
                          'use_correction': True}
            )
            models['major_fe_with_dwa'] = res
            ame_list.append(build_ame_df(res, dataset_name, 'major_fe_with_dwa', regressors, 
                                         fe_label="Major Group", dwa_fe=True))
            print(f"[{dataset_name}] Major FE, With-DWA model converged.")
        except Exception as e:
            print(f"[{dataset_name}] Major FE, With-DWA failed: {e}")

    # 6) Minor FE, With DWA FE
    if 'Minor_Group_Code' in df.columns:
        try:
            formula = base_formula + ' + C(Minor_Group_Code) + C(DWA_ID) + is_exposed + num_tasks'
            df['DWA_ID'] = df['DWA ID']
            # Filter for both Minor Group and DWA variation
            df_fe = df.groupby(['Minor_Group_Code', 'DWA_ID']).filter(
                lambda g: len(g) >= 2
            ).groupby('Minor_Group_Code').filter(
                lambda g: g[dependent_var].nunique() == 2# and len(g) >= 10
            ).groupby('DWA_ID').filter(
                lambda g: g[dependent_var].nunique() == 2# and len(g) >= 5
            )
            res = smf.logit(formula, data=df_fe).fit(
                disp=False, 
                cov_type='cluster',
                cov_kwds={'groups': df_fe['DWA_ID'],
                          'use_correction': True}
            )
            models['minor_fe_with_dwa'] = res
            ame_list.append(build_ame_df(res, dataset_name, 'minor_fe_with_dwa', regressors, 
                                         fe_label="Minor Group", dwa_fe=True))
            print(f"[{dataset_name}] Minor FE, With-DWA model converged.")
        except Exception as e:
            print(f"[{dataset_name}] Minor FE, With-DWA failed: {e}")

    # Combine results
    combined = pd.concat(ame_list, ignore_index=True) if ame_list else pd.DataFrame()

    # Save results to CSV
    out_path = f'{output_data_path}/regression_summaries_{dependent_var}'
    os.makedirs(out_path, exist_ok=True)
    combined.to_csv(f'{out_path}/regression_ame_results_{dataset_name}.csv', index=False)

    return models, combined

# ==========================================
# 3. LaTeX Table Generator (MODIFIED FOR 6 COLUMNS)
# ==========================================
def generate_latex_table(df_results):
    if df_results.empty:
        print("No results to tabulate.")
        return

    # Filter for one dataset
    dataset_to_show = df_results['dataset'].unique()[0]
    subset = df_results[df_results['dataset'] == dataset_to_show].copy()
    
    print(f"\n% --- LaTeX Table for {dataset_to_show} ---")

    # --- Formatting ---
    def fmt(row):
        stars = ""
        p = row['p_value']
        if pd.notna(p):
            if p < 0.01: stars = "***"
            elif p < 0.05: stars = "**"
            elif p < 0.10: stars = "*"
        return f"{row['ame_coef']:.2f}{stars}", f"({row['ame_se']:.2f})"

    formatted = subset.apply(fmt, axis=1, result_type='expand')
    subset['coef_str'] = formatted[0]
    subset['se_str'] = formatted[1]

    # Pivot
    pivot_coef = subset.pivot(index='term', columns='model', values='coef_str')
    pivot_se = subset.pivot(index='term', columns='model', values='se_str')

    # Ordering
    valid_vars = [v for v in TARGET_REGS if v in pivot_coef.index]
    pivot_coef = pivot_coef.reindex(valid_vars)
    pivot_se = pivot_se.reindex(valid_vars)
    
    # Define the exact order you want
    model_order = ['no_fe_no_dwa', 'major_fe_no_dwa', 'minor_fe_no_dwa', 
                   'no_fe_with_dwa', 'major_fe_with_dwa', 'minor_fe_with_dwa']
    valid_models = [m for m in model_order if m in pivot_coef.columns]

    # Extract Footer Stats
    stats = subset[['model', 'nobs', 'r2_pseudo', 'r2_adj_pseudo', 'fe_label', 'dwa_fe']].drop_duplicates('model').set_index('model')

    # --- Print LaTeX ---
    col_def = "l" + "c" * len(valid_models) 
    
    print(f"\\begin{{tabular}}{{{col_def}}}")
    print(r"\toprule")
    
    # Header
    header_nums = [f"({i+1})" for i in range(len(valid_models))]
    print(f"Specification & " + " & ".join(header_nums) + r" \\")
    print(r"\midrule")

    # Body (Variables)
    for var in valid_vars:
        label = VAR_LABELS.get(var, var.replace('_', ' '))
        
        # Coefficient Row
        c_vals = [pivot_coef.loc[var, m] if m in pivot_coef.columns and pd.notna(pivot_coef.loc[var, m]) else "" 
                  for m in valid_models]
        print(f"{label} & " + " & ".join(c_vals) + r" \\")
        
        # SE Row
        s_vals = [pivot_se.loc[var, m] if m in pivot_se.columns and pd.notna(pivot_se.loc[var, m]) else "" 
                  for m in valid_models]
        print(f" & " + " & ".join(s_vals) + r" \\")
        print(r"\addlinespace")

    print(r"\midrule")
    
    # --- Footer ---
    
    # Pseudo R2
    r2_vals = [f"{stats.loc[m, 'r2_pseudo']:.3f}" if m in stats.index else "" for m in valid_models]
    print(f"Pseudo $R^2$ & " + " & ".join(r2_vals) + r" \\")
    
    # Adj Pseudo R2
    adj_r2_vals = [f"{stats.loc[m, 'r2_adj_pseudo']:.3f}" if m in stats.index else "" for m in valid_models]
    print(f"Adj. Pseudo $R^2$ & " + " & ".join(adj_r2_vals) + r" \\")
    
    # Observations
    obs_vals = [f"{int(stats.loc[m, 'nobs']):,}" if m in stats.index else "" for m in valid_models]
    print(f"Observations & " + " & ".join(obs_vals) + r" \\")

    # SOC Group Fixed Effects
    fe_vals = []
    for m in valid_models:
        if m in stats.index:
            label = stats.loc[m, 'fe_label']
            if pd.isna(label) or str(label) == "None":
                fe_vals.append("No")
            elif "Major" in str(label):
                fe_vals.append("Major")
            elif "Minor" in str(label):
                fe_vals.append("Minor")
            else:
                fe_vals.append(str(label)[:5])
        else:
            fe_vals.append("")

    print(f"SOC Group FE & " + " & ".join(fe_vals) + r" \\")
    
    # DWA Fixed Effects
    dwa_fe_vals = []
    for m in valid_models:
        if m in stats.index:
            dwa_fe = stats.loc[m, 'dwa_fe']
            dwa_fe_vals.append("Yes" if dwa_fe else "No")
        else:
            dwa_fe_vals.append("")

    print(f"DWA FE & " + " & ".join(dwa_fe_vals) + r" \\")
    
    print(r"\bottomrule")
    print(r"\footnotesize{Standard errors in parentheses (clustered at DWA level). *** p$<$0.01, ** p$<$0.05, * p$<$0.1}")
    print(r"\end{tabular}")

# ==========================================
# 4. Execution Block
# ==========================================
print(">>> Running Regressions on Filtered Data...")
filtered = merged_data[merged_data['dwa_execType_varying'] == 1].reset_index(drop=True)
models_filt, res_filt = run_regressions_on(filtered, 'filtered_0', dependent_var, TARGET_REGS)

# Generate Table
generate_latex_table(res_filt)

>>> Running Regressions on Filtered Data...
[filtered_0] No-FE, No-DWA model converged.
[filtered_0] Major FE, No-DWA model converged.
[filtered_0] Minor FE, No-DWA model converged.
[filtered_0] No-FE, With-DWA model converged.
[filtered_0] Major FE, With-DWA failed: Singular matrix
[filtered_0] Minor FE, With-DWA failed: Singular matrix

% --- LaTeX Table for filtered_0 ---
\begin{tabular}{lcccc}
\toprule
Specification & (1) & (2) & (3) & (4) \\
\midrule
($t-2$) Task AI & 0.07*** & 0.01 & -0.00 & 0.05* \\
 & (0.02) & (0.01) & (0.01) & (0.03) \\
\addlinespace
($t-1$) Task AI & 0.11*** & 0.04*** & 0.03** & 0.09*** \\
 & (0.02) & (0.01) & (0.01) & (0.03) \\
\addlinespace
($t+1$) Task AI & 0.12*** & 0.05*** & 0.04*** & 0.08*** \\
 & (0.03) & (0.02) & (0.02) & (0.03) \\
\addlinespace
($t+2$) Task AI & 0.04** & -0.01 & -0.02 & 0.04 \\
 & (0.02) & (0.01) & (0.01) & (0.03) \\
\addlinespace
\midrule
Pseudo $R^2$ & 0.113 & 0.185 & 0.176 & 0.223 \\
Adj. Pseudo $R^2$ & 0.111 & 0.176 & 0.148 & -0.

In [405]:
# Check if DWAs are nested within Minor SOC
nesting_check = merged_data.groupby('DWA ID')['Minor_Group_Code'].nunique()
print(f"DWAs appearing in multiple Minor SOCs: {(nesting_check > 1).sum()}")
print(f"DWAs in only one Minor SOC: {(nesting_check == 1).sum()}")

if (nesting_check == 1).all():
    print("\n⚠️ DWAs are perfectly nested within Minor SOC codes!")
    print("Including both sets of FE creates perfect collinearity.")


# Check variation after both FE
def check_variation_with_both_fe(df, dependent_var):
    # Create simplified group identifier
    df['group_id'] = df['Minor_Group_Code'].astype(str) + '_' + df['DWA ID'].astype(str)
    
    # Check variation within each Minor-DWA combination
    variation = df.groupby('group_id')[dependent_var].agg(['nunique', 'count', 'mean'])
    
    no_variation = variation[variation['nunique'] == 1]
    print(f"\nMinor-DWA groups with NO variation: {len(no_variation)}/{len(variation)}")
    print(f"Observations in groups with no variation: {no_variation['count'].sum()} out of {len(df)}")
    
    has_variation = variation[variation['nunique'] == 2]
    print(f"Minor-DWA groups WITH variation: {len(has_variation)}")
    print(f"Observations in groups with variation: {has_variation['count'].sum()}")
    
    return variation

variation_check = check_variation_with_both_fe(merged_data, dependent_var)


# Count the number of parameters
n_minor_groups = merged_data['Minor_Group_Code'].nunique()
n_dwas = merged_data['DWA ID'].nunique()
n_obs = len(merged_data)
n_other_vars = len(TARGET_REGS) + 2  # regressors + is_exposed + num_tasks

total_params = n_minor_groups + n_dwas + n_other_vars

print(f"\nNumber of observations: {n_obs:,}")
print(f"Number of parameters to estimate:")
print(f"  - Minor Group FE: {n_minor_groups}")
print(f"  - DWA FE: {n_dwas}")
print(f"  - Other variables: {n_other_vars}")
print(f"  - TOTAL: {total_params:,}")
print(f"\nRatio of parameters to observations: {total_params/n_obs:.2%}")

if total_params > n_obs * 0.1:
    print("⚠️ Warning: Too many parameters relative to sample size!")

DWAs appearing in multiple Minor SOCs: 1332
DWAs in only one Minor SOC: 483

Minor-DWA groups with NO variation: 4489/4869
Observations in groups with no variation: 7441 out of 8769
Minor-DWA groups WITH variation: 380
Observations in groups with variation: 1328

Number of observations: 8,769
Number of parameters to estimate:
  - Minor Group FE: 110
  - DWA FE: 1815
  - Other variables: 6
  - TOTAL: 1,931

Ratio of parameters to observations: 22.02%


In [406]:
# Check if DWAs are nested within Major SOC
nesting_check = merged_data.groupby('DWA ID')['Major_Group_Code'].nunique()
print(f"DWAs appearing in multiple Major SOCs: {(nesting_check > 1).sum()}")
print(f"DWAs in only one Major SOC: {(nesting_check == 1).sum()}")

if (nesting_check == 1).all():
    print("\n⚠️ DWAs are perfectly nested within Major SOC codes!")
    print("Including both sets of FE creates perfect collinearity.")


# Check variation after both FE
def check_variation_with_both_fe(df, dependent_var):
    # Create simplified group identifier
    df['group_id'] = df['Major_Group_Code'].astype(str) + '_' + df['DWA ID'].astype(str)
    
    # Check variation within each Major-DWA combination
    variation = df.groupby('group_id')[dependent_var].agg(['nunique', 'count', 'mean'])
    
    no_variation = variation[variation['nunique'] == 1]
    print(f"\nMajor-DWA groups with NO variation: {len(no_variation)}/{len(variation)}")
    print(f"Observations in groups with no variation: {no_variation['count'].sum()} out of {len(df)}")
    
    has_variation = variation[variation['nunique'] == 2]
    print(f"Major-DWA groups WITH variation: {len(has_variation)}")
    print(f"Observations in groups with variation: {has_variation['count'].sum()}")
    
    return variation

variation_check = check_variation_with_both_fe(merged_data, dependent_var)


# Count the number of parameters
n_major_groups = merged_data['Major_Group_Code'].nunique()
n_dwas = merged_data['DWA ID'].nunique()
n_obs = len(merged_data)
n_other_vars = len(TARGET_REGS) + 2  # regressors + is_exposed + num_tasks

total_params = n_major_groups + n_dwas + n_other_vars

print(f"\nNumber of observations: {n_obs:,}")
print(f"Number of parameters to estimate:")
print(f"  - Major Group FE: {n_major_groups}")
print(f"  - DWA FE: {n_dwas}")
print(f"  - Other variables: {n_other_vars}")
print(f"  - TOTAL: {total_params:,}")
print(f"\nRatio of parameters to observations: {total_params/n_obs:.2%}")

if total_params > n_obs * 0.1:
    print("⚠️ Warning: Too many parameters relative to sample size!")

DWAs appearing in multiple Major SOCs: 447
DWAs in only one Major SOC: 1368

Major-DWA groups with NO variation: 1974/2426
Observations in groups with no variation: 6008 out of 8769
Major-DWA groups WITH variation: 452
Observations in groups with variation: 2761

Number of observations: 8,769
Number of parameters to estimate:
  - Major Group FE: 22
  - DWA FE: 1815
  - Other variables: 6
  - TOTAL: 1,843

Ratio of parameters to observations: 21.02%


In [407]:
# how many minors per DWA?
(merged_data[merged_data['dwa_execType_varying'] == 1].groupby('DWA ID')['Minor_Group_Code'].nunique().value_counts().sort_index())

Minor_Group_Code
1     483
2     538
3     389
4     187
5     105
6      54
7      17
8      19
9       7
10      7
11      1
12      1
14      2
16      1
17      1
19      2
20      1
Name: count, dtype: int64

In [408]:
# how many majors per DWA?
(merged_data[merged_data['dwa_execType_varying'] == 1].groupby('DWA ID')['Major_Group_Code'].nunique().value_counts().sort_index())

Major_Group_Code
1    1368
2     347
3      56
4      33
5       6
6       1
7       4
Name: count, dtype: int64

In [410]:
df = merged_data[merged_data['dwa_execType_varying'] == 1].copy()
X = df.groupby(['Minor_Group_Code', 'DWA ID']).filter(
                lambda g: len(g) >= 2
            ).groupby('Minor_Group_Code').filter(
                lambda g: g[dependent_var].nunique() == 2# and len(g) >= 10
            ).groupby('DWA ID').filter(
                lambda g: g[dependent_var].nunique() == 2# and len(g) >= 5
            )
X

Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Position,Task Type,Major_Group_Code,Major_Group_Title,Minor_Group_Code,Minor_Group_Title,...,is_exposed,num_tasks,prev_is_ai,prev2_is_ai,next_is_ai,next2_is_ai,DWA ID,DWA Title,dwa_execType_varying,group_id
43,11-2011.00,Advertising and Promotions Managers,3240,"Assemble and communicate with a strong, divers...",10,Supplemental,11-0000,Management Occupations,11-2000,"Advertising, Marketing, Promotions, Public Rel...",...,0,21,0,0,1,0,4.A.4.a.4.I01.D04,Establish interpersonal business relationships...,1,11-0000_4.A.4.a.4.I01.D04
50,11-2011.00,Advertising and Promotions Managers,3236,Provide presentation and product demonstration...,17,Supplemental,11-0000,Management Occupations,11-2000,"Advertising, Marketing, Promotions, Public Rel...",...,1,21,0,0,1,0,4.A.4.b.6.I02.D02,Advise customers on technical or procedural is...,1,11-0000_4.A.4.b.6.I02.D02
55,11-2021.00,Marketing Managers,962,"Advise business or other groups on local, nati...",5,Supplemental,11-0000,Management Occupations,11-2000,"Advertising, Marketing, Promotions, Public Rel...",...,0,20,0,1,0,0,4.A.4.b.6.I05.D10,Advise others on business or operational matters.,1,11-0000_4.A.4.b.6.I05.D10
58,11-2021.00,Marketing Managers,950,"Develop pricing strategies, balancing firm obj...",13,Core,11-0000,Management Occupations,11-2000,"Advertising, Marketing, Promotions, Public Rel...",...,0,20,0,0,0,0,4.A.2.b.4.I01.D06,Determine pricing or monetary policies.,1,11-0000_4.A.2.b.4.I01.D06
64,11-2022.00,Sales Managers,4,Determine price schedules and discount rates.,5,Core,11-0000,Management Occupations,11-2000,"Advertising, Marketing, Promotions, Public Rel...",...,0,17,0,0,0,0,4.A.2.b.4.I01.D06,Determine pricing or monetary policies.,1,11-0000_4.A.2.b.4.I01.D06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8674,53-7061.00,Cleaners of Vehicles and Equipment,4992,"Inspect parts, equipment, or vehicles for clea...",5,Core,53-0000,Transportation and Material Moving Occupations,53-7000,Material Moving Workers,...,0,21,0,0,0,0,4.A.1.b.2.I04.D01,Inspect motor vehicles.,1,53-0000_4.A.1.b.2.I04.D01
8725,53-7071.00,Gas Compressor and Gas Pumping Station Operators,14600,Record instrument readings and operational cha...,3,Core,53-0000,Transportation and Material Moving Occupations,53-7000,Material Moving Workers,...,0,13,0,0,0,0,4.A.3.b.6.I08.D03,Record operational or production data.,1,53-0000_4.A.3.b.6.I08.D03
8727,53-7071.00,Gas Compressor and Gas Pumping Station Operators,14604,Take samples of gases and conduct chemical tes...,10,Core,53-0000,Transportation and Material Moving Occupations,53-7000,Material Moving Workers,...,0,13,0,0,0,0,4.A.1.b.2.I03.D01,"Test materials, solutions, or samples.",1,53-0000_4.A.1.b.2.I03.D01
8734,53-7072.00,"Pump Operators, Except Wellhead Pumpers",14622,"Test materials and solutions, using testing eq...",10,Core,53-0000,Transportation and Material Moving Occupations,53-7000,Material Moving Workers,...,0,14,0,0,0,0,4.A.1.b.2.I03.D01,"Test materials, solutions, or samples.",1,53-0000_4.A.1.b.2.I03.D01
