#### By: Peyman Shahidi
#### Created: Oct 22, 2025
#### Last Edit: Oct 22, 2025

<br>

In [40]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [41]:
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
output_data_path = f'{input_data_path}/computed_objects'
output_plot_path = f"{main_folder_path}/writeup/plots/taskSequence_vs_anthropicIndex"

In [42]:
# Create directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

In [43]:
# Read O*NET data
ONET = pd.read_csv(f'{input_data_path}/computed_objects/ONET_cleaned_tasks.csv')

# Drop  columns to avoid double counting
# Note: In ~4k instances, the same task is mapped to multiple DWAs
ONET = ONET.drop(columns=['DWA ID', 'DWA Title'])

# Remove duplicate rows
rows_before = len(ONET)
print(f"Number of rows before removing duplicates: {rows_before:,}")
ONET = ONET.drop_duplicates().reset_index(drop=True)
rows_after = len(ONET)
print(f"Number of rows after removing duplicates: {rows_after:,}")
print(f"Duplicates removed: {rows_before - rows_after}")

# Print length of dataset
print(f"Number of rows in ONET dataset: {len(ONET):,}")

ONET.head(5)

Number of rows before removing duplicates: 22,310
Number of rows after removing duplicates: 17,953
Duplicates removed: 4357
Number of rows in ONET dataset: 17,953


Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Type,Job Zone,Task_Time_Percentage,Hourly_Mean_Wage,Hourly_P10_Wage,Hourly_P25_Wage,...,Relevance,Base_SOC_Code,Major_Group_Code,Major_Group_Title,Minor_Group_Code,Minor_Group_Title,Broad_Occupation_Code,Broad_Occupation_Title,Detailed_Occupation_Code,Detailed_Occupation_Title
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,5,9.62,124.47,38.46,62.9,...,94.19,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",Core,5,9.49,124.47,38.46,62.9,...,98.79,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,5,9.22,124.47,38.46,62.9,...,100.0,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,5,10.26,124.47,38.46,62.9,...,95.84,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,5,1.46,124.47,38.46,62.9,...,90.47,11-1011,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,11-1011,Chief Executives


In [44]:
# Load GPTs are GPTs full label dataset
gpts_full_labels = pd.read_csv(f'{input_data_path}/GPTs-are-GPTs-main/data/full_labelset.tsv', sep="\t")

# Keep relevant columns only
gpts_full_labels = gpts_full_labels[['O*NET-SOC Code', 'Task ID', 'Task', 'Task Type', 'Title', 'gpt4_exposure', 'human_labels']]

# Convert Task ID to integer
gpts_full_labels['Task ID'] = gpts_full_labels['Task ID'].astype(int)

# Remove apostrophes for consistency
gpts_full_labels = gpts_full_labels.applymap(lambda x: x.replace("'", "") if isinstance(x, str) else x)

# Rename columns
gpts_full_labels = gpts_full_labels.rename(columns={
    'Task': 'Task Title',
    'Title': 'Occupation Title'
})

# Print length of dataset
print(f"Number of rows in GPTs full labels dataset: {len(gpts_full_labels):,}")


gpts_full_labels.head(5)

Number of rows in GPTs full labels dataset: 19,265


Unnamed: 0,O*NET-SOC Code,Task ID,Task Title,Task Type,Occupation Title,gpt4_exposure,human_labels
0,11-1011.00,8823,Direct or coordinate an organizations financia...,Core,Chief Executives,E2,E0
1,11-1011.00,8831,Appoint department heads or managers and assig...,Core,Chief Executives,E0,E0
2,11-1011.00,8825,Analyze operations to evaluate performance of ...,Core,Chief Executives,E2,E2
3,11-1011.00,8826,"Direct, plan, or implement policies, objective...",Core,Chief Executives,E2,E0
4,11-1011.00,8827,"Prepare budgets for approval, including those ...",Core,Chief Executives,E2,E2


In [45]:
# Merge with ONET dataset to get hierarchical codes and titles
ONET = ONET.merge(gpts_full_labels, on=['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'Task Title', 'Task Type'], how='left')

# Check how many tasks were not matched
unmatched_tasks = ONET[ONET['gpt4_exposure'].isna()]
print(f"Number of unmatched tasks: {len(unmatched_tasks):,}")

Number of unmatched tasks: 791


In [46]:
# Merge with Anthropic exposure data
anthropic_exposure = pd.read_csv(f'{input_data_path}/Anthropic_EconomicIndex/automation_vs_augmentation_by_task.csv')

# Remove if all entries are filtered:
anthropic_exposure = anthropic_exposure[anthropic_exposure['filtered'] != 1].reset_index(drop=True)

# Create new columns:
# Sum feedback loop and directive into Automation
# Sum validation, iteration, and learning into Augmentation
anthropic_exposure['automation'] = anthropic_exposure.apply(lambda row: row['feedback_loop'] + row['directive'], axis=1)
anthropic_exposure['augmentation'] = anthropic_exposure.apply(lambda row: row['validation'] + row['task_iteration'] + row['learning'], axis=1)

# Assign labels: take the max of automation, augmentation, manual and assign the corresponding label
def assign_label(row):
    max_value = max(row['automation'], row['augmentation'])
    if max_value == row['automation']:
        return 'Automation'
    elif max_value == row['augmentation']:
        return 'Augmentation'

anthropic_exposure['label'] = anthropic_exposure.apply(assign_label, axis=1)

# Filter to only keep the relevant columns
anthropic_exposure = anthropic_exposure[['task_name', 'automation', 'augmentation', 'label']]
anthropic_exposure

Unnamed: 0,task_name,automation,augmentation,label
0,act as advisers to student organizations.,0.38,0.36,Automation
1,act as an intermediary in negotiations between...,0.39,0.43,Augmentation
2,act as an intermediary in negotiations between...,0.39,0.38,Automation
3,act as liaisons between on-site managers or te...,0.56,0.00,Automation
4,adapt instructional content or delivery method...,0.29,0.70,Augmentation
...,...,...,...,...
2293,"write, prepare, and deliver statements for the...",0.00,0.79,Augmentation
2294,"write, present, and publish reports that recor...",0.51,0.47,Automation
2295,"write, review, or execute plans for testing ne...",0.40,0.00,Automation
2296,"write, review, or maintain engineering documen...",0.35,0.62,Augmentation


In [47]:
# Print number of unique tasks in ONET dataset
print(f"Number of unique tasks in ONET dataset: {ONET['Task Title'].nunique():,}")

# Print number of unique tasks in Anthropic exposure dataset
print(f"Number of unique tasks in Anthropic exposure dataset: {anthropic_exposure['task_name'].nunique():,}")


# Add normalized task title to ONET for merging
ONET["task_normalized"] = ONET["Task Title"].str.lower().str.strip()


# Merge ONET and Anthropic exposure data on the normalized task title
merged_data = pd.merge(ONET, anthropic_exposure[['task_name', 'automation', 'augmentation', 'label']], left_on="task_normalized", right_on="task_name", how="left")

# Fill the NaN values of the label column in merged dataset as Manual
merged_data['label'] = merged_data['label'].fillna('Manual')

# Print distribution after filling NaN values
print(f"\nDistribution of labels after filling NaN values with 'Manual':")
print(merged_data['label'].value_counts())
print(f"Total tasks: {len(merged_data):,}")

merged_data.head()

Number of unique tasks in ONET dataset: 16,913
Number of unique tasks in Anthropic exposure dataset: 2,298

Distribution of labels after filling NaN values with 'Manual':
label
Manual          15605
Augmentation     1626
Automation        722
Name: count, dtype: int64
Total tasks: 17,953


Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Type,Job Zone,Task_Time_Percentage,Hourly_Mean_Wage,Hourly_P10_Wage,Hourly_P25_Wage,...,Broad_Occupation_Title,Detailed_Occupation_Code,Detailed_Occupation_Title,gpt4_exposure,human_labels,task_normalized,task_name,automation,augmentation,label
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,5,9.62,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,,,direct or coordinate an organization's financi...,direct or coordinate an organization's financi...,0.35,0.57,Augmentation
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",Core,5,9.49,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,E0,E0,"confer with board members, organization offici...","confer with board members, organization offici...",0.25,0.61,Augmentation
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,5,9.22,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,E2,E2,analyze operations to evaluate performance of ...,analyze operations to evaluate performance of ...,0.31,0.66,Augmentation
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,5,10.26,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,E2,E0,"direct, plan, or implement policies, objective...",,,,Manual
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,5,1.46,124.47,38.46,62.9,...,Chief Executives,11-1011,Chief Executives,E2,E2,"prepare budgets for approval, including those ...",,,,Manual


In [48]:
# Read task sequence data
# Go through computed_objects/tasks_sequences and read all files and merge with original ONET data
# Get list of files in the directory
task_sequence_files = [f for f in os.listdir(f'{input_data_path}/computed_objects/tasks_sequences') if f.endswith('.csv')]
task_sequence = pd.concat([pd.read_csv(f'{input_data_path}/computed_objects/tasks_sequences/{file}') for file in task_sequence_files], ignore_index=True)

print(f"Task sequence data shape: {task_sequence.shape}")
print(f"Columns in task sequence: {list(task_sequence.columns)}")

# Merge task sequence data with merged_data
merged_data = pd.merge(merged_data, task_sequence[['O*NET-SOC Code', 'Task ID', 'Task Position']], on=['O*NET-SOC Code', 'Task ID'], how='left')
merged_data = merged_data[merged_data['Task Position'].notna()].reset_index(drop=True)


# Read occupation sequence data
# Go through computed_objects/occupation_sequences and read all files and merge with original ONET data
# Get list of files in the directory
occupation_sequence_files = [f for f in os.listdir(f'{input_data_path}/computed_objects/occupation_sequences') if f.endswith('.csv')]
occupation_sequence = pd.concat([pd.read_csv(f'{input_data_path}/computed_objects/occupation_sequences/{file}') for file in occupation_sequence_files], ignore_index=True)

print(f"Occupation sequence data shape: {occupation_sequence.shape}")
print(f"Columns in occupation sequence: {list(occupation_sequence.columns)}")

# Merge occupation sequence data with merged_data
merged_data = pd.merge(merged_data, occupation_sequence[['O*NET-SOC Code', 'Occupation Position']], on=['O*NET-SOC Code'], how='left')
merged_data = merged_data[merged_data['Occupation Position'].notna()].reset_index(drop=True)
merged_data.head(5)

Task sequence data shape: (17926, 5)
Columns in task sequence: ['Task Position', 'Task Title', 'Task ID', 'O*NET-SOC Code', 'Occupation Title']
Occupation sequence data shape: (718, 5)
Columns in occupation sequence: ['Occupation Position', 'Occupation Title', 'O*NET-SOC Code', 'Minor_Group_Code', 'Minor_Group_Code.1']


Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Type,Job Zone,Task_Time_Percentage,Hourly_Mean_Wage,Hourly_P10_Wage,Hourly_P25_Wage,...,Detailed_Occupation_Title,gpt4_exposure,human_labels,task_normalized,task_name,automation,augmentation,label,Task Position,Occupation Position
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,5,9.62,124.47,38.46,62.9,...,Chief Executives,,,direct or coordinate an organization's financi...,direct or coordinate an organization's financi...,0.35,0.57,Augmentation,12.0,1.0
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",Core,5,9.49,124.47,38.46,62.9,...,Chief Executives,E0,E0,"confer with board members, organization offici...","confer with board members, organization offici...",0.25,0.61,Augmentation,4.0,1.0
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,5,9.22,124.47,38.46,62.9,...,Chief Executives,E2,E2,analyze operations to evaluate performance of ...,analyze operations to evaluate performance of ...,0.31,0.66,Augmentation,3.0,1.0
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,5,10.26,124.47,38.46,62.9,...,Chief Executives,E2,E0,"direct, plan, or implement policies, objective...",,,,Manual,6.0,1.0
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,5,1.46,124.47,38.46,62.9,...,Chief Executives,E2,E2,"prepare budgets for approval, including those ...",,,,Manual,10.0,1.0


In [49]:
# # Reshuffle task assignments while preserving SOC hierarchy structure
# # Each occupation unit (O*NET-SOC Code + hierarchy levels) stays intact, but tasks are randomly reassigned
# # COMPLETELY UNRESTRICTED: Each task assigned to random occupation, no constraints on tasks per occupation

# # Create a copy of the data for reshuffling
# reshuffled_data = merged_data.copy()

# # Create SOC hierarchy levels with descriptive names from the O*NET-SOC Code
# reshuffled_data['soc_major_group'] = reshuffled_data['O*NET-SOC Code'].str[:2]
# reshuffled_data['soc_minor_group'] = reshuffled_data['O*NET-SOC Code'].str[:5] 
# reshuffled_data['soc_broad_occupation'] = reshuffled_data['O*NET-SOC Code'].str[:8]
# reshuffled_data['soc_detailed_occupation'] = reshuffled_data['O*NET-SOC Code']

# # Get unique occupation units (with all hierarchy levels intact)
# occupation_units = reshuffled_data[['O*NET-SOC Code', 'Occupation Title', 'soc_major_group', 
#                                    'soc_minor_group', 'soc_broad_occupation', 'soc_detailed_occupation']].drop_duplicates()

# print(f"Number of unique occupation units: {len(occupation_units):,}")

# # Get all unique tasks (each task appears only once)
# unique_tasks = reshuffled_data[['Task ID', 'Task Title', 'Task Type', 'gpt4_exposure', 
#                                'human_labels', 'task_normalized', 'task_name', 
#                                'automation', 'augmentation', 'label']].drop_duplicates()

# print(f"Number of unique tasks: {len(unique_tasks):,}")

# # Set random seed for reproducibility
# np.random.seed(42)

# # UNRESTRICTED ASSIGNMENT: Each task randomly assigned to any occupation unit
# # No constraints on how many tasks per occupation
# n_tasks = len(unique_tasks)
# n_occupations = len(occupation_units)

# # Random assignment: each task gets assigned to a completely random occupation
# random_occupation_indices = np.random.choice(n_occupations, size=n_tasks, replace=True)

# print(f"Assigning {n_tasks:,} tasks randomly across {n_occupations:,} occupation units")

# # Create the reshuffled dataset
# reshuffled_list = []

# for task_idx in range(n_tasks):
#     occ_idx = random_occupation_indices[task_idx]
    
#     # Get task information
#     task_row = unique_tasks.iloc[task_idx].to_dict()
    
#     # Get occupation information  
#     occ_row = occupation_units.iloc[occ_idx].to_dict()
    
#     # Combine them
#     combined_row = {**task_row, **occ_row}
#     reshuffled_list.append(combined_row)

# # Create the reshuffled DataFrame
# reshuffled_data = pd.DataFrame(reshuffled_list)

# print(f"Reshuffled dataset created with {len(reshuffled_data):,} rows")

# # Verify the reshuffling worked correctly
# print(f"\nOriginal dataset task distribution by occupation:")
# original_task_counts = merged_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
# print(f"Min tasks per occupation: {original_task_counts.min()}")
# print(f"Max tasks per occupation: {original_task_counts.max()}")
# print(f"Mean tasks per occupation: {original_task_counts.mean():.2f}")
# print(f"Std tasks per occupation: {original_task_counts.std():.2f}")

# print(f"\nReshuffled dataset task distribution by occupation:")
# reshuffled_task_counts = reshuffled_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
# print(f"Min tasks per occupation: {reshuffled_task_counts.min()}")
# print(f"Max tasks per occupation: {reshuffled_task_counts.max()}")
# print(f"Mean tasks per occupation: {reshuffled_task_counts.mean():.2f}")
# print(f"Std tasks per occupation: {reshuffled_task_counts.std():.2f}")

# # Count occupations with zero tasks
# zero_task_occupations = (reshuffled_task_counts == 0).sum()
# print(f"Occupations with zero tasks: {zero_task_occupations}")

# print(f"\nVerification:")
# print(f"Number of unique occupations preserved: {reshuffled_data['O*NET-SOC Code'].nunique() == merged_data['O*NET-SOC Code'].nunique()}")
# print(f"Number of unique tasks preserved: {reshuffled_data['Task ID'].nunique() == unique_tasks['Task ID'].nunique()}")
# print(f"Each task appears exactly once: {len(reshuffled_data) == len(unique_tasks)}")

# # Show SOC hierarchy column names
# print(f"\nSOC hierarchy columns created:")
# soc_columns = ['soc_major_group', 'soc_minor_group', 'soc_broad_occupation', 'soc_detailed_occupation']
# for col in soc_columns:
#     print(f"  {col}: {reshuffled_data[col].nunique():,} unique values")


# # Set reshuffled data as the final dataset
# merged_data = reshuffled_data.copy()

# # Show sample of reshuffled data
# print(f"\nSample of reshuffled data:")
# reshuffled_data.head()

In [50]:
# Create 'Realized_Handoff' column: default 0, then set to 1 for tasks that are the last task in their occupation (by Task Position).
# Then manually set the Realized_Handoff of the last task in the last occupation in the minor group to 0 as requested.
# Safety: check for required columns and create soc_minor_group if missing.
required_cols = ['O*NET-SOC Code', 'Task Position']

# Ensure Task Position numeric
merged_data['Task Position'] = pd.to_numeric(merged_data['Task Position'], errors='coerce')

# Initialize Realized_Handoff to 0
merged_data['Realized_Handoff'] = 0

# For each occupation, find the max Task Position and set Realized_Handoff=1 for rows matching it
last_pos = merged_data.groupby('O*NET-SOC Code')['Task Position'].transform('max')
merged_data.loc[merged_data['Task Position'] == last_pos, 'Realized_Handoff'] = 1
print('Set Realized_Handoff=1 for last-position tasks within each occupation (by O*NET-SOC Code).')

# Now find the last Occupation Position in Minor_Group_Code then set that occupation's last task Realized_Handoff to 0. 

# Determine the last occupation in the Minor_Group_Code sequence and set Realized_Handoff=0 for its last task
merged_data['Last_Occupation'] = 0
last_pos = merged_data.groupby('Minor_Group_Code')['Occupation Position'].transform('max')
merged_data.loc[merged_data['Occupation Position'] == last_pos, 'Last_Occupation'] = 1

# Set the Realized_Handoff of all tasks for Last_Occupation to 0
merged_data.loc[merged_data['Last_Occupation'] == 1, 'Realized_Handoff'] = 0

# Drop the helper column
merged_data = merged_data.drop(columns=['Last_Occupation'])
display(merged_data.head(5))

Set Realized_Handoff=1 for last-position tasks within each occupation (by O*NET-SOC Code).


Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Type,Job Zone,Task_Time_Percentage,Hourly_Mean_Wage,Hourly_P10_Wage,Hourly_P25_Wage,...,gpt4_exposure,human_labels,task_normalized,task_name,automation,augmentation,label,Task Position,Occupation Position,Realized_Handoff
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,5,9.62,124.47,38.46,62.9,...,,,direct or coordinate an organization's financi...,direct or coordinate an organization's financi...,0.35,0.57,Augmentation,12.0,1.0,0
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",Core,5,9.49,124.47,38.46,62.9,...,E0,E0,"confer with board members, organization offici...","confer with board members, organization offici...",0.25,0.61,Augmentation,4.0,1.0,0
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,5,9.22,124.47,38.46,62.9,...,E2,E2,analyze operations to evaluate performance of ...,analyze operations to evaluate performance of ...,0.31,0.66,Augmentation,3.0,1.0,0
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,5,10.26,124.47,38.46,62.9,...,E2,E0,"direct, plan, or implement policies, objective...",,,,Manual,6.0,1.0,0
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,5,1.46,124.47,38.46,62.9,...,E2,E2,"prepare budgets for approval, including those ...",,,,Manual,10.0,1.0,0


In [51]:
# Plot task sequences for all occupations within each Minor_Group_Code in a single figure
# Save each minor-group figure under output_plot_path/occupation_sequence/<minor_group>/
import os
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import matplotlib.patches as patches

# Determine which minor-group column to use/create
if 'Minor_Group_Code' in merged_data.columns:
    minor_col = 'Minor_Group_Code'
elif 'soc_minor_group' in merged_data.columns:
    minor_col = 'soc_minor_group'
else:
    merged_data['Minor_Group_Code'] = merged_data['O*NET-SOC Code'].astype(str).str[:5] + '-000'
    minor_col = 'Minor_Group_Code'

# Ensure Task Position numeric
merged_data['Task Position'] = pd.to_numeric(merged_data['Task Position'], errors='coerce')

base_out = os.path.join(output_plot_path, 'occupation_sequence')
os.makedirs(base_out, exist_ok=True)

# color map for labels
color_map = {'Manual': 'lightgray', 'Augmentation': 'orange', 'Automation': 'green'}

# iterate over minor groups
minor_groups = sorted(merged_data[minor_col].dropna().unique())
print(f'Starting group plots for {len(minor_groups)} minor groups; saving under: {base_out}')

for mg in minor_groups:
    mg_df = merged_data[merged_data[minor_col] == mg].copy()
    if mg_df.empty:
        continue
    # occupations in this minor group
    occ_codes = sorted(mg_df['O*NET-SOC Code'].dropna().unique())
    if len(occ_codes) == 0:
        continue
    # compute max tasks in any occupation to align bands
    occ_task_counts = {oc: len(mg_df[mg_df['O*NET-SOC Code'] == oc]) for oc in occ_codes}
    max_tasks = max(occ_task_counts.values())
    # vertical spacing: each occupation gets a band of height (max_tasks + 1)
    band_h = max_tasks + 1
    total_height = band_h * len(occ_codes)
    fig, ax = plt.subplots(figsize=(16, max(6, total_height * 0.25)))

    # plot each occupation in its band
    for j, occ in enumerate(reversed(occ_codes)):  # reversed so first occ is at top
        occ_df = mg_df[mg_df['O*NET-SOC Code'] == occ].sort_values('Task Position').reset_index(drop=True)
        n_tasks = len(occ_df)
        # base y coordinate for this occupation's band
        base_y = j * band_h
        # place tasks inside band from top to bottom
        for i, (_, row) in enumerate(occ_df.iterrows()):
            # position within band: align to top of band
            y_pos = base_y + (band_h - 1) - i
            # rectangle
            rect = Rectangle((0, y_pos), 10, 0.8, facecolor=color_map.get(row.get('label','Manual'), 'lightgray'), edgecolor='black', linewidth=1)
            ax.add_patch(rect)
            # task title (truncated)
            tt = str(row.get('Task Title',''))
            if len(tt) > 60:
                tt = tt[:60] + '...'
            ax.text(0.2, y_pos + 0.4, tt, ha='left', va='center', fontsize=8, wrap=True)
            # position number
            try:
                tp = int(row.get('Task Position'))
                ax.text(-0.5, y_pos + 0.4, f'{tp}', ha='right', va='center', fontweight='bold', fontsize=8)
            except Exception:
                pass
            # label on right
            ax.text(10.2, y_pos + 0.4, row.get('label',''), ha='left', va='center', fontweight='bold', fontsize=8, color=color_map.get(row.get('label',''), 'black'))
            # if Realized_Handoff == 1, add red 'Handoff' text to the far right
            if row.get('Realized_Handoff', 0) == 1:
                ax.text(11.8, y_pos + 0.4, 'Handoff', ha='left', va='center', fontsize=9, color='red', fontweight='bold')
        # add occupation title label at right of band
        occ_title = occ_df['Occupation Title'].iloc[0] if len(occ_df) > 0 else ''
        safe_occ_title = (occ_title[:80] + '...') if len(occ_title) > 80 else occ_title
        ax.text(10.2, base_y + band_h - 0.2, f'{occ} - {safe_occ_title}', ha='left', va='bottom', fontsize=9, fontweight='bold')

    # final plot adjustments
    ax.set_xlim(-2, 14)
    ax.set_ylim(-0.5, total_height)
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ['top','right','bottom','left']:
        ax.spines[spine].set_visible(False)
    plt.title(f'Task sequences for minor group: {mg} (occupations stacked)', fontsize=14, fontweight='bold')
    # legend
    legend_elements = [patches.Patch(color='lightgray', label='Manual'), patches.Patch(color='orange', label='Augmentation'), patches.Patch(color='green', label='Automation') ]
    ax.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.15, 1))
    plt.tight_layout()

    # save into a subfolder per minor group
    fname = os.path.join(base_out, f'occupation_task_sequences_{mg}.png')
    fig.savefig(fname, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f'Saved minor-group plot: {fname}')

print('All minor-group occupation-sequence plots completed.')

Starting group plots for 95 minor groups; saving under: ../writeup/plots/taskSequence_vs_anthropicIndex/occupation_sequence
Saved minor-group plot: ../writeup/plots/taskSequence_vs_anthropicIndex/occupation_sequence/occupation_task_sequences_11-1000.png
Saved minor-group plot: ../writeup/plots/taskSequence_vs_anthropicIndex/occupation_sequence/occupation_task_sequences_11-1000.png
Saved minor-group plot: ../writeup/plots/taskSequence_vs_anthropicIndex/occupation_sequence/occupation_task_sequences_11-2000.png
Saved minor-group plot: ../writeup/plots/taskSequence_vs_anthropicIndex/occupation_sequence/occupation_task_sequences_11-2000.png
Saved minor-group plot: ../writeup/plots/taskSequence_vs_anthropicIndex/occupation_sequence/occupation_task_sequences_11-3000.png
Saved minor-group plot: ../writeup/plots/taskSequence_vs_anthropicIndex/occupation_sequence/occupation_task_sequences_11-3000.png
Saved minor-group plot: ../writeup/plots/taskSequence_vs_anthropicIndex/occupation_sequence/occu