#### By: Peyman Shahidi
#### Created: Oct 21, 2025
#### Last Edit: Oct 21, 2025

<br>

In [None]:
# Imports and display configuration for this notebook
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

# Display floats comma separated with two digits after the decimal point,
# e.g. 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

# NOTE(review): this silences every warning category, including pandas
# deprecation warnings -- consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')
# Set the DataFrame display row limit to 200
pd.set_option('display.max_rows', 200)

In [None]:
# Folder layout; every path is derived from the project root one level up
main_folder_path = ".."
input_data_path = main_folder_path + "/data"
output_data_path = input_data_path + '/computed_objects'
output_plot_path = main_folder_path + "/writeup/plots/taskSequence_vs_anthropicIndex"

In [None]:
# Create the output directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True makes this a no-op for an existing directory, so no
    # separate exists() check (and no check-then-create race) is needed
    os.makedirs(path, exist_ok=True)

In [None]:
# Load the cleaned O*NET task table and drop the DWA columns so a task is not
# counted twice.
# Note: In ~4k instances, the same task is mapped to multiple DWAs
ONET = (
    pd.read_csv(f'{input_data_path}/computed_objects/ONET_cleaned_tasks.csv')
    .drop(['DWA ID', 'DWA Title'], axis=1)
)

# De-duplicate the remaining rows and report how many were removed
rows_before = ONET.shape[0]
print(f"Number of rows before removing duplicates: {rows_before:,}")
ONET = ONET.drop_duplicates(ignore_index=True)
rows_after = ONET.shape[0]
print(f"Number of rows after removing duplicates: {rows_after:,}")
print(f"Duplicates removed: {rows_before - rows_after}")

# Final size of the dataset
print(f"Number of rows in ONET dataset: {rows_after:,}")

ONET.head(5)

In [None]:
# Load the full label set of the "GPTs are GPTs" dataset
gpts_full_labels = pd.read_csv(f'{input_data_path}/GPTs-are-GPTs-main/data/full_labelset.tsv', sep="\t")

# Keep relevant columns only. The explicit copy makes the column assignment
# below operate on an independent frame rather than on a slice of the raw table.
gpts_full_labels = gpts_full_labels[['O*NET-SOC Code', 'Task ID', 'Task', 'Task Type', 'Title', 'gpt4_exposure', 'human_labels']].copy()

# Convert Task ID to integer
gpts_full_labels['Task ID'] = gpts_full_labels['Task ID'].astype(int)

# Remove apostrophes from every string cell for consistency; non-string cells
# pass through untouched. Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated since pandas 2.1.
gpts_full_labels = gpts_full_labels.apply(
    lambda column: column.map(
        lambda value: value.replace("'", "") if isinstance(value, str) else value
    )
)

# Rename columns
gpts_full_labels = gpts_full_labels.rename(columns={
    'Task': 'Task Title',
    'Title': 'Occupation Title'
})

# Print length of dataset
print(f"Number of rows in GPTs full labels dataset: {len(gpts_full_labels):,}")


gpts_full_labels.head(5)

In [None]:
# Left-join the GPTs-are-GPTs labels onto the O*NET rows; every O*NET row is kept
ONET = pd.merge(
    ONET,
    gpts_full_labels,
    how='left',
    on=['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'Task Title', 'Task Type'],
)

# Rows whose gpt4_exposure is missing after the join are reported as unmatched
unmatched_tasks = ONET.loc[ONET['gpt4_exposure'].isna()]
print(f"Number of unmatched tasks: {len(unmatched_tasks):,}")

In [None]:
# Bare expression: shows the merged O*NET table as the cell output
ONET

In [None]:
# Load the Anthropic Economic Index automation-vs-augmentation data
anthropic_exposure = pd.read_csv(f'{input_data_path}/Anthropic_EconomicIndex/automation_vs_augmentation_by_task.csv')

# Drop the rows flagged as filtered (filtered == 1)
anthropic_exposure = anthropic_exposure[anthropic_exposure['filtered'] != 1].reset_index(drop=True)

# Create new columns (plain column arithmetic: same values as a row-wise
# apply, but vectorized and well-defined on an empty frame):
# Sum feedback loop and directive into Automation
anthropic_exposure['automation'] = (
    anthropic_exposure['feedback_loop'] + anthropic_exposure['directive']
)
# Sum validation, iteration, and learning into Augmentation
anthropic_exposure['augmentation'] = (
    anthropic_exposure['validation']
    + anthropic_exposure['task_iteration']
    + anthropic_exposure['learning']
)
# Assign labels: compare the automation and augmentation values of a row and
# name the larger one
def assign_label(row):
    """Return 'Automation' or 'Augmentation' for one row.

    The row must expose 'automation' and 'augmentation' by key. A tie is
    labelled 'Automation'. When the automation value is NaN no comparison
    succeeds and None is returned.
    """
    automation_value = row['automation']
    augmentation_value = row['augmentation']

    # Augmentation only wins when it is strictly larger
    if augmentation_value > automation_value:
        dominant = augmentation_value
    else:
        dominant = automation_value

    if dominant == automation_value:
        return 'Automation'
    if dominant == augmentation_value:
        return 'Augmentation'
    return None

# Label every row with assign_label, then keep only the relevant columns
anthropic_exposure = (
    anthropic_exposure
    .assign(label=lambda frame: frame.apply(assign_label, axis=1))
    .loc[:, ['task_name', 'automation', 'augmentation', 'label']]
)
anthropic_exposure

In [None]:
# Report the number of distinct task titles in each source
print(f"Number of unique tasks in ONET dataset: {ONET['Task Title'].nunique():,}")
print(f"Number of unique tasks in Anthropic exposure dataset: {anthropic_exposure['task_name'].nunique():,}")


# Join key: the O*NET task title, lower-cased and stripped of outer whitespace
ONET["task_normalized"] = ONET["Task Title"].str.lower().str.strip()


# Left-join the Anthropic columns onto O*NET via the normalized title
merged_data = ONET.merge(
    anthropic_exposure[['task_name', 'automation', 'augmentation', 'label']],
    how="left",
    left_on="task_normalized",
    right_on="task_name",
)

# Rows that have no label after the join are labelled Manual
merged_data = merged_data.fillna({'label': 'Manual'})

# Label distribution after the fill
print(f"\nDistribution of labels after filling NaN values with 'Manual':")
print(merged_data['label'].value_counts())
print(f"Total tasks: {len(merged_data):,}")

merged_data.head()

In [None]:
# Read task sequence data
# Every CSV file in computed_objects/tasks_sequences is read and the results
# are stacked into a single frame.
task_sequence_dir = f'{input_data_path}/computed_objects/tasks_sequences'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame identical from run to run.
task_sequence_files = sorted(f for f in os.listdir(task_sequence_dir) if f.endswith('.csv'))
task_sequence = pd.concat(
    [pd.read_csv(f'{task_sequence_dir}/{file}') for file in task_sequence_files],
    ignore_index=True,
)

print(f"Task sequence data shape: {task_sequence.shape}")
print(f"Columns in task sequence: {list(task_sequence.columns)}")

# Merge task sequence data with merged_data, then keep only the rows that
# received a Task Position
merged_data = pd.merge(merged_data, task_sequence[['O*NET-SOC Code', 'Task ID', 'Task Position']], on=['O*NET-SOC Code', 'Task ID'], how='left')
merged_data = merged_data[merged_data['Task Position'].notna()].reset_index(drop=True)
merged_data.head(10)

In [None]:
# # Reshuffle task assignments while preserving SOC hierarchy structure
# # Each occupation unit (O*NET-SOC Code + hierarchy levels) stays intact, but tasks are randomly reassigned
# # COMPLETELY UNRESTRICTED: Each task assigned to random occupation, no constraints on tasks per occupation

# # Create a copy of the data for reshuffling
# reshuffled_data = merged_data.copy()

# # Create SOC hierarchy levels with descriptive names from the O*NET-SOC Code
# reshuffled_data['soc_major_group'] = reshuffled_data['O*NET-SOC Code'].str[:2]
# reshuffled_data['soc_minor_group'] = reshuffled_data['O*NET-SOC Code'].str[:5] 
# reshuffled_data['soc_broad_occupation'] = reshuffled_data['O*NET-SOC Code'].str[:8]
# reshuffled_data['soc_detailed_occupation'] = reshuffled_data['O*NET-SOC Code']

# # Get unique occupation units (with all hierarchy levels intact)
# occupation_units = reshuffled_data[['O*NET-SOC Code', 'Occupation Title', 'soc_major_group', 
#                                    'soc_minor_group', 'soc_broad_occupation', 'soc_detailed_occupation']].drop_duplicates()

# print(f"Number of unique occupation units: {len(occupation_units):,}")

# # Get all unique tasks (each task appears only once)
# unique_tasks = reshuffled_data[['Task ID', 'Task Title', 'Task Type', 'gpt4_exposure', 
#                                'human_labels', 'task_normalized', 'task_name', 
#                                'automation', 'augmentation', 'label']].drop_duplicates()

# print(f"Number of unique tasks: {len(unique_tasks):,}")

# # Set random seed for reproducibility
# np.random.seed(42)

# # UNRESTRICTED ASSIGNMENT: Each task randomly assigned to any occupation unit
# # No constraints on how many tasks per occupation
# n_tasks = len(unique_tasks)
# n_occupations = len(occupation_units)

# # Random assignment: each task gets assigned to a completely random occupation
# random_occupation_indices = np.random.choice(n_occupations, size=n_tasks, replace=True)

# print(f"Assigning {n_tasks:,} tasks randomly across {n_occupations:,} occupation units")

# # Create the reshuffled dataset
# reshuffled_list = []

# for task_idx in range(n_tasks):
#     occ_idx = random_occupation_indices[task_idx]
    
#     # Get task information
#     task_row = unique_tasks.iloc[task_idx].to_dict()
    
#     # Get occupation information  
#     occ_row = occupation_units.iloc[occ_idx].to_dict()
    
#     # Combine them
#     combined_row = {**task_row, **occ_row}
#     reshuffled_list.append(combined_row)

# # Create the reshuffled DataFrame
# reshuffled_data = pd.DataFrame(reshuffled_list)

# print(f"Reshuffled dataset created with {len(reshuffled_data):,} rows")

# # Verify the reshuffling worked correctly
# print(f"\nOriginal dataset task distribution by occupation:")
# original_task_counts = merged_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
# print(f"Min tasks per occupation: {original_task_counts.min()}")
# print(f"Max tasks per occupation: {original_task_counts.max()}")
# print(f"Mean tasks per occupation: {original_task_counts.mean():.2f}")
# print(f"Std tasks per occupation: {original_task_counts.std():.2f}")

# print(f"\nReshuffled dataset task distribution by occupation:")
# reshuffled_task_counts = reshuffled_data.groupby('O*NET-SOC Code')['Task ID'].nunique()
# print(f"Min tasks per occupation: {reshuffled_task_counts.min()}")
# print(f"Max tasks per occupation: {reshuffled_task_counts.max()}")
# print(f"Mean tasks per occupation: {reshuffled_task_counts.mean():.2f}")
# print(f"Std tasks per occupation: {reshuffled_task_counts.std():.2f}")

# # Count occupations with zero tasks
# zero_task_occupations = (reshuffled_task_counts == 0).sum()
# print(f"Occupations with zero tasks: {zero_task_occupations}")

# print(f"\nVerification:")
# print(f"Number of unique occupations preserved: {reshuffled_data['O*NET-SOC Code'].nunique() == merged_data['O*NET-SOC Code'].nunique()}")
# print(f"Number of unique tasks preserved: {reshuffled_data['Task ID'].nunique() == unique_tasks['Task ID'].nunique()}")
# print(f"Each task appears exactly once: {len(reshuffled_data) == len(unique_tasks)}")

# # Show SOC hierarchy column names
# print(f"\nSOC hierarchy columns created:")
# soc_columns = ['soc_major_group', 'soc_minor_group', 'soc_broad_occupation', 'soc_detailed_occupation']
# for col in soc_columns:
#     print(f"  {col}: {reshuffled_data[col].nunique():,} unique values")


# # Set reshuffled data as the final dataset
# merged_data = reshuffled_data.copy()

# # Show sample of reshuffled data
# print(f"\nSample of reshuffled data:")
# reshuffled_data.head()

In [None]:
# Drop the supplemental tasks
merged_data = merged_data[merged_data['Task Type'] != 'Supplemental'].reset_index(drop=True)

# Drop rows whose Occupation Title includes 'Teachers, Postsecondary'.
# regex=False matches the text literally; na=False treats a missing title as
# "no match", so the mask stays boolean and `~` cannot fail on NaN entries.
merged_data = merged_data[
    ~merged_data['Occupation Title'].str.contains('Teachers, Postsecondary', regex=False, na=False)
].reset_index(drop=True)

In [None]:
# Restrict the data to the major groups listed as high AI exposure
high_ai_exposure_major_groups = [
    '13-0000',
    '15-0000',
    '19-0000',
    '21-0000',
    '23-0000',
    '25-0000',
    '27-0000',
    '41-0000',
    '43-0000',
]
merged_data = merged_data.loc[
    merged_data['Major_Group_Code'].isin(high_ai_exposure_major_groups)
].reset_index(drop=True)

In [None]:
# Overview of the final dataset: shape, columns and label distribution
print(f"Final merged data shape: {merged_data.shape}")
print(f"Columns: {merged_data.columns.tolist()}")
print(f"\nLabel distribution:")
print(merged_data['label'].value_counts())

# Number of task rows per occupation, largest first
occupation_task_counts = (
    merged_data
    .groupby(['O*NET-SOC Code', 'Occupation Title'])
    .size()
    .reset_index(name='task_count')
    .sort_values('task_count', ascending=False)
)
print(f"\nTop 10 occupations by task count:")
print(occupation_task_counts.head(10))

# The five occupations with the most task rows serve as examples
example_occupations = occupation_task_counts['O*NET-SOC Code'].head(5).tolist()
print(f"\nSelected example occupations: {example_occupations}")

# Print the first ten tasks, in sequence order, of the first example occupation
sample_occ_code = example_occupations[0]
sample_data = (
    merged_data
    .loc[merged_data['O*NET-SOC Code'] == sample_occ_code]
    .sort_values('Task Position')
)
print(f"\nSample data for {sample_data['Occupation Title'].iloc[0]}:")
print(sample_data[['Task Position', 'Task Title', 'label']].head(10))

In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import Rectangle
import numpy as np


def plot_task_sequence(data, occ_code, title_max_length=50):
    """Plot the task sequence of one occupation as color-coded rectangles.

    Each task with a position is drawn as one horizontal bar, ordered top to
    bottom by 'Task Position'. The bar shows the (possibly truncated) task
    title, with the position number to its left and the label to its right.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'O*NET-SOC Code', 'Occupation Title',
        'Task Position', 'Task Title' and 'label'.
    occ_code : str
        Value of 'O*NET-SOC Code' selecting the occupation to plot.
    title_max_length : int, default 50
        Task titles longer than this are cut and suffixed with "...".

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure (left open; the caller saves and closes it), or None when
        the occupation has no task with a position.
    """
    # Rows without a position cannot be placed in the sequence (int(NaN)
    # raises), so they are dropped before drawing.
    occ_data = data[data['O*NET-SOC Code'] == occ_code]
    occ_data = occ_data.dropna(subset=['Task Position']).sort_values('Task Position')

    if occ_data.empty:
        print(f"No data with positions found for occupation {occ_code}")
        return None

    # Color mapping; insertion order is also the legend order
    color_map = {
        'Manual': 'lightgray',
        'Augmentation': 'orange',
        'Automation': 'green'
    }

    # Height grows with the number of tasks, with a floor of 6 inches
    fig, ax = plt.subplots(figsize=(16, max(6, len(occ_data) * 0.4)))

    try:
        _draw_task_rows(ax, occ_data, color_map, title_max_length)

        # Set up the plot
        ax.set_xlim(-2, 15)
        ax.set_ylim(-0.5, len(occ_data) - 0.1)

        # Remove ticks and the surrounding frame
        ax.set_xticks([])
        ax.set_yticks([])
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)

        # Add title
        occupation_title = occ_data['Occupation Title'].iloc[0]
        ax.set_title(f"Task Sequence for {occupation_title}\n({occ_code})",
                     fontsize=14, fontweight='bold', pad=20)

        # Add legend, derived from the color mapping so the two cannot drift apart
        legend_elements = [patches.Patch(color=color, label=label)
                           for label, color in color_map.items()]
        ax.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.15, 1))

        fig.tight_layout()
    except Exception:
        # Do not leave a half-drawn figure open when drawing fails
        plt.close(fig)
        raise

    return fig


def _draw_task_rows(ax, occ_data, color_map, title_max_length):
    """Draw one bar, position number, title and label per task, top to bottom."""
    n_rows = len(occ_data)
    task_fields = zip(occ_data['Task Position'], occ_data['Task Title'], occ_data['label'])

    for i, (position, title, label) in enumerate(task_fields):
        y_pos = n_rows - i - 1  # Start from top

        # Bar colored by label; labels missing from the mapping fall back to gray
        ax.add_patch(Rectangle((0, y_pos), 10, 0.8,
                               facecolor=color_map.get(label, 'lightgray'),
                               edgecolor='black',
                               linewidth=1))

        # Truncate task title if too long
        task_title = str(title)
        if len(task_title) > title_max_length:
            task_title = task_title[:title_max_length] + "..."

        # Add task position number on the left
        ax.text(-0.5, y_pos + 0.4, f"{int(position)}",
                ha='right', va='center', fontweight='bold', fontsize=10)

        # Add task title inside rectangle
        ax.text(0.2, y_pos + 0.4, task_title,
                ha='left', va='center', fontsize=9, wrap=True)

        # Add label on the right
        ax.text(10.2, y_pos + 0.4, label,
                ha='left', va='center', fontweight='bold', fontsize=9,
                color=color_map.get(label, 'black'))


# Create output subfolder for all occupation plots
output_folder = os.path.join(output_plot_path, "all_occupation_task_sequences")
os.makedirs(output_folder, exist_ok=True)


def _summary_row(occ_code, occ_data, status, filename='', count_labels=True, error_msg=None):
    """Build one summary record for an occupation.

    occ_data is the occupation's rows, or None when they could not be selected.
    With count_labels=False the three label counts are reported as 0.
    'error_msg' is only added to the record when a message is given.
    """
    has_rows = occ_data is not None and len(occ_data) > 0
    counts = occ_data['label'].value_counts() if (has_rows and count_labels) else {}
    row = {
        'O*NET-SOC Code': occ_code,
        'Occupation Title': occ_data['Occupation Title'].iloc[0] if has_rows else '',
        'n_tasks': len(occ_data) if occ_data is not None else 0,
        'manual': counts.get('Manual', 0),
        'augmentation': counts.get('Augmentation', 0),
        'automation': counts.get('Automation', 0),
        'filename': filename,
        'status': status,
    }
    if error_msg is not None:
        row['error_msg'] = error_msg
    return row


# Iterate over all occupations and save plots
occupations = merged_data['O*NET-SOC Code'].unique()
n_occupations = len(occupations)
print(f"Found {n_occupations:,} occupations to process. Plots will be saved to: {output_folder}")

summary_rows = []
for i, occ_code in enumerate(sorted(occupations), start=1):
    # Reset per iteration so an early failure can never report the previous
    # occupation's rows or close the previous occupation's figure.
    occ_data = None
    fig = None
    try:
        occ_data = merged_data[merged_data['O*NET-SOC Code'] == occ_code].copy()

        # Skip if no tasks or no position information
        if occ_data.empty or occ_data['Task Position'].isna().all():
            print(f"[{i}/{n_occupations}] Skipping {occ_code}: no positional task data")
            summary_rows.append(
                _summary_row(occ_code, occ_data, 'skipped_no_positions', count_labels=False)
            )
            continue

        occ_data = occ_data.sort_values('Task Position')

        # The rows are already selected, so pass them on instead of
        # re-scanning the full table inside the plotting function
        fig = plot_task_sequence(occ_data, occ_code)
        if fig is None:
            print(f"[{i}/{n_occupations}] No figure produced for {occ_code}")
            summary_rows.append(_summary_row(occ_code, occ_data, 'no_figure'))
            continue

        # Safe filename: anything other than alphanumerics, space, '_' and '-'
        # becomes '_', spaces become '_', and the title is capped at 120 chars
        occ_title = occ_data['Occupation Title'].iloc[0]
        safe_title = ''.join(c if (c.isalnum() or c in (' ', '_', '-')) else '_' for c in occ_title).replace(' ', '_')[:120]
        filename = os.path.join(output_folder, f"task_sequence_{occ_code}_{safe_title}.png")

        fig.savefig(filename, dpi=300, bbox_inches='tight')
        summary_rows.append(_summary_row(occ_code, occ_data, 'saved', filename=filename))

        if i % 50 == 0:
            print(f"Processed {i}/{n_occupations} occupations")

    except Exception as e:
        # Batch boundary: record the failure and carry on with the next occupation
        print(f"Error processing {occ_code}: {e}")
        summary_rows.append(_summary_row(occ_code, occ_data, 'error', error_msg=str(e)))
    finally:
        # Close the figure on success and on failure so figures do not pile up
        if fig is not None:
            plt.close(fig)

# # Save summary CSV
# summary_df = pd.DataFrame(summary_rows)
# summary_csv_path = os.path.join(output_folder, 'task_sequence_summary.csv')
# summary_df.to_csv(summary_csv_path, index=False)
# print(f"Saved summary CSV with {len(summary_df)} rows to: {summary_csv_path}")
# print("Done. Open the notebook and run this cell (or execute the notebook) to generate the plots.")


## Task Sequence Visualization Summary

The cell above saves an ordered task-sequence plot for every occupation in the filtered data; three example occupations are discussed below. In each plot, every task is represented as a rectangle and color-coded based on its AI exposure classification:

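- **Gray rectangles**: Manual tasks (no Automation or Augmentation label in the Anthropic data after merging, e.g. the task title had no match)
- **Orange rectangles**: Augmentation tasks (AI can assist but human involvement needed; assigned when the sum of validation, task iteration, and learning exceeds the automation sum)
- **Green rectangles**: Automation tasks (can be fully automated by AI; assigned when the sum of feedback loop and directive is at least the augmentation sum)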

### Key Observations:

1. **Computer Systems Engineers/Architects (15-1299.08)**:
   - 28 total tasks with a clear sequence from 1-28
   - Mixed pattern: Many augmentation tasks (11) scattered throughout the sequence
   - Only 1 automation task (task 17: "Develop application-specific software")
   - Early tasks tend to be more manual (communication, analysis), while technical development tasks show more AI potential

2. **Business Continuity Planners (13-1199.04)**:
   - 20 total tasks (note: missing some position numbers, indicating gaps in the sequence)
   - Fewer augmentation opportunities (5) compared to Computer Systems Engineers
   - No automation tasks - this occupation appears less amenable to full AI automation
   - Augmentation tasks cluster in specific areas: regulation interpretation, design/implementation, training, and data analysis

3. **Set and Exhibit Designers (27-1027.00)**:
   - 19 total tasks but all classified as Manual
   - This creative occupation shows very low AI exposure across all tasks
   - Tasks are highly creative, physical, and require human judgment and collaboration
   - Demonstrates occupations where AI has limited applicability in current form

The visualizations reveal that **AI exposure patterns vary significantly by occupation type**, with technical roles showing more potential for AI augmentation and automation compared to creative or planning-focused roles.