#### By: Peyman Shahidi
#### Created: Oct 10, 2025
#### Last Edit: Oct 29, 2025

<br>

In [None]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Display floats comma-separated with two digits after the decimal, e.g. 1000 is shown as 1,000.00.
## The full option key is used: pandas resolves partial keys by pattern matching, which can become
## ambiguous if a later pandas version adds another option containing "float_format".
pd.set_option('display.float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [None]:
# Project root, expressed relative to the directory the notebook is executed from.
main_folder_path = ".."
# Folder holding the raw inputs (the O*NET text tables and SOC structure file are read from here).
input_data_path = f"{main_folder_path}/data"
# Folder that receives the CSV files computed by this notebook.
output_data_path = f'{input_data_path}/computed_objects'
# Folder reserved for figures; the cells visible here only create it.
output_plot_path = f"{main_folder_path}/writeup/plots"

In [None]:
# Create the output directories if they don't exist.
# exist_ok=True makes the call idempotent and avoids the check-then-create race of
# `if not os.path.exists(path): os.makedirs(path)`.
import os

for path in [output_data_path, output_plot_path]:
    os.makedirs(path, exist_ok=True)

### O*NET Data Processing

In [None]:
# Load every input table: the tab-separated O*NET 27.3 text files plus the SOC structure CSV
onet_db_dir = f'{input_data_path}/db_27_3_text'


def _read_onet_table(table_name):
    """Read one tab-separated O*NET text table, identified by its file name without extension."""
    return pd.read_csv(f'{onet_db_dir}/{table_name}.txt', sep='\t')


task_ratings_df = _read_onet_table('Task Ratings')
task_categories_df = _read_onet_table('Task Categories')
tasks_to_dwas_df = _read_onet_table('Tasks to DWAs')
dwa_reference_df = _read_onet_table('DWA Reference')
job_zones_df = _read_onet_table('Job Zones')
task_statements_df = _read_onet_table('Task Statements')
occupation_data_df = _read_onet_table('Occupation Data')

# The SOC hierarchy lookup is a regular comma-separated file stored next to the O*NET folder
soc_structure_df = pd.read_csv(f'{input_data_path}/SOC_Structure.csv')

In [None]:
# Build the task-level base table: one row per (occupation, task), rating scales as columns
ratings_long = task_ratings_df.merge(task_categories_df, on=['Scale ID', 'Category'], how='left')

# Label every rating row:
#   - rows that matched a category description get an "FT_" prefix,
#   - rows without one fall back to their Scale ID,
#   - the IM / RT scale IDs are spelled out as Importance / Relevance.
category_label = ratings_long['Category Description'].map(
    lambda desc: f'FT_{desc}' if pd.notna(desc) else desc
)
category_label = category_label.fillna(ratings_long['Scale ID'])
ratings_long['Category Description'] = category_label.replace({'IM': 'Importance', 'RT': 'Relevance'})

# Long -> wide: each label becomes a column holding the first 'Data Value' seen for the task
ONET = pd.pivot_table(
    ratings_long,
    index=['O*NET-SOC Code', 'Task ID'],
    columns='Category Description',
    values='Data Value',
    aggfunc='first'
).reset_index()
ONET.columns.name = None

# Attach occupation titles, task text and job zones.
# DWA data is intentionally NOT merged here; that happens after the task time calculations.
occupation_titles = occupation_data_df[['O*NET-SOC Code', 'Title']]
task_text = task_statements_df[['O*NET-SOC Code', 'Task ID', 'Task', 'Task Type']]
job_zone_lookup = job_zones_df[['O*NET-SOC Code', 'Job Zone']]

ONET = (
    ONET
    .merge(occupation_titles, on='O*NET-SOC Code', how='left')
    .rename(columns={'Title': 'Occupation Title'})
    .merge(task_text, on=['O*NET-SOC Code', 'Task ID'], how='left')
    .rename(columns={'Task': 'Task Title'})
    .merge(job_zone_lookup, on='O*NET-SOC Code', how='left')
)

print(f"Base dataset created with {len(ONET):,} rows before task time calculations")

Base dataset created with 17,953 rows before task time calculations


### Filter Occupations Containing "All Other" and "Teachers" (currently disabled — both filter cells below are commented out)

In [None]:
# # Remove rows where occupation title contains "All Other"
# print(f"Number of rows before removing 'All Other': {ONET.shape[0]:,}")
# ONET = ONET[~ONET["Occupation Title"].str.contains("All Other", case=False, na=False)]
# print(f"Number of rows after removing 'All Other': {ONET.shape[0]:,}")

In [None]:
# # Filter and remove "Teachers"-related occupations
# contains_teacher = ONET[ONET['Occupation Title'].str.contains('Teachers', case=False, na=False)]

# unique_teacher_occupations = contains_teacher['Occupation Title'].nunique()
# print(f'Number of unique occupations containing the word "Teachers": {unique_teacher_occupations}')

# # Remove rows that contain "Teacher" (case-insensitive)
# ONET = ONET[~ONET['Occupation Title'].str.contains('Teachers', case=False, na=False)].reset_index(drop=True)
# print(f"Rows after removing Teachers: {len(ONET):,}")

### Task Time Measurement Calculation
**IMPORTANT**: Calculate task time measures BEFORE merging DWA data to avoid duplication issues

In [None]:
# Task Time Measurement Creation
# ==============================

# Step 1: Assumed number of occurrences per year for each frequency category column
frequency_mapping = {
    'FT_Several times daily': 1000,      # ~4 times/day × 250 work days
    'FT_Hourly or more': 2000,           # ~8 times/day × 250 work days (about once per working hour)
    'FT_Daily': 250,                      # once per work day
    'FT_More than weekly': 156,          # ~3 times/week × 52 weeks
    'FT_More than monthly': 24,          # ~2 times/month × 12 months
    'FT_More than yearly': 6,            # a few times per year
    'FT_Yearly or less': 1               # once a year
}

# Step 2: Combine the category shares of one task into a single annual frequency
def calculate_weighted_frequency(row):
    """
    Return the expected annual frequency of one task.

    Parameters:
    - row: Series for a single task; entries named like the keys of
      `frequency_mapping` are read as percentages (0-100), all other entries
      are ignored.

    Returns:
    - Sum over the categories of (annual rate × percentage / 100). Categories
      that are absent from the row or hold NaN contribute nothing. np.nan is
      returned when the sum is not strictly positive.
    """
    annual_total = 0.0

    for category, annual_rate in frequency_mapping.items():
        if category not in row.index:
            continue
        share = row[category]
        if pd.isna(share):
            continue
        # Percentage -> fraction, then weight by the category's annual rate
        annual_total += annual_rate * (share / 100.0)

    if annual_total > 0:
        return annual_total
    return np.nan

# Expected annual occurrences of every task (row-wise over the FT_* percentage columns)
weighted_frequency_annual = ONET.apply(calculate_weighted_frequency, axis=1)

# Step 3: Raw task score = frequency × importance × relevance
raw_task_score = weighted_frequency_annual * ONET['Importance'] * ONET['Relevance']

# Step 4: Normalize within each occupation (CRITICAL STEP)
# Every occupation is treated as one full-time job, so the task shares of an
# occupation are scaled to sum to 1.0 (100% of work time).
occupation_total_score = raw_task_score.groupby(ONET['O*NET-SOC Code']).transform('sum')
task_time_proportion = raw_task_score / occupation_total_score

# Step 5 (disabled): hour estimates could be derived from the proportion, e.g.
# proportion × 40 for hours per week or proportion × 2000 for hours per year
# (40 hours/week × 50 weeks), and annual hours / annual frequency per occurrence.

# Step 6: Only the percentage is stored on ONET; the frequency, raw score,
# occupation total and proportion stay local and are not added as columns.
ONET['Task_Time_Percentage'] = task_time_proportion * 100

print("Task Time Measurement Complete!")
print(f"✓ {len(ONET):,} tasks processed across {ONET['O*NET-SOC Code'].nunique():,} occupations")
print(f"✓ {ONET['Task_Time_Percentage'].gt(0).sum():,} tasks have valid time measures")

Task Time Measurement Complete!
✓ 17,953 tasks processed across 873 occupations
✓ 17,953 tasks have valid time measures


### Add SOC Industry Levels Data

In [None]:
# Build a flat SOC code -> title lookup from the hierarchical structure file.
# For each row the code is taken from the first populated level column, checked
# from the coarsest level to the finest; rows with no populated level are skipped.
soc_level_columns = [
    'Major Group',
    'Minor Group',
    'Broad Occupation',
    'Detailed Occupation',
    'Detailed O*NET-SOC',
]

code_label_rows = []
for _, soc_row in soc_structure_df.iterrows():
    code = next(
        (soc_row[level_col] for level_col in soc_level_columns if pd.notna(soc_row[level_col])),
        None,
    )
    if code is None:
        continue
    code_label_rows.append({'Code': code, 'Label': soc_row['SOC or O*NET-SOC 2019 Title']})

soc_code_label = pd.DataFrame(code_label_rows)
soc_code_label.to_csv(f'{output_data_path}/SOC_Code_Label_Mapping.csv', index=False)

In [None]:
# Create occupation-group codes at each level of the SOC hierarchy.
# The O*NET-SOC code "XX-XXXX.YY" is cut at the dot to obtain the 7-character SOC code.
soc_code_7digit = ONET['O*NET-SOC Code'].str.split('.').str[0]

# Hierarchy levels -> column-name prefix. The dict is kept at notebook level
# because the title-merge step iterates over it.
industry_levels = {
    2: 'Major_Group',
    5: 'Minor_Group', 
    6: 'Broad_Occupation',
    7: 'Detailed_Occupation'
}

# SOC 2018 coding rules: major groups end in "0000", broad occupations end in "0",
# and minor groups end in "000" (XX-X000) -- except the three minor groups below,
# which end in "00". Building every minor-group code as XX-XX00 therefore yields
# codes that do not exist in the SOC structure (e.g. "11-3100" instead of
# "11-3000") and leaves their Minor_Group_Title empty after the label merge.
SOC_MINOR_GROUPS_ENDING_IN_00 = {'15-1200', '31-1100', '51-5100'}

minor_code_00 = soc_code_7digit.str[:5] + '00'
minor_code_000 = soc_code_7digit.str[:4] + '000'

ONET['Major_Group_Code'] = soc_code_7digit.str[:2] + '-0000'
ONET['Minor_Group_Code'] = minor_code_00.where(
    minor_code_00.isin(SOC_MINOR_GROUPS_ENDING_IN_00),
    minor_code_000
)
ONET['Broad_Occupation_Code'] = soc_code_7digit.str[:6] + '0'
ONET['Detailed_Occupation_Code'] = soc_code_7digit

In [None]:
# Attach a human-readable title next to every hierarchy-level code column
for level_prefix in industry_levels.values():
    level_code_col = f'{level_prefix}_Code'
    # Rename the generic lookup columns so the merge key and the new title column
    # carry the level-specific names
    level_labels = soc_code_label.rename(
        columns={'Code': level_code_col, 'Label': f'{level_prefix}_Title'}
    )
    ONET = ONET.merge(level_labels, how='left', on=level_code_col)


# Aggregate unique task and DWA counts at each hierarchical level
def aggregate_by_level(df, code_col, title_col, level_name):
    """
    Count distinct tasks and DWAs per group at one hierarchical level.

    Parameters:
    - df: DataFrame with a 'Task ID' column and, optionally, a 'DWA ID' column
    - code_col: Name of the column holding the group code/ID
    - title_col: Name of the column holding the group title/description
    - level_name: Label written to the 'Level' column (e.g., 'Major Group')

    Returns:
    - DataFrame with columns: Level, Code, Title, num_unique_tasks, num_unique_dwas.
      One row per (code, title) pair; rows whose code or title is missing are
      left out by the groupby (pandas' default). When df has no 'DWA ID'
      column, num_unique_dwas is 0 for every group.
    """
    if 'DWA ID' in df.columns:
        dwa_count_spec = ('DWA ID', 'nunique')
    else:
        # No DWA information available: report a constant 0 per group
        dwa_count_spec = ('Task ID', lambda _task_ids: 0)

    counts = df.groupby([code_col, title_col]).agg(
        num_unique_tasks=('Task ID', 'nunique'),
        num_unique_dwas=dwa_count_spec,
    )
    counts = counts.reset_index().rename(columns={code_col: 'Code', title_col: 'Title'})
    counts = counts.assign(Level=level_name)

    output_columns = ['Level', 'Code', 'Title', 'num_unique_tasks', 'num_unique_dwas']
    return counts[output_columns]


# Create aggregations for each hierarchical level.
# At this point ONET has no 'DWA ID' column yet (DWA data is merged in a later cell),
# so aggregating ONET directly would report num_unique_dwas = 0 for every group.
# A temporary task-to-DWA expansion is used for counting instead. ONET itself is
# left untouched, and the duplicated task rows do not affect the distinct-task counts.
if 'DWA ID' in ONET.columns:
    onet_for_counts = ONET
else:
    onet_for_counts = ONET.merge(
        tasks_to_dwas_df[['O*NET-SOC Code', 'Task ID', 'DWA ID']],
        on=['O*NET-SOC Code', 'Task ID'],
        how='left'
    )

major_group_agg = aggregate_by_level(onet_for_counts, 'Major_Group_Code', 'Major_Group_Title', 'Major Group')
minor_group_agg = aggregate_by_level(onet_for_counts, 'Minor_Group_Code', 'Minor_Group_Title', 'Minor Group')
broad_occ_agg = aggregate_by_level(onet_for_counts, 'Broad_Occupation_Code', 'Broad_Occupation_Title', 'Broad Occupation')
detailed_occ_agg = aggregate_by_level(onet_for_counts, 'Detailed_Occupation_Code', 'Detailed_Occupation_Title', 'Detailed Occupation')
occupation_agg = aggregate_by_level(onet_for_counts, 'O*NET-SOC Code', 'Occupation Title', 'Occupation')

# Combine all levels into one dataset
hierarchical_task_counts = pd.concat([
    major_group_agg,
    minor_group_agg,
    broad_occ_agg,
    detailed_occ_agg,
    occupation_agg
], ignore_index=True)

# Save the combined dataset
hierarchical_task_counts.to_csv(f'{output_data_path}/hierarchical_task_counts.csv', index=False)

# Report how many groups were found at each level
print(f"\nHierarchical Task Counts Summary:")
print(f"  Total rows: {len(hierarchical_task_counts):,}")
print(f"\nBreakdown by aggregation level:")
for level in ['Major Group', 'Minor Group', 'Broad Occupation', 'Detailed Occupation', 'Occupation']:
    count = len(hierarchical_task_counts[hierarchical_task_counts['Level'] == level])
    print(f"  {level}: {count}")


Hierarchical Task Counts Summary:
  Total rows: 2,179

Breakdown by aggregation level:
  Major Group: 22
  Minor Group: 95
  Broad Occupation: 430
  Detailed Occupation: 759
  Occupation: 873


### Merge DWA Data
Now that task time measures are calculated, we can safely merge DWA data

In [None]:
# Attach DWA (Detailed Work Activities) IDs and titles to every task.
# This runs AFTER the task time calculations because a task can map to several
# DWAs, which repeats its row once per DWA.
task_key_cols = ['O*NET-SOC Code', 'Task ID']
print(f"Dataset shape before DWA merge: {ONET.shape}")

ONET = (
    ONET
    .merge(tasks_to_dwas_df[task_key_cols + ['DWA ID']], on=task_key_cols, how='left')
    .merge(dwa_reference_df[['DWA ID', 'DWA Title']], on='DWA ID', how='left')
)

print(f"Dataset shape after DWA merge: {ONET.shape}")

# Column sets used for the duplicate diagnostics: task identity with and without the DWA fields
dup_cols_task = ['O*NET-SOC Code', 'Task ID', 'Task Type']
dup_cols_full = dup_cols_task + ['DWA ID', 'DWA Title']

# Rows that repeat an earlier row on task identity AND DWA fields
num_duplicates_full = ONET.duplicated(subset=dup_cols_full).sum()
print(f"Number of duplicate rows (with DWA ID & Title): {num_duplicates_full}")

# Rows that repeat an earlier row on task identity alone
num_duplicates_task = ONET.duplicated(subset=dup_cols_task).sum()
print(f"Number of duplicate rows (without DWA ID & Title): {num_duplicates_task}")

# Tasks linked to more than one distinct DWA
task_counts = ONET.groupby(dup_cols_task)['DWA ID'].nunique()
multi_dwa_tasks = task_counts.gt(1).sum()
print(f"Number of tasks matched to multiple DWAs: {multi_dwa_tasks}")

# Drop exact task/DWA repeats, keeping the first occurrence of each
if num_duplicates_full:
    ONET = ONET.drop_duplicates(subset=dup_cols_full).reset_index(drop=True)
    print(f"Removed {num_duplicates_full} duplicate rows")

Dataset shape before DWA merge: (17953, 24)
Dataset shape after DWA merge: (22310, 26)
Number of duplicate rows (with DWA ID & Title): 0
Number of duplicate rows (without DWA ID & Title): 4357
Number of tasks matched to multiple DWAs: 3780


In [None]:
# Create DWA repetition dataset across hierarchical levels:
# for every DWA, count in how many distinct occupations / occupation groups it appears.

# Filter out rows where DWA ID is null
onet_with_dwa = ONET[ONET['DWA ID'].notna()]

# A single grouped aggregation replaces one full-frame scan per DWA ID.
# sort=False keeps the DWAs in order of first appearance, matching the row order
# produced by iterating over .unique(). The dict form of named aggregation is
# used because the output column 'DWA Title' contains a space.
dwa_repetition_df = (
    onet_with_dwa
    .groupby('DWA ID', sort=False)
    .agg(**{
        'DWA Title': ('DWA Title', 'first'),
        'num_occupations': ('O*NET-SOC Code', 'nunique'),
        'num_detailed_occupations': ('Detailed_Occupation_Code', 'nunique'),
        'num_broad_occupations': ('Broad_Occupation_Code', 'nunique'),
        'num_minor_groups': ('Minor_Group_Code', 'nunique'),
        'num_major_groups': ('Major_Group_Code', 'nunique'),
    })
    .reset_index()
)

dwa_repetition_df.to_csv(f'{output_data_path}/dwa_repetition_by_hierarchy.csv', index=False)

In [None]:
# # Create task repetition dataset across hierarchical levels
# task_repetition_data = []

# for task_id in ONET['Task ID'].unique():
#     task_data = ONET[ONET['Task ID'] == task_id]
    
#     # Get task title (should be consistent for same task ID)
#     task_title = task_data['Task Title'].iloc[0]
    
#     # Count occurrences at each hierarchical level
#     task_repetition_data.append({
#         'Task ID': task_id,
#         'Task Title': task_title,
#         'num_occupations': task_data['O*NET-SOC Code'].nunique(),
#         'num_detailed_occupations': task_data['Detailed_Occupation_Code'].nunique(),
#         'num_broad_occupations': task_data['Broad_Occupation_Code'].nunique(),
#         'num_minor_groups': task_data['Minor_Group_Code'].nunique(),
#         'num_major_groups': task_data['Major_Group_Code'].nunique()
#     })

# task_repetition_df = pd.DataFrame(task_repetition_data)

# # Save task repetition dataset
# task_repetition_df.to_csv(f'{output_data_path}/task_repetition_by_hierarchy.csv', index=False)

# print("Task Repetition Analysis:")
# print(f"  Total unique tasks: {len(task_repetition_df):,}")
# print(f"\nTasks appearing in multiple occupations:")
# print(f"  Tasks in 2+ occupations: {(task_repetition_df['num_occupations'] >= 2).sum():,}")
# print(f"  Tasks in 5+ occupations: {(task_repetition_df['num_occupations'] >= 5).sum():,}")
# print(f"  Tasks in 10+ occupations: {(task_repetition_df['num_occupations'] >= 10).sum():,}")
# print(f"\nMax repetition across levels:")
# print(f"  Max occupations per task: {task_repetition_df['num_occupations'].max()}")
# print(f"  Max detailed occupations per task: {task_repetition_df['num_detailed_occupations'].max()}")
# print(f"  Max broad occupations per task: {task_repetition_df['num_broad_occupations'].max()}")
# print(f"  Max minor groups per task: {task_repetition_df['num_minor_groups'].max()}")
# print(f"  Max major groups per task: {task_repetition_df['num_major_groups'].max()}")
# print(f"\nSaved to: {output_data_path}/task_repetition_by_hierarchy.csv")

# task_repetition_df.head(10)

# # Tasks are unique! ==> All 1s

### Add BLS Wage Data
Load and merge Bureau of Labor Statistics wage data

In [None]:
# Read and clean BLS wage data
print("Reading BLS wage data...")
# Use input_data_path like every other input in this notebook (it resolves to the same location)
bls_wage_df = pd.read_excel(f"{input_data_path}/oesm23nat/national_M2023_dl.xlsx")

# Define all wage columns to process (both hourly and annual)
hourly_wage_cols = ['H_MEAN', 'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90']
annual_wage_cols = ['A_MEAN', 'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90']
all_wage_cols = hourly_wage_cols + annual_wage_cols

# Clean all wage columns - any non-numeric entry becomes NaN
for col in all_wage_cols:
    bls_wage_df[col] = pd.to_numeric(bls_wage_df[col], errors='coerce')

# Fill missing hourly wage values from the matching annual column.
# 2080 = 40 hours/week × 52 weeks, the hours-per-year factor used for the conversion.
HOURS_PER_YEAR = 2080
for h_col, a_col in zip(hourly_wage_cols, annual_wage_cols):
    # fillna only touches rows where the hourly value is missing; rows where the
    # annual value is missing too simply stay NaN
    bls_wage_df[h_col] = bls_wage_df[h_col].fillna(bls_wage_df[a_col] / HOURS_PER_YEAR)

# Create mapping for column renaming
wage_column_mapping = {
    'H_MEAN': 'Hourly_Mean_Wage',
    'H_PCT10': 'Hourly_P10_Wage',
    'H_PCT25': 'Hourly_P25_Wage', 
    'H_MEDIAN': 'Hourly_Median_Wage',
    'H_PCT75': 'Hourly_P75_Wage',
    'H_PCT90': 'Hourly_P90_Wage',
    'A_MEAN': 'Annual_Mean_Wage',
    'A_PCT10': 'Annual_P10_Wage',
    'A_PCT25': 'Annual_P25_Wage',
    'A_MEDIAN': 'Annual_Median_Wage', 
    'A_PCT75': 'Annual_P75_Wage',
    'A_PCT90': 'Annual_P90_Wage'
}

# Merge with ONET dataset - include all wage percentiles.
# The join key is the O*NET-SOC code with its ".YY" suffix removed.
ONET['Base_SOC_Code'] = ONET['O*NET-SOC Code'].str.split('.').str[0]
wage_data = bls_wage_df[['OCC_CODE'] + all_wage_cols].rename(columns={'OCC_CODE': 'Base_SOC_Code'})
wage_data = wage_data.rename(columns=wage_column_mapping)

# If this cell is re-run, ONET already carries the wage columns and a second merge
# would create *_x / *_y duplicates. Drop any existing copies first so the cell is
# re-runnable; on a first run this list is empty and nothing is dropped.
existing_wage_columns = [col for col in wage_column_mapping.values() if col in ONET.columns]
ONET = ONET.drop(columns=existing_wage_columns)

ONET = ONET.merge(wage_data, on='Base_SOC_Code', how='left')

# Report results
print("✓ Wage data merged successfully")
print(f"✓ Final dataset shape: {ONET.shape}")
print(f"✓ Added {len(wage_column_mapping)} wage variables:")
for old_col, new_col in wage_column_mapping.items():
    print(f"  {old_col} → {new_col}")
print(f"✓ Annual wage data was used to fill missing hourly wages when available")

Reading BLS wage data...
✓ Wage data merged successfully
✓ Final dataset shape: (22310, 39)
✓ Added 12 wage variables:
  H_MEAN → Hourly_Mean_Wage
  H_PCT10 → Hourly_P10_Wage
  H_PCT25 → Hourly_P25_Wage
  H_MEDIAN → Hourly_Median_Wage
  H_PCT75 → Hourly_P75_Wage
  H_PCT90 → Hourly_P90_Wage
  A_MEAN → Annual_Mean_Wage
  A_PCT10 → Annual_P10_Wage
  A_PCT25 → Annual_P25_Wage
  A_MEDIAN → Annual_Median_Wage
  A_PCT75 → Annual_P75_Wage
  A_PCT90 → Annual_P90_Wage
✓ Annual wage data was used to fill missing hourly wages when available


In [None]:
# Check available wage columns in BLS data
print("BLS Wage Data Columns:")
# Wage columns are the H_* (hourly) and A_* (annual) fields, plus anything naming WAGE.
# Prefix matching is used on purpose: a substring test for 'A_' or 'PCT' also selects
# AREA_TITLE, AREA_TYPE, PCT_TOTAL and PCT_RPT, none of which are among the wage
# columns processed in this notebook.
wage_columns = [
    col for col in bls_wage_df.columns
    if 'WAGE' in col.upper() or col.upper().startswith(('H_', 'A_'))
]
print(wage_columns)

print(f"\nBLS dataset shape: {bls_wage_df.shape}")
print(f"Total columns: {len(bls_wage_df.columns)}")

# Show first few rows of wage-related columns
print("\nSample of wage data:")
bls_wage_df[wage_columns].head(10)

BLS Wage Data Columns:
['AREA_TITLE', 'AREA_TYPE', 'PCT_TOTAL', 'PCT_RPT', 'H_MEAN', 'A_MEAN', 'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90', 'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90']

BLS dataset shape: (1403, 32)
Total columns: 32

Sample of wage data:


Unnamed: 0,AREA_TITLE,AREA_TYPE,PCT_TOTAL,PCT_RPT,H_MEAN,A_MEAN,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90
0,U.S.,1,,,31.48,65470.0,13.97,17.14,23.11,37.01,58.4,29050.0,35660.0,48060.0,76980.0,121470.0
1,U.S.,1,,,66.23,137750.0,26.23,37.66,56.19,81.29,111.36,54550.0,78330.0,116880.0,169090.0,231620.0
2,U.S.,1,,,65.43,136100.0,22.31,31.81,49.74,79.57,,46400.0,66170.0,103460.0,165500.0,
3,U.S.,1,,,124.47,258900.0,38.46,62.9,99.37,,,80000.0,130840.0,206680.0,,
4,U.S.,1,,,124.47,258900.0,38.46,62.9,99.37,,,80000.0,130840.0,206680.0,,
5,U.S.,1,,,62.18,129330.0,22.28,31.34,48.69,77.06,111.59,46340.0,65180.0,101280.0,160290.0,232110.0
6,U.S.,1,,,62.18,129330.0,22.28,31.34,48.69,77.06,111.59,46340.0,65180.0,101280.0,160290.0,232110.0
7,U.S.,1,,,32.76,68140.0,10.1,13.85,22.74,39.52,62.26,21010.0,28810.0,47290.0,82200.0,129510.0
8,U.S.,1,,,32.76,68140.0,10.1,13.85,22.74,39.52,62.26,21010.0,28810.0,47290.0,82200.0,129510.0
9,U.S.,1,,,76.9,159960.0,33.26,47.68,67.23,96.15,,69170.0,99180.0,139850.0,199990.0,


In [None]:
# Clean up duplicate wage columns, order the columns and save the final dataset
print("Cleaning up duplicate wage columns...")

# Define final wage columns in proper order
wage_cols = ['Hourly_Mean_Wage', 'Hourly_P10_Wage', 'Hourly_P25_Wage', 'Hourly_Median_Wage', 
             'Hourly_P75_Wage', 'Hourly_P90_Wage', 'Annual_Mean_Wage', 'Annual_P10_Wage', 
             'Annual_P25_Wage', 'Annual_Median_Wage', 'Annual_P75_Wage', 'Annual_P90_Wage']

# A repeated wage merge leaves <name>_x (old) and <name>_y (new) pairs for every
# wage column, not only for Hourly_Mean_Wage. Reconcile each pair the same way:
# keep the new values (_y) under the plain name and drop both suffixed columns.
for wage_col in wage_cols:
    stale_col, fresh_col = f'{wage_col}_x', f'{wage_col}_y'
    if stale_col in ONET.columns and fresh_col in ONET.columns:
        ONET[wage_col] = ONET[fresh_col]
        ONET = ONET.drop(columns=[stale_col, fresh_col])
        print(f"✓ Merged duplicate {wage_col} columns")

# Filter for existing wage columns
existing_wage_cols = [col for col in wage_cols if col in ONET.columns]

# Reorder columns properly: identifiers first, then wages, then everything else,
# and the hierarchy code/title columns last
first_cols = ['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'Task Title', 'Task Type',
              'DWA ID', 'DWA Title', 'Job Zone', 'Task_Time_Percentage']

last_cols = ['Major_Group_Code', 'Major_Group_Title', 'Minor_Group_Code', 'Minor_Group_Title', 
             'Broad_Occupation_Code', 'Broad_Occupation_Title', 'Detailed_Occupation_Code', 'Detailed_Occupation_Title']

# Filter for existing columns only
existing_first_cols = [col for col in first_cols if col in ONET.columns]
existing_last_cols = [col for col in last_cols if col in ONET.columns]

# Every column not explicitly placed above goes in the middle, in its current order
placed_cols = set(existing_first_cols + existing_wage_cols + existing_last_cols)
middle_cols = [col for col in ONET.columns if col not in placed_cols]

ONET = ONET[existing_first_cols + existing_wage_cols + middle_cols + existing_last_cols]

# Save final dataset with all wage percentiles
ONET.to_csv(f'{output_data_path}/ONET_cleaned_tasks.csv', index=False)

print(f"✓ Final dataset shape: {ONET.shape}")
print(f"✓ Wage columns included: {len(existing_wage_cols)}")
print(f"✓ Saved to: {output_data_path}/ONET_cleaned_tasks.csv")

# Show the wage columns we have
print(f"\nFinal wage columns in dataset:")
for i, col in enumerate(existing_wage_cols, 1):
    print(f"  {i}. {col}")

Cleaning up duplicate wage columns...
✓ Final dataset shape: (22310, 39)
✓ Wage columns included: 12
✓ Saved to: ../data/computed_objects/ONET_cleaned_tasks.csv

Final wage columns in dataset:
  1. Hourly_Mean_Wage
  2. Hourly_P10_Wage
  3. Hourly_P25_Wage
  4. Hourly_Median_Wage
  5. Hourly_P75_Wage
  6. Hourly_P90_Wage
  7. Annual_Mean_Wage
  8. Annual_P10_Wage
  9. Annual_P25_Wage
  10. Annual_Median_Wage
  11. Annual_P75_Wage
  12. Annual_P90_Wage


In [None]:
# Sanity check: report missing values per column, then wage coverage and basic wage statistics
total_tasks = len(ONET)
nan_summary = (
    ONET.isna().sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

print(f"Dataset shape: {ONET.shape}")
print(f"Total rows: {total_tasks:,}\n")

if nan_summary.empty:
    print("✓ No NaN values found in any column!")
else:
    print("Columns with NaN values:")
    print("=" * 50)
    for col, count in nan_summary.items():
        percentage = (count / total_tasks) * 100
        print(f"{col}: {count:,} ({percentage:.2f}%)")

# Final verification of all wage data: share of rows with a non-missing value per wage column
wage_columns = ['Hourly_Mean_Wage', 'Hourly_P10_Wage', 'Hourly_P25_Wage', 'Hourly_Median_Wage', 
                'Hourly_P75_Wage', 'Hourly_P90_Wage', 'Annual_Mean_Wage', 'Annual_P10_Wage', 
                'Annual_P25_Wage', 'Annual_Median_Wage', 'Annual_P75_Wage', 'Annual_P90_Wage']

print(f"\nWage Data Coverage Summary:")
print("=" * 50)

for wage_col in wage_columns:
    if wage_col not in ONET.columns:
        continue
    coverage = ONET[wage_col].notna().sum()
    percentage = coverage / total_tasks * 100
    print(f"{wage_col}: {coverage:,} ({percentage:.2f}%)")

# Summary stats for key wage columns (skipped when a column is absent or entirely missing)
print(f"\nKey Wage Statistics:")
print("=" * 30)
key_cols = ['Hourly_Mean_Wage', 'Hourly_Median_Wage', 'Annual_Mean_Wage', 'Annual_Median_Wage']
for col in key_cols:
    if col not in ONET.columns:
        continue
    wage_values = ONET[col]
    if not wage_values.notna().any():
        continue
    mean_val = wage_values.mean()
    median_val = wage_values.median()
    min_val = wage_values.min()
    max_val = wage_values.max()
    print(f"{col}:")
    print(f"  Mean: ${mean_val:.2f}, Median: ${median_val:.2f}")
    print(f"  Range: ${min_val:.2f} - ${max_val:.2f}")
    print()

Dataset shape: (22310, 39)
Total rows: 22,310

Columns with NaN values:
Minor_Group_Title: 4,231 (18.96%)
Annual_P90_Wage: 1,658 (7.43%)
Hourly_P90_Wage: 1,594 (7.14%)
Annual_P75_Wage: 1,138 (5.10%)
Hourly_P75_Wage: 1,074 (4.81%)
Annual_Median_Wage: 794 (3.56%)
Hourly_Median_Wage: 730 (3.27%)
Annual_Mean_Wage: 593 (2.66%)
Annual_P10_Wage: 593 (2.66%)
Annual_P25_Wage: 593 (2.66%)
Hourly_Mean_Wage: 529 (2.37%)
Hourly_P10_Wage: 529 (2.37%)
Hourly_P25_Wage: 529 (2.37%)
DWA ID: 425 (1.90%)
DWA Title: 425 (1.90%)
Broad_Occupation_Title: 215 (0.96%)

Wage Data Coverage Summary:
Hourly_Mean_Wage: 21,781 (97.63%)
Hourly_P10_Wage: 21,781 (97.63%)
Hourly_P25_Wage: 21,781 (97.63%)
Hourly_Median_Wage: 21,580 (96.73%)
Hourly_P75_Wage: 21,236 (95.19%)
Hourly_P90_Wage: 20,716 (92.86%)
Annual_Mean_Wage: 21,717 (97.34%)
Annual_P10_Wage: 21,717 (97.34%)
Annual_P25_Wage: 21,717 (97.34%)
Annual_Median_Wage: 21,516 (96.44%)
Annual_P75_Wage: 21,172 (94.90%)
Annual_P90_Wage: 20,652 (92.57%)

Key Wage Statisti