#### By: Peyman Shahidi
#### Created: Oct 10, 2025

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [2]:
main_folder_path = ".."
input_data_path = f"{main_folder_path}/data"
output_data_path = f'{input_data_path}/computed_objects'
output_plot_path = f"{main_folder_path}/writeup/plots"

In [3]:
# Create directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    if not os.path.exists(path):
        os.makedirs(path)

### O*NET Data Processing

In [4]:
# Read all datasets
task_ratings_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Task Ratings.txt', sep='\t')
task_categories_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Task Categories.txt', sep='\t')
tasks_to_dwas_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Tasks to DWAs.txt', sep='\t')
dwa_reference_df = pd.read_csv(f'{input_data_path}/db_27_3_text/DWA Reference.txt', sep='\t')
job_zones_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Job Zones.txt', sep='\t')
task_statements_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Task Statements.txt', sep='\t')
occupation_data_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Occupation Data.txt', sep='\t')
soc_structure_df = pd.read_csv(f'{input_data_path}/SOC_Structure.csv')

In [5]:
# Merge and transform O*NET task data
ONET = task_ratings_df.merge(task_categories_df, on=['Scale ID', 'Category'], how='left')

# Process Category Description
ONET['Category Description'] = ONET['Category Description'].apply(lambda x: f'FT_{x}' if pd.notna(x) else x)
ONET['Category Description'] = ONET['Category Description'].fillna(ONET['Scale ID'])
ONET['Category Description'] = ONET['Category Description'].replace({'IM': 'Importance', 'RT': 'Relevance'})

# Reshape from long to wide format
ONET = ONET.pivot_table(
    index=['O*NET-SOC Code', 'Task ID'],
    columns='Category Description',
    values='Data Value',
    aggfunc='first'
).reset_index()
ONET.columns.name = None

# Merge additional data
ONET = ONET.merge(occupation_data_df[['O*NET-SOC Code', 'Title']], on='O*NET-SOC Code', how='left')
ONET.rename(columns={'Title': 'Occupation Title'}, inplace=True)

ONET = ONET.merge(task_statements_df[['O*NET-SOC Code', 'Task ID', 'Task', 'Task Type']], on=['O*NET-SOC Code', 'Task ID'], how='left')
ONET.rename(columns={'Task': 'Task Title'}, inplace=True)

ONET = ONET.merge(tasks_to_dwas_df[['O*NET-SOC Code', 'Task ID', 'DWA ID']], on=['O*NET-SOC Code', 'Task ID'], how='left')
ONET = ONET.merge(dwa_reference_df[['DWA ID', 'DWA Title']], on='DWA ID', how='left')
ONET = ONET.merge(job_zones_df[['O*NET-SOC Code', 'Job Zone']], on='O*NET-SOC Code', how='left')

# Reorder columns
fixed_cols = ['O*NET-SOC Code', 'Occupation Title', 'Task ID', 'Task Title', 'Task Type', 'DWA ID', 'DWA Title', 'Job Zone']
other_cols = [col for col in ONET.columns if col not in fixed_cols]
ONET = ONET[fixed_cols + other_cols]

### SOC Industry Structure

In [6]:
# Create SOC Code-to-Label mapping
code_label_rows = []
for idx, row in soc_structure_df.iterrows():
    if pd.notna(row['Major Group']):
        code = row['Major Group']
    elif pd.notna(row['Minor Group']):
        code = row['Minor Group']
    elif pd.notna(row['Broad Occupation']):
        code = row['Broad Occupation']
    elif pd.notna(row['Detailed Occupation']):
        code = row['Detailed Occupation']
    elif pd.notna(row['Detailed O*NET-SOC']):
        code = row['Detailed O*NET-SOC']
    else:
        continue
    code_label_rows.append({'Code': code, 'Label': row['SOC or O*NET-SOC 2019 Title']})

soc_code_label = pd.DataFrame(code_label_rows)
soc_code_label.to_csv(f'{output_data_path}/SOC_Code_Label_Mapping.csv', index=False)

In [7]:
# Create industry codes at different hierarchy levels
ONET['SOC_Code_7digit'] = ONET['O*NET-SOC Code'].str.split('.').str[0]

industry_levels = {
    2: 'Major_Group',
    5: 'Minor_Group', 
    6: 'Broad_Occupation',
    7: 'Detailed_Occupation'
}

for num_digits, level_name in industry_levels.items():
    if num_digits == 2:
        ONET[f'{level_name}_Code'] = ONET['SOC_Code_7digit'].str[:2] + '-0000'
    elif num_digits == 5:
        ONET[f'{level_name}_Code'] = ONET['SOC_Code_7digit'].str[:5] + '00'
    elif num_digits == 6:
        ONET[f'{level_name}_Code'] = ONET['SOC_Code_7digit'].str[:6] + '0'
    else:
        ONET[f'{level_name}_Code'] = ONET['SOC_Code_7digit']

In [8]:
# Add title labels for each industry level
for num_digits, level_name in industry_levels.items():
    code_col = f'{level_name}_Code'
    label_col = f'{level_name}_Title'
    ONET = ONET.merge(
        soc_code_label.rename(columns={'Code': code_col, 'Label': label_col}),
        on=code_col,
        how='left'
    )

# Save final dataset
ONET.to_csv(f'{output_data_path}/ONET_cleaned_tasks.csv', index=False)

In [9]:
# Sanity check: NaN values
nan_counts = ONET.isna().sum()
nan_summary = nan_counts[nan_counts > 0].sort_values(ascending=False)

print(f"Dataset shape: {ONET.shape}")
print(f"Total rows: {len(ONET):,}\n")

if len(nan_summary) > 0:
    print("Columns with NaN values:")
    print("=" * 50)
    for col, count in nan_summary.items():
        percentage = (count / len(ONET)) * 100
        print(f"{col}: {count:,} ({percentage:.2f}%)")
else:
    print("✓ No NaN values found in any column!")

Dataset shape: (22310, 26)
Total rows: 22,310

Columns with NaN values:
Minor_Group_Title: 4,231 (18.96%)
DWA ID: 425 (1.90%)
DWA Title: 425 (1.90%)
Broad_Occupation_Title: 215 (0.96%)
