#### By: Peyman Shahidi
#### Created: Oct 10, 2025

<br>

In [67]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

# Render floats with thousands separators and two decimals, e.g. 1000 is shown as 1,000.00.
# The fully qualified option name is used on purpose: pandas resolves abbreviated
# names such as 'float_format' by pattern matching, which can stop working if a
# similarly named option is ever added.
pd.set_option('display.float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings

# Install a blanket 'ignore' filter: no message/module pattern is given, so it
# applies to every warning raised from here on.
warnings.filterwarnings(action='ignore')
# Set pandas' display.max_rows option to 200.
pd.options.display.max_rows = 200

In [68]:
# Folder layout, all expressed relative to the parent of the working directory:
# input data, computed objects (a subfolder of the input data), and plots.
main_folder_path = ".."
input_data_path = main_folder_path + "/data"
output_data_path = input_data_path + '/computed_objects'
output_plot_path = main_folder_path + "/writeup/plots"

In [69]:
# Create the output directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True makes this a no-op when the directory is already there and
    # creates any missing parent folders otherwise. Unlike a separate
    # os.path.exists() check, there is no window between "check" and "create",
    # and a path that exists as a regular file raises FileExistsError instead
    # of being silently skipped.
    os.makedirs(path, exist_ok=True)

<br>

### Clean and Merge O*NET Task Data

In [70]:
# (1) Load every O*NET table used in the merge below
def _read_tsv(path):
    """Read a tab-separated text file into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Task Ratings columns: O*NET-SOC Code, Task ID, Scale ID, Category, Data Value, N,
# Standard Error, Lower CI Bound, Upper CI Bound, Recommend Suppress, Not Relevant, Date
task_ratings_df = _read_tsv(f'{input_data_path}/db_27_3_text/Task Ratings.txt')

# Task Categories columns: Scale ID, Category, Category Description
task_categories_df = _read_tsv(f'{input_data_path}/db_27_3_text/Task Categories.txt')

# Tasks to DWAs columns: O*NET-SOC Code, Task ID, DWA ID
tasks_to_dwas_df = _read_tsv(f'{input_data_path}/db_27_3_text/Tasks to DWAs.txt')

# DWA Reference columns: DWA ID, DWA Title
dwa_reference_df = _read_tsv(f'{input_data_path}/db_27_3_text/DWA Reference.txt')

# Job Zones columns: O*NET-SOC Code, Job Zone
job_zones_df = _read_tsv(f'{input_data_path}/db_27_3_text/Job Zones.txt')

# Task Statements columns: O*NET-SOC Code, Task ID, Task, Task Type, Incumbents Responding
task_statements_df = _read_tsv(f'{input_data_path}/db_27_3_text/Task Statements.txt')

# Occupation Data columns: O*NET-SOC Code, Title, Description
occupation_data_df = _read_tsv(f'{input_data_path}/db_27_3_text/Occupation Data.txt')

print("All datasets loaded successfully!")

All datasets loaded successfully!


In [71]:
# (2) Build the task-level ONET table: reshape the ratings to one row per
#     (occupation, task), then attach titles, task text, DWAs and job zones.
task_keys = ['O*NET-SOC Code', 'Task ID']

# Step 1: attach the category description to every rating row
ratings_long = task_ratings_df.merge(
    task_categories_df, how='left', on=['Scale ID', 'Category']
)

# Steps 2-3: turn the description into the future column label.
#   - rows that have a description get it prefixed with "FT_"
#   - rows without one fall back to their Scale ID ...
#   - ... where the codes IM / RT are spelled out as Importance / Relevance
ratings_long['Category Description'] = (
    ratings_long['Category Description']
    .map(lambda desc: f'FT_{desc}', na_action='ignore')
    .fillna(ratings_long['Scale ID'])
    .replace({'IM': 'Importance', 'RT': 'Relevance'})
)

# Step 4: long -> wide, one column per label; 'first' keeps the first value
# found for each (occupation, task, label) combination
ratings_wide = ratings_long.pivot_table(
    values='Data Value',
    index=task_keys,
    columns='Category Description',
    aggfunc='first',
).reset_index()
ratings_wide.columns.name = None

# Steps 5-9: left-join the descriptive tables onto the wide ratings.
# Because these are left joins, a task mapped to several DWAs appears once per DWA,
# and a task with no DWA mapping keeps NaN in the DWA columns.
ONET = (
    ratings_wide
    # Step 5: occupation titles
    .merge(occupation_data_df[['O*NET-SOC Code', 'Title']], how='left', on='O*NET-SOC Code')
    .rename(columns={'Title': 'Occupation Title'})
    # Step 6: task text and task type
    .merge(task_statements_df[task_keys + ['Task', 'Task Type']], how='left', on=task_keys)
    .rename(columns={'Task': 'Task Title'})
    # Step 7: DWA IDs linked to each task
    .merge(tasks_to_dwas_df[task_keys + ['DWA ID']], how='left', on=task_keys)
    # Step 8: DWA titles
    .merge(dwa_reference_df[['DWA ID', 'DWA Title']], how='left', on='DWA ID')
    # Step 9: job zones
    .merge(job_zones_df[['O*NET-SOC Code', 'Job Zone']], how='left', on='O*NET-SOC Code')
)

# Step 10: identifying/descriptive columns first, then every remaining column
# (the pivoted rating columns) in their existing order
fixed_cols = [
    'O*NET-SOC Code', 'Occupation Title',
    'Task ID', 'Task Title', 'Task Type',
    'DWA ID', 'DWA Title',
    'Job Zone',
]
other_cols = [name for name in ONET.columns if name not in fixed_cols]
ONET = ONET.loc[:, fixed_cols + other_cols]

# Persist the cleaned table, then display it as the cell output
ONET.to_csv(f'{output_data_path}/ONET_cleaned_tasks.csv', index=False)
ONET

Unnamed: 0,O*NET-SOC Code,Occupation Title,Task ID,Task Title,Task Type,DWA ID,DWA Title,Job Zone,FT_Daily,FT_Hourly or more,FT_More than monthly,FT_More than weekly,FT_More than yearly,FT_Several times daily,FT_Yearly or less,Importance,Relevance
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,4.A.4.b.4.I09.D02,Direct financial operations.,5,46.67,5.26,11.04,16.19,9.16,7.33,4.34,4.54,94.19
1,11-1011.00,Chief Executives,8824,"Confer with board members, organization offici...",Core,4.A.4.a.2.I03.D14,Confer with organizational members to accompli...,5,25.27,4.81,27.41,15.58,11.14,14.21,1.59,4.15,98.79
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,4.A.2.a.4.I07.D09,Analyze data to assess operational or project ...,5,35.11,3.73,12.61,18.96,19.04,10.56,0.00,4.40,100.00
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,4.A.2.b.1.I09.D01,Implement organizational process or policy cha...,5,38.47,6.38,10.18,23.83,9.27,8.70,3.17,4.39,95.84
4,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,4.A.2.b.4.I01.D01,Develop organizational policies or programs.,5,38.47,6.38,10.18,23.83,9.27,8.70,3.17,4.39,95.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22305,53-7121.00,"Tank Car, Truck, and Ship Loaders",12807,Unload cars containing liquids by connecting h...,Supplemental,4.A.3.a.2.I34.D01,Connect hoses to equipment or machinery.,2,27.65,8.34,6.88,13.95,29.21,7.93,6.05,4.08,64.04
22306,53-7121.00,"Tank Car, Truck, and Ship Loaders",12808,Copy and attach load specifications to loaded ...,Supplemental,4.A.1.b.1.I01.D03,Mark materials or objects for identification.,2,34.11,18.45,1.89,12.36,0.00,32.43,0.75,4.43,60.24
22307,53-7121.00,"Tank Car, Truck, and Ship Loaders",12809,Start pumps and adjust valves or cables to reg...,Core,4.A.3.a.3.I02.D03,Control pumps or pumping equipment.,2,38.33,25.23,0.36,7.31,0.00,28.08,0.70,4.48,73.20
22308,53-7121.00,"Tank Car, Truck, and Ship Loaders",12810,"Perform general warehouse activities, such as ...",Supplemental,4.A.1.b.3.I01.D14,Weigh materials to ensure compliance with spec...,2,34.14,11.78,6.46,14.46,10.85,16.39,5.91,3.53,47.84


In [72]:
# (3) Report the number and share of missing values per column
nan_counts = ONET.isna().sum()
nan_summary = nan_counts.loc[nan_counts > 0].sort_values(ascending=False)
total_rows = len(ONET)
divider = "=" * 50

if nan_summary.empty:
    print("No NaN values found in any column!")
else:
    print("Columns with NaN values:")
    print(divider)
    # Columns are listed from most to fewest missing values
    for column_name, n_missing in nan_summary.items():
        share = n_missing / total_rows * 100
        print(f"{column_name}: {n_missing:,} ({share:.2f}%)")
    print(divider)

# The row total is printed last in either case
print(f"Total rows in dataset: {total_rows:,}")

Columns with NaN values:
DWA ID: 425 (1.90%)
DWA Title: 425 (1.90%)
Total rows in dataset: 22,310
