#### By: Peyman Shahidi
#### Created: Oct 10, 2025

<br>

In [63]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

# Display floats comma-separated with two digits after the decimal point,
# e.g. 1000 is shown as 1,000.00 (affects display only, not the stored values)
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Suppress all warnings raised after this point
# NOTE(review): this also hides deprecation notices from pandas/numpy
warnings.filterwarnings('ignore')
# Show up to 200 rows before pandas truncates a displayed DataFrame
pd.set_option('display.max_rows', 200)

In [64]:
# Project root, given relative to the current working directory
main_folder_path = ".."

# Input data folder, and the sub-folder where computed objects are written
input_data_path = main_folder_path + "/data"
output_data_path = input_data_path + "/computed_objects"

# Folder for plots used in the write-up
output_plot_path = main_folder_path + "/writeup/plots"

In [65]:
# Create the output directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True makes this a no-op when the directory is already present,
    # without a separate existence check that could go stale before the
    # directory is created. If the path exists but is a regular file,
    # os.makedirs raises FileExistsError instead of silently skipping it.
    os.makedirs(path, exist_ok=True)

<br>

### Convert Task Frequency, Importance, and Relevance Dataset from Long to Wide

In [66]:
# Locations of the two tab-separated input tables
task_ratings_file = f'{input_data_path}/db_27_3_text/Task Ratings.txt'
task_categories_file = f'{input_data_path}/db_27_3_text/Task Categories.txt'

# Task Ratings table (long format; reshaped to wide further down)
task_ratings_df = pd.read_csv(task_ratings_file, sep='\t')

# Task Categories table (supplies the description for each Scale ID / Category pair)
task_categories_df = pd.read_csv(task_categories_file, sep='\t')

In [67]:

# Attach the category description to every rating row, matching on
# (Scale ID, Category); rows without a match keep NaN in the new column
task_ratings_df = task_ratings_df.merge(
    task_categories_df,
    how='left',
    on=['Scale ID', 'Category'],
)

# Build the label that will become the wide-format column name:
#   1. rows that matched a category get the description prefixed with "FT_"
#   2. the remaining rows fall back to their Scale ID
#   3. the Scale IDs IM / RT are spelled out as Importance / Relevance
category_label = task_ratings_df['Category Description'].map(
    lambda desc: desc if pd.isna(desc) else f'FT_{desc}'
)
category_label = category_label.fillna(task_ratings_df['Scale ID'])
task_ratings_df['Category Description'] = category_label.replace(
    {'IM': 'Importance', 'RT': 'Relevance'}
)

In [68]:
# Long -> wide: one row per (occupation, task), one column per category label
task_ratings_wide = pd.pivot_table(
    task_ratings_df,
    values='Data Value',
    index=['O*NET-SOC Code', 'Task ID'],
    columns='Category Description',
    aggfunc='first',  # if a (row, label) pair repeats, keep the first value
)

# Turn the (O*NET-SOC Code, Task ID) index back into ordinary columns
task_ratings_wide = task_ratings_wide.reset_index()

# Drop the leftover "Category Description" label on the column axis
task_ratings_wide.columns.name = None
task_ratings_wide

Unnamed: 0,O*NET-SOC Code,Task ID,FT_Daily,FT_Hourly or more,FT_More than monthly,FT_More than weekly,FT_More than yearly,FT_Several times daily,FT_Yearly or less,Importance,Relevance
0,11-1011.00,8823,46.67,5.26,11.04,16.19,9.16,7.33,4.34,4.54,94.19
1,11-1011.00,8824,25.27,4.81,27.41,15.58,11.14,14.21,1.59,4.15,98.79
2,11-1011.00,8825,35.11,3.73,12.61,18.96,19.04,10.56,0.00,4.40,100.00
3,11-1011.00,8826,38.47,6.38,10.18,23.83,9.27,8.70,3.17,4.39,95.84
4,11-1011.00,8827,7.87,0.65,18.43,10.23,38.58,0.00,24.24,4.17,90.47
...,...,...,...,...,...,...,...,...,...,...,...
17948,53-7121.00,12806,44.97,19.77,3.01,7.68,2.41,22.16,0.00,4.29,65.84
17949,53-7121.00,12807,27.65,8.34,6.88,13.95,29.21,7.93,6.05,4.08,64.04
17950,53-7121.00,12808,34.11,18.45,1.89,12.36,0.00,32.43,0.75,4.43,60.24
17951,53-7121.00,12809,38.33,25.23,0.36,7.31,0.00,28.08,0.70,4.48,73.20


<br>

### Merge Detailed Work Activities (DWAs) into the Dataset Above

In [69]:
# Load the Tasks to DWAs table (tab-separated)
tasks_to_dwas_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Tasks to DWAs.txt', sep='\t')

# Keep only the join keys and the DWA identifier
dwa_lookup = tasks_to_dwas_df[['O*NET-SOC Code', 'Task ID', 'DWA ID']]

# Left join keeps every row of the wide ratings table; a task linked to
# several DWAs appears once per DWA in the result
task_ratings_with_dwas = task_ratings_wide.merge(
    dwa_lookup,
    how='left',
    on=['O*NET-SOC Code', 'Task ID'],
)
task_ratings_with_dwas

Unnamed: 0,O*NET-SOC Code,Task ID,FT_Daily,FT_Hourly or more,FT_More than monthly,FT_More than weekly,FT_More than yearly,FT_Several times daily,FT_Yearly or less,Importance,Relevance,DWA ID
0,11-1011.00,8823,46.67,5.26,11.04,16.19,9.16,7.33,4.34,4.54,94.19,4.A.4.b.4.I09.D02
1,11-1011.00,8824,25.27,4.81,27.41,15.58,11.14,14.21,1.59,4.15,98.79,4.A.4.a.2.I03.D14
2,11-1011.00,8825,35.11,3.73,12.61,18.96,19.04,10.56,0.00,4.40,100.00,4.A.2.a.4.I07.D09
3,11-1011.00,8826,38.47,6.38,10.18,23.83,9.27,8.70,3.17,4.39,95.84,4.A.2.b.1.I09.D01
4,11-1011.00,8826,38.47,6.38,10.18,23.83,9.27,8.70,3.17,4.39,95.84,4.A.2.b.4.I01.D01
...,...,...,...,...,...,...,...,...,...,...,...,...
22305,53-7121.00,12807,27.65,8.34,6.88,13.95,29.21,7.93,6.05,4.08,64.04,4.A.3.a.2.I34.D01
22306,53-7121.00,12808,34.11,18.45,1.89,12.36,0.00,32.43,0.75,4.43,60.24,4.A.1.b.1.I01.D03
22307,53-7121.00,12809,38.33,25.23,0.36,7.31,0.00,28.08,0.70,4.48,73.20,4.A.3.a.3.I02.D03
22308,53-7121.00,12810,34.14,11.78,6.46,14.46,10.85,16.39,5.91,3.53,47.84,4.A.1.b.3.I01.D14


<br>

### Merge Job Zones into the Dataset Above

In [70]:
# Load the Job Zones table (tab-separated)
job_zones_df = pd.read_csv(f'{input_data_path}/db_27_3_text/Job Zones.txt', sep='\t')

# Keep only the occupation code and its Job Zone
job_zone_lookup = job_zones_df[['O*NET-SOC Code', 'Job Zone']]

# Left join on the occupation code so every task/DWA row is kept and
# receives the Job Zone of its occupation
task_ratings_with_job_zones = pd.merge(
    task_ratings_with_dwas,
    job_zone_lookup,
    how='left',
    on=['O*NET-SOC Code'],
)
task_ratings_with_job_zones

Unnamed: 0,O*NET-SOC Code,Task ID,FT_Daily,FT_Hourly or more,FT_More than monthly,FT_More than weekly,FT_More than yearly,FT_Several times daily,FT_Yearly or less,Importance,Relevance,DWA ID,Job Zone
0,11-1011.00,8823,46.67,5.26,11.04,16.19,9.16,7.33,4.34,4.54,94.19,4.A.4.b.4.I09.D02,5
1,11-1011.00,8824,25.27,4.81,27.41,15.58,11.14,14.21,1.59,4.15,98.79,4.A.4.a.2.I03.D14,5
2,11-1011.00,8825,35.11,3.73,12.61,18.96,19.04,10.56,0.00,4.40,100.00,4.A.2.a.4.I07.D09,5
3,11-1011.00,8826,38.47,6.38,10.18,23.83,9.27,8.70,3.17,4.39,95.84,4.A.2.b.1.I09.D01,5
4,11-1011.00,8826,38.47,6.38,10.18,23.83,9.27,8.70,3.17,4.39,95.84,4.A.2.b.4.I01.D01,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22305,53-7121.00,12807,27.65,8.34,6.88,13.95,29.21,7.93,6.05,4.08,64.04,4.A.3.a.2.I34.D01,2
22306,53-7121.00,12808,34.11,18.45,1.89,12.36,0.00,32.43,0.75,4.43,60.24,4.A.1.b.1.I01.D03,2
22307,53-7121.00,12809,38.33,25.23,0.36,7.31,0.00,28.08,0.70,4.48,73.20,4.A.3.a.3.I02.D03,2
22308,53-7121.00,12810,34.14,11.78,6.46,14.46,10.85,16.39,5.91,3.53,47.84,4.A.1.b.3.I01.D14,2
