#### By: Peyman Shahidi
#### Created: Oct 19, 2025
#### Last Edit: Oct 31, 2025

<br>

In [18]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Format floats with thousands separators and two digits after the decimal: e.g., 1000 is shown as 1,000.00
## The fully qualified option key is used on purpose: abbreviated keys are resolved by pattern
## matching and stop working if another option with a similar name is ever added.
pd.set_option('display.float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and pandas warnings are hidden as well.
warnings.simplefilter('ignore')

# Let pandas print up to 200 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 200)

In [19]:
# Folder layout, expressed relative to the directory the notebook is run from.
main_folder_path = ".."
input_data_path = "/".join([main_folder_path, "data"])
# Computed objects are stored in a sub-folder of the input data folder.
output_data_path = "/".join([input_data_path, "computed_objects"])
output_plot_path = "/".join([main_folder_path, "writeup", "plots", "anthropic_AI_index"])

In [20]:
# Create the output directories if they don't exist
import os

# exist_ok=True makes the call idempotent and removes the gap between checking for the
# directory and creating it. Unlike the exists() check, it raises FileExistsError if the
# path is occupied by a regular file, instead of failing later when something is written.
for path in [output_data_path, output_plot_path]:
    os.makedirs(path, exist_ok=True)

In [21]:
# Read the task table from the computed_objects folder into a DataFrame
# (presumably the cleaned O*NET tasks, going by the file name).
onet_tasks_csv = f'{input_data_path}/computed_objects/ONET_cleaned_tasks.csv'
ONET = pd.read_csv(onet_tasks_csv)

In [22]:
# Build unique mapping between DWA and Tasks and save as CSV
def _find_col(columns, keywords):
    """Return the first column label whose lower-cased text contains a keyword.

    Keywords are tried in the order given, so earlier keywords take priority;
    for each keyword the columns are scanned from first to last. Only the
    column label is lower-cased: keywords are used exactly as passed, so
    callers should supply them in lower case.

    Parameters
    ----------
    columns : indexable sequence of labels (e.g. a list or a pandas Index)
        Labels are converted with ``str()`` before matching, so non-string
        labels (such as integer column names) are compared by their text
        instead of raising ``AttributeError``.
    keywords : iterable of str
        Substrings to look for, in priority order.

    Returns
    -------
    The original (not lower-cased) label from ``columns`` for the first match,
    or ``None`` when no keyword is contained in any label.
    """
    lowered = [str(label).lower() for label in columns]
    for keyword in keywords:
        for position, name in enumerate(lowered):
            if keyword in name:
                return columns[position]
    return None

# Candidate substrings for each role, in priority order (the first keyword that matches wins).
# The broad fallbacks ('dwa', 'task') are listed last so the specific spellings are preferred.
column_keywords = {
    'dwa_id': ['dwa id', 'dwa_id', 'dwaid', 'work activity id', 'work_activity_id', 'dwa'],
    'dwa_title': ['dwa title', 'dwa_title', 'work activity title', 'work_activity_title'],
    'task_id': ['task id', 'task_id', 'taskid'],
    'task_title': ['task title', 'task_title', 'task'],
}

dwa_id_col = _find_col(ONET.columns, column_keywords['dwa_id'])
dwa_title_col = _find_col(ONET.columns, column_keywords['dwa_title'])
task_id_col = _find_col(ONET.columns, column_keywords['task_id'])
task_title_col = _find_col(ONET.columns, column_keywords['task_title'])

cols_map = {
    'dwa_id': dwa_id_col,
    'dwa_title': dwa_title_col,
    'task_id': task_id_col,
    'task_title': task_title_col,
}
print('Detected columns:')
for k, v in cols_map.items():
    print(f'  {k}: {v}')

# Fail early, naming the roles that could not be resolved.
required = [v for v in cols_map.values() if v is not None]
missing_roles = [role for role, col in cols_map.items() if col is None]
if missing_roles:
    raise ValueError(
        'Could not automatically find all required columns in ONET. Columns available: '
        + ', '.join(map(str, ONET.columns))
        + '. Missing roles: ' + ', '.join(missing_roles)
    )

# The broad fallback keywords can resolve two roles to the same column (e.g. 'dwa' hitting
# the title column). That would collapse the rename mapping below, so treat it as an error.
if len(set(required)) < len(required):
    raise ValueError(f'Column detection is ambiguous; one ONET column matched several roles: {cols_map}')

# Select relevant columns, drop rows with missing values, dedupe and rename to a stable schema
source_cols = [dwa_id_col, dwa_title_col, task_id_col, task_title_col]
df_mapping = (
    ONET[source_cols]
    .dropna(subset=source_cols)
    .drop_duplicates()
    .rename(columns={
        dwa_id_col: 'DWA ID',
        dwa_title_col: 'DWA Title',
        task_id_col: 'Task ID',
        task_title_col: 'Task Title',
    })
    .sort_values(['DWA ID', 'Task ID'])
    .reset_index(drop=True)
)

csv_path = f'{output_data_path}/dwa_task_mapping.csv'
df_mapping.to_csv(csv_path, index=False)
print(f'Saved mapping to {csv_path} — {len(df_mapping)} rows.')

# Also save unique DWA list (one row per distinct ID/title pair)
dwa_unique = (
    df_mapping[['DWA ID', 'DWA Title']]
    .drop_duplicates()
    .sort_values('DWA ID')
    .reset_index(drop=True)
)
dwa_unique.to_csv(f'{output_data_path}/unique_dwa.csv', index=False)
print(f'Saved unique DWA list to {output_data_path}/unique_dwa.csv — {len(dwa_unique)} rows.')


Detected columns:
  dwa_id: DWA ID
  dwa_title: DWA Title
  task_id: Task ID
  task_title: Task Title
Saved mapping to ../data/computed_objects/dwa_task_mapping.csv — 21885 rows.
Saved unique DWA list to ../data/computed_objects/unique_dwa.csv — 2081 rows.
