#### By: Peyman Shahidi
#### Created: Nov 8, 2025
#### Last Edit: Nov 8, 2025

<br>

In [10]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

# Render floats with thousands separators and two decimals (e.g. 1000 is shown as 1,000.00).
# The fully-qualified option name is used on purpose: pandas resolves shorthand names by
# pattern match, which stops working if a later release adds another matching option.
pd.set_option('display.float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Allow up to 200 rows before pandas truncates a displayed frame.
pd.options.display.max_rows = 200

In [11]:
import subprocess
import os


# Keep the machine awake with the macOS built-in `caffeinate` command-line tool.
# No pip package is required for that: the former `%pip install caffeinate` line installed an
# unpinned PyPI package that nothing in this notebook imports, so it has been removed.

# Stop a helper left over from an earlier run of this cell, so re-running it never
# accumulates orphaned caffeinate processes.
_previous_caff = globals().get('caff_process')
if _previous_caff is not None and _previous_caff.poll() is None:
    _previous_caff.terminate()
    _previous_caff.wait()

try:
    # Start caffeinate in the background.
    #   -d        hold an assertion that keeps the display from sleeping
    #   -w <pid>  release the assertion and exit once this kernel process exits, so the
    #             helper cannot outlive the notebook if the cleanup cell is never run
    caff_process = subprocess.Popen(['caffeinate', '-d', '-w', str(os.getpid())],
                                    stdout=subprocess.DEVNULL,
                                    stderr=subprocess.DEVNULL)
    print(f"Caffeinate mode ON ‚òï ‚Äì Device will stay awake (PID: {caff_process.pid})")
    print("System sleep is disabled while this process runs.")

    # Store the process ID for later cleanup
    caff_pid = caff_process.pid

except Exception as e:
    # Best effort only: e.g. the binary does not exist on non-macOS hosts.
    print(f"‚ö†Ô∏è Could not start caffeinate: {e}")
    print("Continuing without caffeinate - system may sleep during long processes.")
    caff_process = None
    caff_pid = None

Note: you may need to restart the kernel to use updated packages.
Caffeinate mode ON ‚òï ‚Äì Device will stay awake (PID: 51770)
System sleep is disabled while this process runs.


In [12]:
# All locations are expressed relative to the notebook's parent directory.
main_folder_path = ".."

# Directory the notebook reads its inputs from.
input_data_path = main_folder_path + "/data"

# Directory for the tables this notebook writes.
output_data_path = input_data_path + "/computed_objects/similar_dwa_tasks"

# Directory created alongside the data output for plots.
output_plot_path = main_folder_path + "/writeup/plots"

In [13]:
# Create directories if they don't exist
import os

# exist_ok=True makes the call idempotent and removes the check-then-create race that a
# separate os.path.exists() test has when the directory appears in between.
for path in [output_data_path, output_plot_path]:
    os.makedirs(path, exist_ok=True)

In [14]:
# Read the cleaned O*NET tasks CSV from the computed_objects folder into a DataFrame.
ONET = pd.read_csv(input_data_path + '/computed_objects/ONET_cleaned_tasks.csv')

In [15]:
# Build unique mapping between DWA and Tasks and save as CSV
dwa_id_col = 'DWA ID'
dwa_title_col = 'DWA Title'
task_id_col = 'Task ID'
task_title_col = 'Task Title'
occupation_code_col = 'O*NET-SOC Code'
occupation_title_col = 'Occupation Title'

cols_map = {
    'dwa_id': dwa_id_col,
    'dwa_title': dwa_title_col,
    'task_id': task_id_col,
    'task_title': task_title_col,
}
print('Detected columns:')
for k,v in cols_map.items():
    print(f'  {k}: {v}')

# Every column the mapping needs, including the two occupation columns.
required = [*cols_map.values(), occupation_code_col, occupation_title_col]

# Check against the frame itself. The earlier test only counted hard-coded, non-None
# names, so it could never fail and a missing column surfaced later as a bare KeyError.
missing = [col for col in required if col not in ONET.columns]
if missing:
    raise ValueError(
        f'ONET is missing required columns: {missing}. Columns available: '
        + ', '.join(map(str, ONET.columns))
    )

# Select relevant columns, drop rows with any missing value, dedupe and sort.
# (No rename is needed: the column variables above already hold the final names.)
dwa_task_mapping = (
    ONET[required]
    .dropna(subset=required)
    .drop_duplicates()
    .sort_values([dwa_id_col, task_id_col])
    .reset_index(drop=True)
)

csv_path = f'{output_data_path}/dwa_task_mapping.csv'
dwa_task_mapping.to_csv(csv_path, index=False)
print(f'Saved mapping to {csv_path} ‚Äî {len(dwa_task_mapping)} rows.')

# Also save unique DWA list
dwa_unique = (
    dwa_task_mapping[[dwa_id_col, dwa_title_col]]
    .drop_duplicates()
    .sort_values(dwa_id_col)
    .reset_index(drop=True)
)
dwa_unique.to_csv(f'{output_data_path}/unique_dwa.csv', index=False)
print(f'Saved unique DWA list to {output_data_path}/unique_dwa.csv ‚Äî {len(dwa_unique)} rows.')

Detected columns:
  dwa_id: DWA ID
  dwa_title: DWA Title
  task_id: Task ID
  task_title: Task Title
Saved mapping to ../data/computed_objects/similar_dwa_tasks/dwa_task_mapping.csv ‚Äî 21885 rows.
Saved unique DWA list to ../data/computed_objects/similar_dwa_tasks/unique_dwa.csv ‚Äî 2081 rows.


In [16]:
# Re-read the DWA-task mapping that was written to disk earlier.
dwa_task_mapping = pd.read_csv(output_data_path + '/dwa_task_mapping.csv')

# Report how many distinct DWA titles and task titles exist before filtering.
num_unique_dwas = dwa_task_mapping['DWA Title'].nunique()
num_unique_tasks = dwa_task_mapping['Task Title'].nunique()
print(f"Before Dropping Single-Task DWAs:\n{num_unique_dwas} unique DWAs and {num_unique_tasks} unique tasks.\n")

# Count distinct task titles per DWA title and keep only DWAs that have more than one.
tasks_per_dwa = (
    dwa_task_mapping
    .groupby('DWA Title')['Task Title']
    .nunique()
    .reset_index()
    .rename(columns={'Task Title': 'Num Tasks'})
    .loc[lambda counts: counts['Num Tasks'] > 1]
)
tasks_per_dwa_list = tasks_per_dwa['DWA Title'].tolist()

# Restrict the mapping to the multi-task DWAs.
keep_rows = dwa_task_mapping['DWA Title'].isin(tasks_per_dwa_list)
dwa_task_mapping = dwa_task_mapping[keep_rows]

# Report the same two counts after filtering.
num_unique_dwas = dwa_task_mapping['DWA Title'].nunique()
num_unique_tasks = dwa_task_mapping['Task Title'].nunique()
print(f"After Dropping Single-Task DWAs:\n{num_unique_dwas} unique DWAs and {num_unique_tasks} unique tasks.")

Before Dropping Single-Task DWAs:
2081 unique DWAs and 16491 unique tasks.

After Dropping Single-Task DWAs:
2052 unique DWAs and 16470 unique tasks.


In [17]:
from edsl import QuestionFreeText, Scenario, Model, Survey
from textwrap import dedent
import json
import os
import pandas as pd
import numpy as np

def _extract_json_array(text):
    """Parse the outermost ``[...]`` span of ``text`` as JSON; raise json.JSONDecodeError if none parses."""
    start, end = text.find('['), text.rfind(']')
    if start == -1 or end < start:
        raise json.JSONDecodeError("No JSON array found in response", text, 0)
    return json.loads(text[start:end + 1])


def extract_task_sequence(dwa, tasks_data, output_data_path):
    """
    Ask an LLM (through EDSL) which tasks of one DWA are similar and cache the answer as a CSV.

    The name is historical: the function does not order tasks, it selects the subset the
    model judges similar and writes it to ``<output_data_path>/similarTasks/<dwa>.csv``
    (spaces and "/" in the DWA title are replaced by "_" in the file name).

    Parameters
    ----------
    dwa : str
        DWA title; used in the prompt, in the output file name and as the 'DWA Title' column.
    tasks_data : pandas.DataFrame
        Rows for this DWA with the columns 'Task ID', 'Task Title', 'O*NET-SOC Code',
        'Occupation Title' and 'DWA ID'.
    output_data_path : str
        Directory under which the ``similarTasks`` folder is created.

    Returns
    -------
    tuple
        ``(output_file, already_existed)``:
        * ``(path, True)``  - the CSV was already on disk; the model is not called.
        * ``(path, False)`` - the CSV was written by this call.
        * ``(None, True)``  - nothing was written (no input tasks, an unusable model
          reply, or any exception). Failures are not cached, so a later call retries.
    """
    # Check if output file already exists
    safe_title = dwa.replace(" ", "_").replace("/", "_")
    output_folder = f'{output_data_path}/similarTasks'
    output_file = os.path.join(output_folder, f"{safe_title}.csv")

    if os.path.exists(output_file):
        return output_file, True  # Return file path and flag indicating it already existed

    # Check if we have tasks for this DWA
    if tasks_data.empty:
        print(f"‚ö†Ô∏è  Warning: No tasks found for DWA '{dwa}' - skipping")
        return None, True  # Nothing written; the caller counts a None path as a failure

    # Lookups from task title to the metadata written next to each returned task.
    # NOTE: they are keyed by title, so if one title occurs in several rows the last row wins.
    task_id_mapping = dict(zip(tasks_data['Task Title'], tasks_data['Task ID']))
    dwa_code_mapping = dict(zip(tasks_data['Task Title'], tasks_data['DWA ID']))
    occupation_code_mapping = dict(zip(tasks_data['Task Title'], tasks_data['O*NET-SOC Code']))
    occupation_title_mapping = dict(zip(tasks_data['Task Title'], tasks_data['Occupation Title']))

    # Format tasks and occupations as two parallel numbered lists
    tasks_list = tasks_data['Task Title'].tolist()
    occupations_list = tasks_data['Occupation Title'].tolist()
    tasks_text = "\n".join([f"{i}. {task}" for i, task in enumerate(tasks_list, 1)])
    occupations_text = "\n".join([f"{i}. {occupation}" for i, occupation in enumerate(occupations_list, 1)])
    num_tasks = len(tasks_list)
    max_tokens = 32000

    print(f"   ‚Ä¢ {num_tasks} tasks, using {max_tokens} max tokens")

    # Create scenario
    scenario = Scenario({
        "detailed_work_activity": dwa,
        "tasks_list": tasks_text,
        "occupations_list": occupations_text,
        "num_tasks": num_tasks
    })

    # Create question for task similarity.
    # NOTE(review): the prompt asks for the "exact" Task ID and O*NET-SOC Code, but only
    # numbered titles are shown to the model. The values the model returns for those
    # fields are therefore discarded below and re-derived from the returned task title.
    q_sequence = QuestionFreeText(
        question_name="similar_tasks",
        question_text=dedent("""\
            You are an expert in workflow analysis for the detailed work activity: {{ detailed_work_activity }}.
            Below is a list of {{ num_tasks }} tasks that belong to this detailed work activity and appear across similar or different occupations (tasks and occupations are ordered such that the first task belongs to the first occupation, the second task belongs to the second occupation, etc.).
            Tasks list: 
            {{ tasks_list }}
            \n
            Occupations list:
            {{ occupations_list }}
            \n
            Determine which tasks within the detailed work activity are similar in nature and in terms of their objectives, methods, or required skills.
            Pick only a single task from each occupation.
            You do not have to include all tasks or a task from all occupations. Only look for those tasks that are actually similar.
            Return the task-occupation pairs you determine as similar as a JSON array where each element has:
            - "Task ID": the exact task ID from the list of tasks above
            - "Task Title": the exact task text from the list of tasks above
            - "O*NET-SOC Code": the exact O*NET-SOC Code from the list of occupations above
            - "Occupation Title": the exact occupation text from the list of occupations above
            Format: [{"Task ID": 1234, "Task Title": "..."}, {"Task ID": 5678, "Task Title": "..."}, ...]
            Only return the JSON array, nothing else.
        """)
    )

    try:
        # Create model using openai_v2 for reasoning capabilities
        model = Model("gpt-5-mini", service_name="openai_v2", temperature=0.0, max_tokens=max_tokens)

        # Run similarity question
        similar_results = q_sequence.by(model).by([scenario]).run(progress_bar=False)
        similar_df = similar_results.to_pandas()
        similar_json = similar_df['answer.similar_tasks'][0]

        # Debug: Print the raw response before cleaning
        print(f"   ‚Ä¢ Raw JSON length: {len(str(similar_json))}")
        print(f"   ‚Ä¢ Raw JSON preview: {str(similar_json)[:50]}...")

        # Validate the type before any string handling. Testing isinstance first also
        # avoids calling pd.isna() on list-like replies, whose array result has no single
        # truth value.
        if not isinstance(similar_json, str):
            print(f"‚ùå Error: Invalid response for '{dwa}' - got {type(similar_json)} instead of string")
            return None, True  # Nothing written

        # Clean the JSON response by removing markdown code fences if present
        similar_json = similar_json.replace('```json', '').replace('```', '').strip()
        print(f"   ‚Ä¢ Cleaned JSON preview: {similar_json[:50]}...")

        # Try to parse JSON; if the reply wraps the array in extra text, fall back to the
        # outermost [...] span. A failure of the fallback propagates to the handler below.
        try:
            similar_data = json.loads(similar_json)
        except json.JSONDecodeError as e:
            print(f"‚ùå JSON parsing failed, trying to clean response further...")
            print(f"   Original error: {e}")
            print(f"   Response starts with: {similar_json[:100]}...")
            similar_data = _extract_json_array(similar_json)
            print(f"   ‚úÖ Successfully parsed after additional cleanup")

        # An empty or non-array reply carries no usable rows; report it explicitly instead
        # of failing later with a KeyError on the missing 'Task Title' column.
        if not isinstance(similar_data, list) or not similar_data:
            print(f"   Error: model returned no similar tasks for '{dwa}' - nothing saved")
            return None, True

        similar_tasks_df = pd.DataFrame(similar_data)
        if 'Task Title' not in similar_tasks_df.columns:
            print(f"   Error: response for '{dwa}' has no 'Task Title' field - nothing saved")
            return None, True

        # Add metadata columns, re-deriving every identifier from the returned task title
        similar_tasks_df['DWA Title'] = dwa
        similar_tasks_df['Task ID'] = similar_tasks_df['Task Title'].map(task_id_mapping)
        similar_tasks_df['DWA ID'] = similar_tasks_df['Task Title'].map(dwa_code_mapping)
        similar_tasks_df['O*NET-SOC Code'] = similar_tasks_df['Task Title'].map(occupation_code_mapping)
        similar_tasks_df['Occupation Title'] = similar_tasks_df['Task Title'].map(occupation_title_mapping)

        # Titles the model did not reproduce exactly cannot be mapped back; say so rather
        # than silently writing rows with blank identifiers.
        unmatched = similar_tasks_df['Task ID'].isna()
        if unmatched.any():
            print(f"   Warning: {int(unmatched.sum())} returned task title(s) do not match the input list exactly; their ID and occupation fields are blank")

        # Reorder columns
        similar_tasks_df = similar_tasks_df[['DWA ID', 'DWA Title', 'Task Title', 'Task ID', 'O*NET-SOC Code', 'Occupation Title']]

        # Save to file
        os.makedirs(output_folder, exist_ok=True)
        similar_tasks_df.to_csv(output_file, index=False)

        print(f"   ‚úÖ Successfully processed and saved task sequence")
        return output_file, False  # Return file path and flag indicating it was newly created

    except json.JSONDecodeError as e:
        print(f"‚ùå JSON Error for '{dwa}': {e}")
        print(f"   Raw response: {similar_json}")
        return None, True  # Nothing written
    except Exception as e:
        # Deliberate best effort: one failing DWA must not abort a long batch run.
        print(f"‚ùå Unexpected error for '{dwa}': {e}")
        return None, True  # Nothing written


In [None]:
# Get all unique DWA titles from the dataset
dwa_list = sorted(dwa_task_mapping['DWA Title'].unique().tolist())
print(f"Found {len(dwa_list)} unique DWAs in the dataset:")

# Seed both RNGs so that any random sub-sampling of dwa_list is reproducible
random.seed(42)
np.random.seed(42)

# Process every DWA. For a cheaper trial run, assign a subset of dwa_list here instead.
sampled_dwas = dwa_list

# Outcome counters for the summary below
processed_count = 0
skipped_count = 0
error_count = 0

for i, dwa in enumerate(sampled_dwas, 1):
    # Filter data for this DWA
    dwa_data = dwa_task_mapping[dwa_task_mapping['DWA Title'] == dwa]

    # Prepare task data: one row per distinct task/occupation combination
    dwa_task_data = dwa_data[['Task ID', 'Task Title', 'O*NET-SOC Code', 'Occupation Title', 'DWA ID']].drop_duplicates().reset_index(drop=True)

    # Progress output
    print(f"\n[{i}/{len(sampled_dwas)}] {dwa}")

    # Query the model (or reuse the cached CSV) for this DWA
    output_file, already_existed = extract_task_sequence(dwa, dwa_task_data, output_data_path)

    # A None path means nothing was written, whatever the second flag says
    if output_file is None:
        error_count += 1
    elif already_existed:
        print(f"   ‚è≠Ô∏è  Already exists - skipping")
        skipped_count += 1
    else:
        processed_count += 1

# Summary
print("\n" + "="*50)
print("PROCESSING COMPLETE")
print("="*50)
print(f"‚Ä¢ {processed_count} DWAs processed")
print(f"‚Ä¢ {skipped_count} DWAs skipped (already existed)")
print(f"‚Ä¢ {error_count} DWAs failed")
print(f"‚Ä¢ {len(sampled_dwas)} total DWAs in sample")

Found 2052 unique occupations in the dataset:

[1/1] Administer tests to assess educational needs or progress.
   ‚Ä¢ 57 tasks, using 32000 max tokens


   ‚Ä¢ Raw JSON length: 11409
   ‚Ä¢ Raw JSON preview: [{"Task ID":1,"Task Title":"Compile, administer, a...
   ‚Ä¢ Cleaned JSON preview: [{"Task ID":1,"Task Title":"Compile, administer, a...
   ‚úÖ Successfully processed and saved task sequence

PROCESSING COMPLETE
‚Ä¢ 1 DWAs processed
‚Ä¢ 0 DWAs skipped (already existed)
‚Ä¢ 0 DWAs failed
‚Ä¢ 1 total DWAs in sample


In [19]:
# Clean up caffeinate process
try:
    if 'caff_process' in globals() and caff_process is not None:
        caff_process.terminate()
        caff_process.wait()  # Wait for process to terminate
        # Forget the finished process so that re-running this cell reports
        # "already stopped" instead of announcing a second shutdown.
        caff_process = None
        caff_pid = None
        print("Caffeinate mode OFF üí° - System sleep is now enabled.")
    else:
        print("Caffeinate was not running or already stopped.")
except Exception as e:
    # Best effort: a failure to stop the helper must not fail the notebook run.
    print(f"Note: {e}")
    print("Caffeinate process may have already ended.")

Caffeinate mode OFF üí° - System sleep is now enabled.
