#### By: Peyman Shahidi
#### Created: Nov 8, 2025
#### Last Edit: Nov 8, 2025

<br>

In [None]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Format floats with thousands separators and two decimal places, e.g., 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Allow up to 200 rows to be shown when a DataFrame is displayed.
pd.set_option('display.max_rows', 200)

In [None]:
import subprocess
import os


# Keep the machine awake with the macOS built-in `caffeinate` command.
# The binary ships with macOS, so no pip package needs to be installed for it.
try:
    # -d holds a "prevent display sleep" assertion.
    # -w <pid> ties that assertion to this Python process, so caffeinate exits on
    # its own if the kernel dies before the cleanup cell gets a chance to run.
    caff_process = subprocess.Popen(
        ['caffeinate', '-d', '-w', str(os.getpid())],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
except (OSError, subprocess.SubprocessError) as e:
    # Best effort only: e.g. the binary is missing on non-macOS hosts.
    print(f"‚ö†Ô∏è Could not start caffeinate: {e}")
    print("Continuing without caffeinate - system may sleep during long processes.")
    caff_process = None
    caff_pid = None
else:
    # Store the process ID for later cleanup
    caff_pid = caff_process.pid
    print(f"Caffeinate mode ON ‚òï ‚Äì Device will stay awake (PID: {caff_process.pid})")
    print("System sleep is disabled while this process runs.")

In [None]:
# Folder layout used by the notebook; every path is relative to the working directory.
main_folder_path = ".."
input_data_path = main_folder_path + "/data"
output_data_path = input_data_path + "/computed_objects/similar_dwa_tasks"
output_plot_path = main_folder_path + "/writeup/plots"

In [None]:
# Create the output directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True makes this idempotent and avoids the race between a
    # separate existence check and the creation call.
    os.makedirs(path, exist_ok=True)

In [None]:
# Load ONET_cleaned_tasks.csv from the computed_objects folder under input_data_path.
ONET = pd.read_csv(f'{input_data_path}/computed_objects/ONET_cleaned_tasks.csv')

In [None]:
# Build unique mapping between DWA and Tasks and save as CSV
dwa_id_col = 'DWA ID'
dwa_title_col = 'DWA Title'
task_id_col = 'Task ID'
task_title_col = 'Task Title'
occupation_code_col = 'O*NET-SOC Code'
occupation_title_col = 'Occupation Title'

# Every column the mapping needs, keyed by a short role name.
cols_map = {
    'dwa_id': dwa_id_col,
    'dwa_title': dwa_title_col,
    'task_id': task_id_col,
    'task_title': task_title_col,
    'occupation_code': occupation_code_col,
    'occupation_title': occupation_title_col,
}
print('Detected columns:')
for k, v in cols_map.items():
    print(f'  {k}: {v}')

# Validate against the columns that are actually present in ONET, so a schema
# change fails here with a clear message instead of a KeyError further down.
required = list(cols_map.values())
missing_cols = [col for col in required if col not in ONET.columns]
if missing_cols:
    raise ValueError(
        'Could not find all required columns in ONET. Missing: '
        + ', '.join(missing_cols)
        + '. Columns available: '
        + ', '.join(map(str, ONET.columns))
    )

# Select relevant columns, drop rows with missing values, dedupe and rename to a stable schema
stable_names = {
    dwa_id_col: 'DWA ID',
    dwa_title_col: 'DWA Title',
    task_id_col: 'Task ID',
    task_title_col: 'Task Title',
    occupation_code_col: 'O*NET-SOC Code',
    occupation_title_col: 'Occupation Title',
}
dwa_task_mapping = (
    ONET[required]
    .dropna(subset=required)
    .drop_duplicates()
    .rename(columns=stable_names)
    .sort_values(['DWA ID', 'Task ID'])
    .reset_index(drop=True)
)

csv_path = f'{output_data_path}/dwa_task_mapping.csv'
dwa_task_mapping.to_csv(csv_path, index=False)
print(f'Saved mapping to {csv_path} ‚Äî {len(dwa_task_mapping)} rows.')

# Also save unique DWA list
dwa_unique = dwa_task_mapping[['DWA ID', 'DWA Title']].drop_duplicates().sort_values('DWA ID').reset_index(drop=True)
dwa_unique.to_csv(f'{output_data_path}/unique_dwa.csv', index=False)
print(f'Saved unique DWA list to {output_data_path}/unique_dwa.csv ‚Äî {len(dwa_unique)} rows.')

In [None]:
# Reload the DWA-task mapping from the CSV written to output_data_path
dwa_task_mapping = pd.read_csv(f'{output_data_path}/dwa_task_mapping.csv')

# Report the number of distinct DWA titles and task titles before filtering
num_unique_dwas = dwa_task_mapping['DWA Title'].nunique()
num_unique_tasks = dwa_task_mapping['Task Title'].nunique()
print(f"Before Dropping Single-Task DWAs:\n{num_unique_dwas} unique DWAs and {num_unique_tasks} unique tasks.\n")

# Count distinct task titles per DWA title and retain DWAs that have more than one
tasks_per_dwa = (
    dwa_task_mapping
    .groupby('DWA Title')['Task Title']
    .nunique()
    .reset_index()
    .rename(columns={'Task Title': 'Num Tasks'})
    .loc[lambda counts: counts['Num Tasks'] > 1]
)
tasks_per_dwa_list = tasks_per_dwa['DWA Title'].tolist()

# Keep only the mapping rows whose DWA survived the filter above
has_multiple_tasks = dwa_task_mapping['DWA Title'].isin(tasks_per_dwa_list)
dwa_task_mapping = dwa_task_mapping[has_multiple_tasks]

# Report the same two counts again after filtering
num_unique_dwas = dwa_task_mapping['DWA Title'].nunique()
num_unique_tasks = dwa_task_mapping['Task Title'].nunique()
print(f"After Dropping Single-Task DWAs:\n{num_unique_dwas} unique DWAs and {num_unique_tasks} unique tasks.")

In [None]:
from edsl import QuestionFreeText, Scenario, Model, Survey
from textwrap import dedent
import json
import os
import pandas as pd
import numpy as np

def extract_task_sequence(dwa, tasks_data, output_data_path):
    """
    Ask an LLM (through EDSL) which tasks of a DWA are similar and cache the answer as CSV.

    The result is written to ``<output_data_path>/similarTasks/<safe title>csv``
    (a dot is inserted before ``csv`` only when the title does not already end
    with one). If that file already exists the model is not queried again.

    Args:
        dwa: Title of the detailed work activity.
        tasks_data: DataFrame with the columns 'Task ID', 'Task Title',
            'Occupation Title', 'O*NET-SOC Code' and 'DWA ID' for this DWA.
        output_data_path: Base folder under which the 'similarTasks' folder lives.

    Returns:
        A tuple ``(output_file, already_existed)``:
        - ``(path, True)`` when a cached file was found,
        - ``(path, False)`` when a new file was written,
        - ``(None, True)`` when there were no tasks or the request/parsing failed.
    """
    # Check if output file already exists.
    # DWA titles normally end with ".", so the established file name is "<title>csv".
    # Only add the dot when the title lacks one, so existing cached files stay valid.
    safe_title = dwa.replace(" ", "_").replace("/", "_")
    extension = "csv" if safe_title.endswith(".") else ".csv"
    output_folder = f'{output_data_path}/similarTasks'
    output_file = os.path.join(output_folder, f"{safe_title}{extension}")

    if os.path.exists(output_file):
        return output_file, True  # Return file path and flag indicating it already existed

    # Check if we have tasks for this DWA
    if tasks_data.empty:
        print(f"‚ö†Ô∏è  Warning: No tasks found for DWA '{dwa}' - skipping")
        return None, True  # Treat as already processed to skip

    # Format tasks as numbered lists; all four lists share the same row order
    tasks_title_list = tasks_data['Task Title'].tolist()
    tasks_ids_list = tasks_data['Task ID'].tolist()
    occupations_list = tasks_data['Occupation Title'].tolist()
    occupation_codes_list = tasks_data['O*NET-SOC Code'].tolist()

    tasks_text = "\n".join([f"{i}. {task}" for i, task in enumerate(tasks_title_list, 1)])
    tasks_ids_text = "\n".join(f"{i}. {task_id}" for i, task_id in enumerate(tasks_ids_list, 1))
    occupations_text = "\n".join([f"{i}. {occupation}" for i, occupation in enumerate(occupations_list, 1)])
    occupation_codes_text = "\n".join([f"{i}. {occupation_code}" for i, occupation_code in enumerate(occupation_codes_list, 1)])
    num_tasks = len(tasks_title_list)
    max_tokens = 32000

    print(f"   ‚Ä¢ {num_tasks} tasks, using {max_tokens} max tokens")

    # Create scenario
    scenario = Scenario({
        "detailed_work_activity": dwa,
        "tasks_list": tasks_text,
        "tasks_ids": tasks_ids_text,
        "occupations_list": occupations_text,
        "occupation_codes_list": occupation_codes_text,
        "num_tasks": num_tasks
    })

    # Create question for task similarity
    q_sequence = QuestionFreeText(
        question_name="similar_tasks",
        question_text=dedent("""\
            You are an expert in workflow analysis for the detailed work activity: {{ detailed_work_activity }}.
            Below is a list of {{ num_tasks }} task IDs and titles that belong to this detailed work activity and appear across similar or different occupations (tasks and occupations are ordered such that the first task belongs to the first occupation, the second task belongs to the second occupation, etc.).
            Tasks IDs:
            {{ tasks_ids }}
            \n
            Tasks list: 
            {{ tasks_list }}
            \n
            Occupations list:
            {{ occupations_list }}
            \n
            Occupation Codes list:
            {{ occupation_codes_list }}
            \n
            Determine which tasks are similar in nature and in terms of their objectives, methods, or required skills.
            There may be more than one task associated with an occupation. Return only the most relevant task for every occupation.
            Only look for tasks that are actually similar. Do not feel obliged to return all occupations.\n
            Return the task-occupation pairs you determine as similar as a JSON array where each element has:
            - "Task ID": the exact task ID from the list of task IDs above
            - "Task Title": the exact task text from the list of tasks above
            - "O*NET-SOC Code": the exact occupation code text from the list of occupation codes above
            - "Occupation Title": the exact occupation text from the list of occupations above
            Format: [{"Task ID": 1234, "Task Title": "...", "O*NET-SOC Code": "...", "Occupation Title": "..."}, {"Task ID": 5678, "Task Title": "...", "O*NET-SOC Code": "...", "Occupation Title": "..."}, ...]
            Only return the JSON array, nothing else.
        """)
    )

    # Defined up front so the JSON error handler below can always print it,
    # even if decoding fails before the answer has been extracted.
    similar_json = None

    try:
        # Create model using openai_v2 for reasoning capabilities
        model = Model("gpt-5-mini", service_name="openai_v2", temperature=0.0, max_tokens=max_tokens)

        # Run similarity question
        similar_results = q_sequence.by(model).by([scenario]).run(progress_bar=False)
        similar_df = similar_results.to_pandas()

        # -------------------------------
        # Robustly normalize/clean the answer column to avoid NaN/type issues
        # -------------------------------
        col = 'answer.similar_tasks'

        # 1) Normalize cell types into JSON strings (or <NA>)
        def _to_json_str(v):
            if isinstance(v, str):
                return v
            if isinstance(v, (list, dict)):
                return json.dumps(v)
            if v is None:
                return pd.NA
            if isinstance(v, float) and np.isnan(v):
                return pd.NA
            # fallback: last-resort string
            return str(v)

        similar_df[col] = (
            similar_df[col]
            .apply(_to_json_str)
            .astype('string')        # pandas StringDtype, keeps <NA>
            .fillna('[]')            # robust default to empty list
            .str.strip()
            # strip possible markdown code fences
            .str.replace(r'^\s*```json\s*', '', regex=True)
            .str.replace(r'\s*```\s*$', '', regex=True)
        )

        # Debug: Print the cleaned response (safely)
        val = similar_df[col].iat[0]
        if isinstance(val, str):
            print(f"   ‚Ä¢ Raw/Clean JSON length: {len(val)}")
            print(f"   ‚Ä¢ Raw/Clean JSON preview: {val[:50]}...")
        else:
            print(f"   ‚Ä¢ Non-string value in {col}: {type(val)}")

        # 2) Parse JSON
        similar_json = similar_df[col].iat[0]
        try:
            similar_data = json.loads(similar_json or '[]')
        except json.JSONDecodeError as e:
            print(f"‚ùå JSON parsing failed, trying to clean response further...")
            print(f"   Original error: {e}")
            print(f"   Response starts with: {similar_json[:100]}...")
            # Try additional cleanup (strip stray backticks/newlines)
            cleaned = (similar_json or '').strip().strip('`').strip()
            try:
                similar_data = json.loads(cleaned or '[]')
                print(f"   ‚úÖ Successfully parsed after additional cleanup")
            except json.JSONDecodeError as e2:
                print(f"   ‚ùå Still failed after cleanup: {e2}")
                raise e  # Re-raise original error

        # A single JSON object instead of an array is treated as a one-element array.
        if isinstance(similar_data, dict):
            similar_data = [similar_data]

        task_columns = ['Task ID', 'Task Title', 'O*NET-SOC Code', 'Occupation Title']
        similar_tasks_df = pd.DataFrame(similar_data)
        if similar_tasks_df.empty:
            # The prompt allows the model to return no pairs. Build an empty frame
            # with the expected header so the column selection below succeeds and
            # the (empty) result is cached instead of being re-requested every run.
            similar_tasks_df = pd.DataFrame(columns=task_columns)

        # Add metadata columns
        similar_tasks_df['DWA Title'] = dwa
        similar_tasks_df['DWA ID'] = tasks_data['DWA ID'].iloc[0]

        # Reorder columns
        similar_tasks_df = similar_tasks_df[['DWA ID', 'DWA Title'] + task_columns]

        # Save to file
        os.makedirs(output_folder, exist_ok=True)
        similar_tasks_df.to_csv(output_file, index=False)

        print(f"   ‚úÖ Successfully processed and saved task sequence")
        return output_file, False  # Return file path and flag indicating it was newly created

    except json.JSONDecodeError as e:
        print(f"‚ùå JSON Error for '{dwa}': {e}")
        print(f"   Raw response: {similar_json}")
        return None, True  # Treat as already processed to skip
    except Exception as e:
        # Deliberate catch-all: one failing DWA must not abort the whole batch.
        print(f"‚ùå Unexpected error for '{dwa}': {e}")
        return None, True  # Treat as already processed to skip


In [None]:
# Get all unique DWA titles from the dataset
dwa_list = sorted(dwa_task_mapping['DWA Title'].unique().tolist())
print(f"Found {len(dwa_list)} unique DWAs in the dataset:")

# Set seed for reproducible random sampling
random.seed(42)
np.random.seed(42)

# # Randomly sample 10% of DWAs
# sample_size = max(1, int(len(dwa_list) * 0.10))  # Ensure at least 1 DWA
# sampled_dwas = random.sample(dwa_list, sample_size)
# print(f"Randomly selected {len(sampled_dwas)} DWAs (10% of total) for processing:")
# print(f"Sample: {sampled_dwas[:5]}..." if len(sampled_dwas) > 5 else f"Sample: {sampled_dwas}")
sampled_dwas = dwa_list

# Counters for the summary printed after the loop
processed_count = 0
skipped_count = 0
error_count = 0

# Columns handed to extract_task_sequence for each DWA
dwa_task_columns = ['Task ID', 'Task Title', 'O*NET-SOC Code', 'Occupation Title', 'DWA ID']

# Process each DWA
for i, dwa in enumerate(sampled_dwas, 1):
    # Filter data for this DWA
    dwa_data = dwa_task_mapping[dwa_task_mapping['DWA Title'] == dwa]

    # Prepare task data
    dwa_task_data = dwa_data[dwa_task_columns].drop_duplicates().reset_index(drop=True)

    # Progress output
    print(f"\n[{i}/{len(sampled_dwas)}] {dwa}")

    # Query (or load from cache) the similar tasks for this DWA
    output_file, already_existed = extract_task_sequence(dwa, dwa_task_data, output_data_path)

    # A missing output file means the call failed or had no tasks, whatever the flag says
    if output_file is None:
        error_count += 1
    elif already_existed:
        print(f"   ‚è≠Ô∏è  Already exists - skipping")
        skipped_count += 1
    else:
        processed_count += 1

# Summary
print(f"\n" + "="*50)
print(f"PROCESSING COMPLETE")
print(f"="*50)
print(f"‚Ä¢ {processed_count} DWAs processed")
print(f"‚Ä¢ {skipped_count} DWAs skipped (already existed)")
print(f"‚Ä¢ {error_count} DWAs failed")
print(f"‚Ä¢ {len(sampled_dwas)} total DWAs in sample")

In [None]:
# Stop the background caffeinate process if one is registered in this session
try:
    # A missing global and an explicit None both mean there is nothing to stop
    running_caffeinate = globals().get('caff_process')
    if running_caffeinate is None:
        print("Caffeinate was not running or already stopped.")
    else:
        running_caffeinate.terminate()
        running_caffeinate.wait()  # Block until the process has actually exited
        print("Caffeinate mode OFF üí° - System sleep is now enabled.")
except Exception as e:
    print(f"Note: {e}")
    print("Caffeinate process may have already ended.")