#### By: Peyman Shahidi
#### Created: Dec 16, 2025
#### Last Edit: Jan 25, 2026

<br>

In [None]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Format floats with thousands separators and two decimals, e.g. 1000 is shown as 1,000.00.
## The fully qualified option name is used: pandas resolves abbreviated names by pattern
## matching, which stops working as soon as a second option matches the same pattern.
pd.set_option('display.float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Suppress every warning for the rest of the session
warnings.filterwarnings(action='ignore')
# Show up to 200 rows when a DataFrame is displayed
pd.options.display.max_rows = 200

In [None]:
import subprocess
import os


# Install caffeinate package
%pip install caffeinate

# Use the macOS built-in `caffeinate` command to keep the device awake while the
# long-running cells execute.
#   -d        hold an assertion that prevents the display from sleeping
#   -w <pid>  release the assertion and exit when <pid> exits; passing this
#             process's own PID means the helper cannot outlive the kernel if the
#             cleanup cell at the end of the notebook is never run
try:
    # Start caffeinate in the background and discard anything it prints
    caff_process = subprocess.Popen(
        ['caffeinate', '-d', '-w', str(os.getpid())],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # Store the process ID for later cleanup
    caff_pid = caff_process.pid

    print(f"Caffeinate mode ON ☕ – Device will stay awake (PID: {caff_pid})")
    print("System sleep is disabled while this process runs.")

except (OSError, subprocess.SubprocessError) as e:
    # Best effort only: e.g. the command does not exist on non-macOS hosts
    print(f"⚠️ Could not start caffeinate: {e}")
    print("Continuing without caffeinate - system may sleep during long processes.")
    caff_process = None
    caff_pid = None

In [None]:
# Folder layout, all expressed relative to the current working directory
main_folder_path = ".."
input_data_path = main_folder_path + "/data"
output_data_path = input_data_path + "/computed_objects"   # sub-folder of the data folder
output_plot_path = main_folder_path + "/writeup/plots"

In [None]:
# Create directories if they don't exist
import os

# `exist_ok=True` creates missing folders (parents included) and is a no-op for
# folders that are already present, without a separate check-then-create step.
for path in [output_data_path, output_plot_path]:
    os.makedirs(path, exist_ok=True)

In [None]:
# Read the cleaned O*NET task table from the computed-objects folder
onet_tasks_csv = f'{output_data_path}/ONET_cleaned_tasks.csv'
ONET = pd.read_csv(onet_tasks_csv)

In [None]:
from edsl import QuestionFreeText, Scenario, Model, Survey
from textwrap import dedent
import json
import os
import pandas as pd
import numpy as np

def _parse_sequence_json(raw_text):
    """Parse the model's reply, tolerating extra text around the JSON array.

    The reply is parsed as-is first. If that fails and the reply contains a
    bracketed span, the text from the first "[" to the last "]" is parsed
    instead, which recovers replies such as 'Here is the order: [...]'.

    Raises:
        json.JSONDecodeError: the error from the first attempt, when neither
            variant can be parsed.
    """
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError as original_error:
        print("❌ JSON parsing failed, trying to clean response further...")
        print(f"   Original error: {original_error}")
        print(f"   Response starts with: {raw_text[:100]}...")

        start, end = raw_text.find('['), raw_text.rfind(']')
        if start == -1 or end <= start:
            raise  # Nothing that looks like an array to fall back on

        candidate = raw_text[start:end + 1]
        print(f"   Cleaned response starts with: {candidate[:100]}...")
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as retry_error:
            print(f"   ❌ Still failed after cleanup: {retry_error}")
            raise original_error  # Report the original error to the caller
        print("   ✅ Successfully parsed after additional cleanup")
        return parsed


def extract_task_sequence(occupation, tasks_data, user_prompt, output_data_path, prompt_number=1):
    """
    Extract task sequence for an occupation using EDSL workflow.

    The model is asked (via ``user_prompt``) to order the occupation's tasks; the
    ordered list is enriched with task IDs and SOC codes and written to
    ``<output_data_path>/tasks_sequences_robustness_repetitive/<occupation>/<occupation>_<prompt_number>.csv``
    (spaces and "/" in the occupation title are replaced by "_").

    Args:
        occupation: Occupation title; used in the prompt and in the file name.
        tasks_data: DataFrame with the columns 'Task Title', 'Task ID' and
            'O*NET-SOC Code' for this occupation.
        user_prompt: Question text containing the ``{{ occupation }}``,
            ``{{ num_tasks }}`` and ``{{ tasks_list }}`` placeholders.
        output_data_path: Root folder under which the CSV is written.
        prompt_number: Suffix identifying the prompt variant in the file name.

    Returns:
        A ``(output_file, flag)`` tuple:
          * ``(path, True)``  - the CSV already existed; nothing was run.
          * ``(path, False)`` - the CSV was newly created.
          * ``(None, True)``  - no tasks, invalid reply, or an error; the caller
            should treat the occupation as skipped. Errors are printed, not raised.
    """
    # Check if output file already exists
    safe_title = occupation.replace(" ", "_").replace("/", "_")
    # One folder per occupation, with the prompt number in the filename
    output_folder = f'{output_data_path}/tasks_sequences_robustness_repetitive/{safe_title}'
    output_file = os.path.join(output_folder, f"{safe_title}_{prompt_number}.csv")

    if os.path.exists(output_file):
        return output_file, True  # Return file path and flag indicating it already existed

    # Check if we have tasks for this occupation
    if tasks_data.empty:
        print(f"⚠️  Warning: No tasks found for occupation '{occupation}' - skipping")
        return None, True  # Treat as already processed to skip

    # Lookups used to re-attach identifiers to the titles the model returns
    task_id_mapping = dict(zip(tasks_data['Task Title'], tasks_data['Task ID']))
    soc_code_mapping = dict(zip(tasks_data['Task Title'], tasks_data['O*NET-SOC Code']))

    # Format tasks as numbered list
    tasks_list = tasks_data['Task Title'].tolist()
    tasks_text = "\n".join(f"{i}. {task}" for i, task in enumerate(tasks_list, 1))
    num_tasks = len(tasks_list)
    max_tokens = 32000

    print(f"   • {num_tasks} tasks, using {max_tokens} max tokens")

    # Values substituted into the prompt's placeholders
    scenario = Scenario({
        "occupation": occupation,
        "tasks_list": tasks_text,
        "num_tasks": num_tasks
    })

    # Create question for task sequencing using the chosen template
    q_sequence = QuestionFreeText(
        question_name="task_sequence",
        question_text=user_prompt
    )

    try:
        # Create model using openai_v2 for reasoning capabilities
        model = Model("gpt-5-mini", service_name="openai_v2", temperature=0.0, max_tokens=max_tokens)

        # Run sequence question
        sequence_results = q_sequence.by(model).by([scenario]).run(progress_bar=False)
        sequence_json = sequence_results.to_pandas()['answer.task_sequence'][0]

        # Debug: Print the raw response before cleaning
        print(f"   • Raw JSON length: {len(str(sequence_json))}")
        print(f"   • Raw JSON preview: {str(sequence_json)[:50]}...")

        # Anything but a string (NaN, None, an already-parsed object) is rejected.
        # A plain type check is used because pd.isna() on a list-like reply
        # returns an array whose truth value is ambiguous.
        if not isinstance(sequence_json, str):
            print(f"❌ Error: Invalid response for '{occupation}' - got {type(sequence_json)} instead of string")
            return None, True  # Treat as already processed to skip

        # Remove markdown code fences if present
        sequence_json = sequence_json.replace('```json', '').replace('```', '').strip()
        print(f"   • Cleaned JSON preview: {sequence_json[:50]}...")

        sequence_data = _parse_sequence_json(sequence_json)
        ordered_sequence_df = pd.DataFrame(sequence_data)

        # Add metadata columns
        ordered_sequence_df['Occupation Title'] = occupation
        ordered_sequence_df['Task ID'] = ordered_sequence_df['Task Title'].map(task_id_mapping)
        ordered_sequence_df['O*NET-SOC Code'] = ordered_sequence_df['Task Title'].map(soc_code_mapping)

        # Titles the model reworded cannot be mapped back and get blank IDs;
        # surface that instead of writing the gaps silently.
        unmatched = int((~ordered_sequence_df['Task Title'].isin(list(task_id_mapping))).sum())
        if unmatched:
            print(f"   ⚠️  {unmatched} returned task title(s) do not match the input list exactly - Task ID / SOC code left blank")

        # Reorder columns
        ordered_sequence_df = ordered_sequence_df[['Task Position', 'Task Title', 'Task ID', 'O*NET-SOC Code', 'Occupation Title']]

        # Save to file
        os.makedirs(output_folder, exist_ok=True)
        ordered_sequence_df.to_csv(output_file, index=False)

        print("   ✅ Successfully processed and saved task sequence")
        return output_file, False  # Return file path and flag indicating it was newly created

    except json.JSONDecodeError as e:
        print(f"❌ JSON Error for '{occupation}': {e}")
        print(f"   Raw response: {sequence_json}")
        return None, True  # Treat as already processed to skip
    except Exception as e:
        # Deliberate catch-all: one failing occupation must not abort the batch
        print(f"❌ Unexpected error for '{occupation}': {e}")
        return None, True  # Treat as already processed to skip

In [None]:
# Opening shared by every prompt variant. The {{ ... }} placeholders carry the same
# names as the Scenario keys built in extract_task_sequence.
# The text starts on the line after `"""\` so that (a) no stray backslash is sent to
# the model and (b) dedent() sees a common indent and can actually strip it.
# NOTE: sequence files already on disk were generated with the previous wording.
prefix_text = dedent("""\
    You are an expert in workflow analysis for the occupation: {{ occupation }}.
    Below is a list of {{ num_tasks }} tasks that are part of this occupation:
    {{ tasks_list }}
""")

# Closing shared by every prompt variant: pins the reply to a bare JSON array whose
# keys match the column names read back in extract_task_sequence.
return_characteristics_text = dedent("""\
    Return your answer as a JSON array where each element has:
        - "Task Position": the sequence number (1, 2, 3, etc.)
        - "Task Title": the exact task text from the list above
    Format: [{"Task Position": 1, "Task Title": "..."}, {"Task Position": 2, "Task Title": "..."}, ...]
    Only return the JSON array, nothing else.
""")


# Prompt variants used for the robustness runs. The position in this list is the
# prompt's identity, so entries must not be reordered.
user_prompts_list = [
    # 1. Main spec — preserved verbatim
    "Provide the typical sequential order in which these tasks are performed in a real-world workflow.",
    # 2. Narrative / temporal
    "Imagine a typical workday for this occupation. As the day unfolds, tasks arise and are completed as needed. Order the tasks in the sequence they most naturally occur.",
    # 3. Input–output logic
    "For each task, consider its inputs and outputs. Order tasks so outputs of earlier tasks plausibly feed into later tasks. If tasks are parallel, place the more upstream task first.",
    # 4. Efficiency without templates
    "Order tasks to minimize rework, waiting, and unnecessary handoffs. Assume an experienced worker executing the workflow efficiently.",
    # 5. Backward reasoning, then forward
    "Think about what must ultimately be produced in this occupation and what needs to happen before that. Use this reasoning to produce a natural forward sequence of tasks.",
    # 6. Dependency-first, no phases
    "Identify which tasks logically depend on others, then order the tasks in a single sequence consistent with those dependencies and typical practice.",
    # 7. Information flow, no staging
    "Order tasks according to how information is generated, transformed, and used over the course of the work.",
    # 8. Error prevention lens
    "Order tasks based on when mistakes would be most costly, placing tasks that prevent or constrain downstream errors earlier.",
    # 9. Decision salience
    "Order tasks so that tasks informing important decisions tend to occur before tasks that rely on those decisions.",
    # 10. Practitioner intuition
    "Order the tasks as an experienced practitioner would intuitively carry them out, without explicitly planning or formalizing the workflow.",
    # 11. Revealed practice
    "Order the tasks to reflect how the work is most commonly carried out in practice, rather than how it is formally described.",
]


In [None]:
# Occupations to process: the unique titles listed in the repetitive-occupations crosswalk
# occupations_list = sorted(ONET['Occupation Title'].unique().tolist())
repetitive_occupations = pd.read_csv(f"{input_data_path}/computed_objects/repetitive_onet_detailedOcc_occ_crosswalk.csv")
occupations_list = repetitive_occupations['Occupation Title'].unique().tolist()
print(f"Found {len(occupations_list)} unique occupations in the dataset:")

# Set seed for reproducible random sampling (only relevant if the sampling below is re-enabled)
random.seed(42)
np.random.seed(42)

# # Randomly sample 10% of occupations
# sample_size = max(1, int(len(occupations_list) * 0.1))  # Ensure at least 1 occupation
# sampled_occupations = random.sample(occupations_list, sample_size)
# print(f"Randomly selected {len(sampled_occupations)} occupations (10% of total) for processing:")
# print(f"Sample: {sampled_occupations[:5]}..." if len(sampled_occupations) > 5 else f"Sample: {sampled_occupations}")
sampled_occupations = occupations_list

# Build each occupation's de-duplicated task table once; it is identical for every
# prompt, so there is no need to re-filter ONET inside the prompt loop.
task_columns = ['Task ID', 'Task Title', 'O*NET-SOC Code']
tasks_by_occupation = {
    occupation: (
        ONET.loc[ONET['Occupation Title'] == occupation, task_columns]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    for occupation in sampled_occupations
}

# Running totals across all prompts
processed_count = 0
skipped_count = 0
error_count = 0

# NOTE: prompt_number is 0-based here, so output files are suffixed _0, _1, ...
# while the summaries below display 1-based prompt numbers.
for prompt_number, prompt_text in enumerate(user_prompts_list):
    # Combine prefix, prompt, and return characteristics into a single user prompt
    user_prompt = prefix_text + '\n' + prompt_text + '\n' + return_characteristics_text

    # Per-prompt tallies, so each summary adds up to the number of occupations
    prompt_processed = 0
    prompt_skipped = 0
    prompt_errors = 0

    for i, occupation in enumerate(sampled_occupations, 1):
        print(f"\n[{i}/{len(sampled_occupations)}] {occupation}")

        # Extract task sequence
        output_file, already_existed = extract_task_sequence(
            occupation, tasks_by_occupation[occupation], user_prompt, output_data_path, prompt_number
        )

        # A missing path means "no tasks / invalid reply / error", whatever the flag says
        if output_file is None:
            prompt_errors += 1
        elif already_existed:
            print("   ⏭️  Already exists - skipping")
            prompt_skipped += 1
        else:
            prompt_processed += 1

    processed_count += prompt_processed
    skipped_count += prompt_skipped
    error_count += prompt_errors

    # Prompt-level summary
    print(f"\n{'='*50}")
    print(f"PROMPT {prompt_number+1}/{len(user_prompts_list)} COMPLETE")
    print(f"{'='*50}")
    print(f"• {prompt_processed} occupations processed")
    print(f"• {prompt_skipped} occupations skipped")
    print(f"• {prompt_errors} occupations failed")
    print(f"• {len(sampled_occupations)} total occupations")

In [None]:
# Clean up caffeinate process
# Looking the handle up via globals() keeps this cell runnable even when the
# start-up cell was skipped and the name was never defined.
caff_process = globals().get('caff_process')
try:
    # poll() is None only while the helper is still running, so a helper that
    # already exited (or a second run of this cell) is reported as such.
    if caff_process is not None and caff_process.poll() is None:
        caff_process.terminate()
        caff_process.wait()  # Wait for process to terminate
        print("Caffeinate mode OFF 💡 - System sleep is now enabled.")
    else:
        print("Caffeinate was not running or already stopped.")
except OSError as e:
    print(f"Note: {e}")
    print("Caffeinate process may have already ended.")
finally:
    # Drop the stale handle so later cells do not act on a finished process
    caff_process = None
    caff_pid = None