#### By: Peyman Shahidi
#### Created: Oct 14, 2025
#### Last Edit: Oct 22, 2025

<br>

In [None]:
#Python
# General-purpose imports for the notebook session.
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Display floats with thousands separators and two decimal places, e.g. 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# Suppress all Python warnings for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')
# Let pandas print up to 200 rows before truncating a displayed DataFrame
pd.set_option('display.max_rows', 200)

In [None]:
import subprocess
import os


# Install caffeinate package
%pip install caffeinate

# Use macOS built-in caffeinate command for reliability
# This prevents the system from sleeping while the process is running
try:
    # Start caffeinate in the background
    caff_process = subprocess.Popen(['caffeinate', '-d'], 
                                   stdout=subprocess.DEVNULL, 
                                   stderr=subprocess.DEVNULL)
    print(f"Caffeinate mode ON ☕ – Device will stay awake (PID: {caff_process.pid})")
    print("System sleep is disabled while this process runs.")
    
    # Store the process ID for later cleanup
    caff_pid = caff_process.pid
    
except Exception as e:
    print(f"⚠️ Could not start caffeinate: {e}")
    print("Continuing without caffeinate - system may sleep during long processes.")
    caff_process = None
    caff_pid = None

In [None]:
# Project-relative locations; every path is derived from the repository root one level up.
main_folder_path = ".."
input_data_path = main_folder_path + "/data"
output_data_path = input_data_path + "/computed_objects"
output_plot_path = main_folder_path + "/writeup/plots"

In [None]:
# Create directories if they don't exist
import os

# exist_ok=True makes the call idempotent and removes the check-then-create race
# of a separate os.path.exists() test.
for path in [output_data_path, output_plot_path]:
    os.makedirs(path, exist_ok=True)

In [None]:
from edsl import QuestionFreeText, Scenario, Model, Survey
from textwrap import dedent
import json
import os
import pandas as pd
import numpy as np

def _parse_sequence_response(response_text):
    """Parse the model's reply into Python data, tolerating Markdown fences and text around the JSON array."""
    # Strip Markdown code fences the model sometimes wraps around its answer
    cleaned = response_text.replace('```json', '').replace('```', '').strip()
    print(f"   • Cleaned JSON preview: {cleaned[:50]}...")

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing failed, trying to clean response further...")
        print(f"   Original error: {e}")
        print(f"   Response starts with: {cleaned[:100]}...")

        # Fall back to the outermost [...] span so leading/trailing commentary
        # (or a leftover fence label) does not break parsing.
        start = cleaned.find('[')
        end = cleaned.rfind(']')
        if start == -1 or end <= start:
            raise  # Nothing that looks like a JSON array; surface the original error

        candidate = cleaned[start:end + 1]
        print(f"   Cleaned response starts with: {candidate[:100]}...")
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as e2:
            print(f"   ❌ Still failed after cleanup: {e2}")
            raise e  # Re-raise original error
        print(f"   ✅ Successfully parsed after additional cleanup")
        return parsed


def extract_task_sequence(occupation, tasks_data, output_data_path):
    """
    Extract the typical task sequence for an occupation using an EDSL workflow.

    The model is asked to order the occupation's tasks; the parsed answer is saved
    as '<output_data_path>/tasks_sequences/<occupation>.csv' with Task ID and
    O*NET-SOC Code looked up locally from `tasks_data` by exact task title.

    Args:
        occupation: Occupation title. Used in the prompt and, with spaces and
            slashes replaced by underscores, as the CSV file name.
        tasks_data: DataFrame with 'Task Title', 'Task ID' and 'O*NET-SOC Code' columns.
        output_data_path: Base output folder.

    Returns:
        Tuple (output_file, skipped_flag):
        - (path, True) if the CSV already existed and no query was made,
        - (path, False) if the CSV was newly created,
        - (None, True) if there were no tasks or the query/parsing failed
          (errors are printed, not raised).
    """
    # Check if output file already exists
    safe_title = occupation.replace(" ", "_").replace("/", "_")
    output_folder = f'{output_data_path}/tasks_sequences'
    output_file = os.path.join(output_folder, f"{safe_title}.csv")

    if os.path.exists(output_file):
        return output_file, True  # Return file path and flag indicating it already existed

    # Check if we have tasks for this occupation
    if tasks_data.empty:
        print(f"⚠️  Warning: No tasks found for occupation '{occupation}' - skipping")
        return None, True  # Treat as already processed to skip

    # Create task mappings (task title -> identifiers) used to re-attach metadata locally
    task_id_mapping = dict(zip(tasks_data['Task Title'], tasks_data['Task ID']))
    soc_code_mapping = dict(zip(tasks_data['Task Title'], tasks_data['O*NET-SOC Code']))

    # Format tasks as numbered list
    tasks_list = tasks_data['Task Title'].tolist()
    tasks_text = "\n".join(f"{i}. {task}" for i, task in enumerate(tasks_list, 1))
    num_tasks = len(tasks_list)
    max_tokens = 32000

    print(f"   • {num_tasks} tasks, using {max_tokens} max tokens")

    # Create scenario
    scenario = Scenario({
        "occupation": occupation,
        "tasks_list": tasks_text,
        "num_tasks": num_tasks
    })

    # Create question for task sequencing
    q_sequence = QuestionFreeText(
        question_name="task_sequence",
        question_text=dedent("""\
            You are an expert in workflow analysis for the occupation: {{ occupation }}.
            Below is a list of {{ num_tasks }} tasks that are part of this occupation:
            {{ tasks_list }}
            Provide the typical sequential order in which these tasks are performed in a real-world workflow.
            Return your answer as a JSON array where each element has:
            - "Task Position": the sequence number (1, 2, 3, etc.)
            - "Task Title": the exact task text from the list above
            Format: [{"Task Position": 1, "Task Title": "..."}, {"Task Position": 2, "Task Title": "..."}, ...]
            Only return the JSON array, nothing else.
        """)
    )

    # Bound up front so the JSON error handler can always print it
    sequence_json = None

    try:
        # Create model using openai_v2 for reasoning capabilities
        model = Model("gpt-5-mini", service_name="openai_v2", temperature=0.0, max_tokens=max_tokens)

        # Run sequence question
        sequence_results = q_sequence.by(model).by([scenario]).run(progress_bar=False)
        sequence_df = sequence_results.to_pandas()
        sequence_json = sequence_df['answer.task_sequence'][0]

        # Debug: Print the raw response before cleaning
        print(f"   • Raw JSON length: {len(str(sequence_json))}")
        print(f"   • Raw JSON preview: {str(sequence_json)[:50]}...")

        # Check if the response is valid. Only isinstance is tested: a missing (NaN)
        # answer is not a str either, and pd.isna() on a list-like answer would raise
        # an ambiguous-truth-value error instead of reporting the real problem.
        if not isinstance(sequence_json, str):
            print(f"❌ Error: Invalid response for '{occupation}' - got {type(sequence_json)} instead of string")
            return None, True  # Treat as already processed to skip

        sequence_data = _parse_sequence_response(sequence_json)
        ordered_sequence_df = pd.DataFrame(sequence_data)

        # Add metadata columns
        ordered_sequence_df['Occupation Title'] = occupation
        ordered_sequence_df['Task ID'] = ordered_sequence_df['Task Title'].map(task_id_mapping)
        ordered_sequence_df['O*NET-SOC Code'] = ordered_sequence_df['Task Title'].map(soc_code_mapping)

        # Titles the model altered cannot be mapped back; report them instead of
        # silently writing rows with empty identifiers.
        unmatched = sum(title not in task_id_mapping for title in ordered_sequence_df['Task Title'])
        if unmatched:
            print(f"   ⚠️  {unmatched} returned task title(s) did not match the input list exactly - Task ID / SOC code left empty")

        # Reorder columns
        ordered_sequence_df = ordered_sequence_df[['Task Position', 'Task Title', 'Task ID', 'O*NET-SOC Code', 'Occupation Title']]

        # Save to file
        os.makedirs(output_folder, exist_ok=True)
        ordered_sequence_df.to_csv(output_file, index=False)

        print(f"   ✅ Successfully processed and saved task sequence")
        return output_file, False  # Return file path and flag indicating it was newly created

    except json.JSONDecodeError as e:
        print(f"❌ JSON Error for '{occupation}': {e}")
        print(f"   Raw response: {sequence_json}")
        return None, True  # Treat as already processed to skip
    except Exception as e:
        # Best effort per occupation: report and let the caller continue with the next one
        print(f"❌ Unexpected error for '{occupation}': {e}")
        return None, True  # Treat as already processed to skip


# NEW: extract sequence of occupations within a minor SOC group
def extract_occupation_sequence(minor_group_title, occupations_df, output_data_path):
    """
    Ask the model for the typical hand-off order of the occupations in a minor SOC group.

    The result is saved as '<output_data_path>/occupation_sequences/<minor group>.csv'
    with the columns 'Occupation Position', 'Occupation Title', 'O*NET-SOC Code',
    'Minor_Group_Code' and 'Minor_Group_Title'.

    Note: the prompt intentionally does NOT ask the model for O*NET-SOC Codes to avoid
    extra token costs; the SOC code and minor-group columns are attached locally from
    `occupations_df` by exact title match.

    Args:
        minor_group_title: Title of the minor SOC group. Used in the prompt and, with
            commas removed and spaces/slashes replaced by underscores, as the file name.
        occupations_df: DataFrame with 'Occupation Title', 'O*NET-SOC Code',
            'Minor_Group_Title' and 'Minor_Group_Code' columns.
        output_data_path: Base output folder.

    Returns:
        Tuple (output_file, skipped_flag):
        - (path, True) if the CSV already existed and no query was made,
        - (path, False) if the CSV was newly created,
        - (None, True) if there were no occupations or the query/parsing failed
          (errors are printed, not raised).
    """
    # Commas are dropped; spaces and path separators become underscores so the
    # title always maps to a single file name inside output_folder.
    safe_title = minor_group_title.replace(',', '').replace(' ', '_').replace('/', '_')
    output_folder = f'{output_data_path}/occupation_sequences'
    output_file = os.path.join(output_folder, f"{safe_title}.csv")

    if os.path.exists(output_file):
        return output_file, True

    if occupations_df.empty:
        print(f"⚠️  No occupations found for minor group '{minor_group_title}' - skipping")
        return None, True

    # Lookup tables used to re-attach codes locally after the model answers
    minor_group_mapping = dict(zip(occupations_df['Minor_Group_Title'], occupations_df['Minor_Group_Code']))
    occupation_code_mapping = dict(zip(occupations_df['Occupation Title'], occupations_df['O*NET-SOC Code']))

    # Build the occupations list text, numbered from 1 to match the 1-based
    # positions the prompt asks the model to return.
    occ_titles = occupations_df['Occupation Title'].drop_duplicates().tolist()
    occ_text = '\n'.join(f"{i}. {title}" for i, title in enumerate(occ_titles, 1))
    num_occs = len(occ_titles)

    print(f"   • {num_occs} occupations in minor group {minor_group_title}")

    scenario = Scenario({
        'minor_group': minor_group_title,
        'occupations_list': occ_text,
        'num_occs': num_occs
    })

    q_occ_sequence = QuestionFreeText(
        question_name='occupation_sequence',
        question_text=dedent("""\
            You are an expert on occupational workflows. Below is a list of occupations within the same minor SOC group ({{ minor_group }}):
            {{ occupations_list }}
            Provide the typical sequential order in which these occupations interact or hand off work in a multi-step workflow that involves multiple occupations within this group.
            Return your answer as a JSON array where each element has:
            - "Occupation Position": the sequence number (1, 2, 3, ...)
            - "Occupation Title": the exact occupation title from the list above
            Format: [{"Occupation Position": 1, "Occupation Title": "..."}, ...]
            Only return the JSON array, nothing else.
        """)
    )

    try:
        model = Model("gpt-5-mini", service_name="openai_v2", temperature=0.0, max_tokens=16000)
        res = q_occ_sequence.by(model).by([scenario]).run(progress_bar=False)
        res_df = res.to_pandas()
        res_json = res_df['answer.occupation_sequence'][0]

        if not isinstance(res_json, str):
            print(f"❌ Invalid response for '{minor_group_title}' - got {type(res_json)} instead of string")
            return None, True

        # Strip Markdown code fences the model sometimes wraps around its answer
        res_json = res_json.replace('```json', '').replace('```', '').strip()

        seq_data = json.loads(res_json)
        seq_df = pd.DataFrame(seq_data)

        # Both columns are required below; stop here with a clear message rather than
        # failing later with a KeyError.
        missing = [col for col in ('Occupation Position', 'Occupation Title') if col not in seq_df.columns]
        if missing:
            print(f"⚠️  Unexpected response format for {minor_group_title}: missing column(s) {missing} - skipping")
            return None, True

        # Add metadata columns
        seq_df['Minor_Group_Title'] = minor_group_title
        seq_df['Minor_Group_Code'] = seq_df['Minor_Group_Title'].map(minor_group_mapping)
        seq_df['O*NET-SOC Code'] = seq_df['Occupation Title'].map(occupation_code_mapping)

        # Reorder columns (each column exactly once, including the group title)
        seq_df = seq_df[['Occupation Position', 'Occupation Title', 'O*NET-SOC Code', 'Minor_Group_Code', 'Minor_Group_Title']]

        # Save output
        os.makedirs(output_folder, exist_ok=True)
        seq_df.to_csv(output_file, index=False)
        print(f"   ✅ Saved occupation sequence for {minor_group_title} -> {output_file}")
        return output_file, False
    except Exception as e:
        # Best effort per minor group: report and let the caller continue with the next one
        print(f"❌ Failed to extract occupation sequence for {minor_group_title}: {e}")
        return None, True

In [None]:
# # Load O*NET data and extract unique occupation titles
# ONET = pd.read_csv(f'{output_data_path}/ONET_cleaned_tasks.csv')

# # Get all unique occupation titles from the dataset
# occupations_list = sorted(ONET['Occupation Title'].unique().tolist())
# print(f"Found {len(occupations_list)} unique occupations in the dataset:")

# # Set seed for reproducible random sampling
# random.seed(42)
# np.random.seed(42)

# # # Randomly sample 10% of occupations
# # sample_size = max(1, int(len(occupations_list) * 0.10))  # Ensure at least 1 occupation
# # sampled_occupations = random.sample(occupations_list, sample_size)
# # print(f"Randomly selected {len(sampled_occupations)} occupations (10% of total) for processing:")
# # print(f"Sample: {sampled_occupations[:5]}..." if len(sampled_occupations) > 5 else f"Sample: {sampled_occupations}")
# sampled_occupations = occupations_list

# # Process each occupation
# processed_count = 0
# skipped_count = 0
# error_count = 0



# for i, occupation in enumerate(sampled_occupations, 1):
#     # Filter data for this occupation
#     occupation_data = ONET[ONET['Occupation Title'] == occupation].copy()
    
#     # Prepare task data
#     occupation_task_data = occupation_data[['Task ID', 'Task Title', 'O*NET-SOC Code']].drop_duplicates().reset_index(drop=True)
    
#     # Enhanced progress output
#     num_tasks = len(occupation_task_data)
#     print(f"\n[{i}/{len(sampled_occupations)}] {occupation}")
    
#     # Extract task sequence
#     output_file, already_existed = extract_task_sequence(occupation, occupation_task_data, output_data_path)
    
#     if output_file is None:
#         error_count += 1
#     elif already_existed:
#         print(f"   ⏭️  Already exists - skipping")
#         skipped_count += 1
#     else:
#         processed_count += 1

# # Summary
# print(f"\n" + "="*50)
# print(f"PROCESSING COMPLETE")
# print(f"="*50)
# print(f"• {processed_count} occupations processed")
# print(f"• {skipped_count} occupations skipped (already existed)")
# print(f"• {error_count} occupations failed")
# print(f"• {len(sampled_occupations)} total occupations in sample")

In [None]:
# Read the cleaned O*NET task table that holds occupations and their minor SOC groups
ONET = pd.read_csv(f'{output_data_path}/ONET_cleaned_tasks.csv')

# Distinct minor-group titles in order of first appearance, with missing titles removed
minor_groups = ONET['Minor_Group_Title'].dropna().unique().tolist()
print(f"Found {len(minor_groups)} minor SOC groups in the dataset:")

# Outcome counters for the run
processed = skipped = errors = 0

for i, minor in enumerate(minor_groups, 1):
    # One row per distinct occupation belonging to the current minor group
    occ_df = (
        ONET.loc[
            ONET['Minor_Group_Title'] == minor,
            ['Occupation Title', 'O*NET-SOC Code', 'Minor_Group_Code', 'Minor_Group_Title'],
        ]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    print(f"\n[{i}/{len(minor_groups)}] Minor group: {minor} (occupations: {len(occ_df)})")

    out_file, existed = extract_occupation_sequence(minor, occ_df, output_data_path)

    # No file path means the extraction failed or had nothing to process
    if out_file is None:
        errors += 1
        continue

    # A file path with the "existed" flag means a cached CSV was reused
    if existed:
        print('   ⏭️ Already exists - skipping')
        skipped += 1
        continue

    processed += 1

# Report the totals for the run
print('\n' + '='*50)
print('PROCESSING COMPLETE')
print('='*50)
for summary_line in (
    f'• {processed} minor groups processed',
    f'• {skipped} minor groups skipped (already existed)',
    f'• {errors} minor groups failed',
    f'• {len(minor_groups)} total minor groups',
):
    print(summary_line)

In [None]:
# Clean up caffeinate process
import subprocess

try:
    # poll() is None only while the child is still running, so a process that has
    # already exited (or a re-run of this cell) is reported as "already stopped".
    if 'caff_process' in globals() and caff_process is not None and caff_process.poll() is None:
        caff_process.terminate()
        try:
            # Bounded wait so a child that ignores SIGTERM cannot hang the notebook
            caff_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            caff_process.kill()
            caff_process.wait()  # Reap the killed process
        print("Caffeinate mode OFF 💡 - System sleep is now enabled.")
    else:
        print("Caffeinate was not running or already stopped.")
except Exception as e:
    # Best effort: cleanup problems are reported but must not fail the notebook
    print(f"Note: {e}")
    print("Caffeinate process may have already ended.")