#### By: Peyman Shahidi
#### Created: Jan 23, 2026
#### Last Edit: Jan 30, 2026

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Format floats with thousands separators and two decimals, e.g. 1000 is shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# NOTE: this suppresses every warning raised from this point on, including pandas warnings
warnings.filterwarnings('ignore')
# Show up to 200 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 200)

In [2]:
# Project root relative to this notebook. Inputs are read from, and outputs are
# written to, the same "computed_objects" folder.
main_folder_path = ".."
input_data_path = main_folder_path + "/data/computed_objects"
output_data_path = main_folder_path + "/data/computed_objects"

In [3]:
# Make sure every output directory exists before anything is written to it
import os

for path in [output_data_path]:
    # exist_ok=True creates missing parent directories and is a no-op when the
    # directory is already there, so no separate existence check is needed
    os.makedirs(path, exist_ok=True)

### Main Code

In [4]:
# Read ONET_cleaned_tasks.csv from the input folder into the ONET DataFrame
onet_tasks_path = f'{input_data_path}/ONET_cleaned_tasks.csv'
ONET = pd.read_csv(onet_tasks_path)

In [5]:
# Crosswalk between O*NET-SOC Code and Detailed_Occupation_Code:
# one row per distinct combination of the four identifier columns,
# ordered by detailed code and then by O*NET-SOC code.
crosswalk_columns = [
    "O*NET-SOC Code",
    "Occupation Title",
    "Detailed_Occupation_Code",
    "Detailed_Occupation_Title",
]
crosswalk = (
    ONET[crosswalk_columns]
    .drop_duplicates()
    .sort_values(["Detailed_Occupation_Code", "O*NET-SOC Code"])
    .reset_index(drop=True)
)
crosswalk.head(5)

Unnamed: 0,O*NET-SOC Code,Occupation Title,Detailed_Occupation_Code,Detailed_Occupation_Title
0,11-1011.00,Chief Executives,11-1011,Chief Executives
1,11-1011.03,Chief Sustainability Officers,11-1011,Chief Executives
2,11-1021.00,General and Operations Managers,11-1021,General and Operations Managers
3,11-2011.00,Advertising and Promotions Managers,11-2011,Advertising and Promotions Managers
4,11-2021.00,Marketing Managers,11-2021,Marketing Managers


In [6]:
# Keep only the crosswalk rows whose (Detailed_Occupation_Code, Detailed_Occupation_Title)
# group holds more than one row, i.e. detailed occupations with several O*NET-SOC Codes
detailed_group_keys = ["Detailed_Occupation_Code", "Detailed_Occupation_Title"]
codes_with_multiple = (
    crosswalk
    .groupby(detailed_group_keys)
    .filter(lambda group: group.shape[0] > 1)
    .sort_values(["Detailed_Occupation_Code", "O*NET-SOC Code"])
    .reset_index(drop=True)
)
display(codes_with_multiple.head(5))

# Write the "repetitive" crosswalk to disk
repetitive_crosswalk_path = f"{output_data_path}/repetitive_onet_detailedOcc_occ_crosswalk.csv"
codes_with_multiple.to_csv(repetitive_crosswalk_path, index=False)

Unnamed: 0,O*NET-SOC Code,Occupation Title,Detailed_Occupation_Code,Detailed_Occupation_Title
0,11-1011.00,Chief Executives,11-1011,Chief Executives
1,11-1011.03,Chief Sustainability Officers,11-1011,Chief Executives
2,11-3031.00,Financial Managers,11-3031,Financial Managers
3,11-3031.01,Treasurers and Controllers,11-3031,Financial Managers
4,11-3031.03,Investment Fund Managers,11-3031,Financial Managers


In [7]:
# Crosswalk rows whose Detailed_Occupation_Code does not appear in codes_with_multiple
repeated_detailed_codes = codes_with_multiple["Detailed_Occupation_Code"]
is_repeated = crosswalk["Detailed_Occupation_Code"].isin(repeated_detailed_codes)
unique_subset = crosswalk[~is_repeated]
display(unique_subset.head(5))

# Write the "unique" crosswalk to disk
unique_crosswalk_path = f"{output_data_path}/unique_onet_detailedOcc_occ_crosswalk.csv"
unique_subset.to_csv(unique_crosswalk_path, index=False)

Unnamed: 0,O*NET-SOC Code,Occupation Title,Detailed_Occupation_Code,Detailed_Occupation_Title
2,11-1021.00,General and Operations Managers,11-1021,General and Operations Managers
3,11-2011.00,Advertising and Promotions Managers,11-2011,Advertising and Promotions Managers
4,11-2021.00,Marketing Managers,11-2021,Marketing Managers
5,11-2022.00,Sales Managers,11-2022,Sales Managers
6,11-3012.00,Administrative Services Managers,11-3012,Administrative Services Managers


In [8]:
def _safe_title(titles: pd.Series) -> pd.Series:
    """Return a copy of `titles` with every space and "/" replaced by "_"."""
    return (
        titles
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
    )


# Create safe titles for both Occupation Title and Detailed_Occupation_Title columns.
# unique_subset was produced by boolean indexing on crosswalk, so the new columns are
# added with .assign (which returns a new frame) instead of being written into that
# slice; this avoids pandas' chained-assignment problem and makes the cell safe to re-run.
unique_subset = unique_subset.assign(
    Safe_Occupation_Title=_safe_title(unique_subset["Occupation Title"]),
    Safe_Detailed_Occupation_Title=_safe_title(unique_subset["Detailed_Occupation_Title"]),
)

display(unique_subset.head(5))


# Show the rows where Safe_Occupation_Title differs from Safe_Detailed_Occupation_Title
diff_titles = unique_subset[
    unique_subset["Safe_Occupation_Title"] != unique_subset["Safe_Detailed_Occupation_Title"]
]
display(diff_titles)

Unnamed: 0,O*NET-SOC Code,Occupation Title,Detailed_Occupation_Code,Detailed_Occupation_Title,Safe_Occupation_Title,Safe_Detailed_Occupation_Title
2,11-1021.00,General and Operations Managers,11-1021,General and Operations Managers,General_and_Operations_Managers,General_and_Operations_Managers
3,11-2011.00,Advertising and Promotions Managers,11-2011,Advertising and Promotions Managers,Advertising_and_Promotions_Managers,Advertising_and_Promotions_Managers
4,11-2021.00,Marketing Managers,11-2021,Marketing Managers,Marketing_Managers,Marketing_Managers
5,11-2022.00,Sales Managers,11-2022,Sales Managers,Sales_Managers,Sales_Managers
6,11-3012.00,Administrative Services Managers,11-3012,Administrative Services Managers,Administrative_Services_Managers,Administrative_Services_Managers


Unnamed: 0,O*NET-SOC Code,Occupation Title,Detailed_Occupation_Code,Detailed_Occupation_Title,Safe_Occupation_Title,Safe_Detailed_Occupation_Title
108,15-1255.01,Video Game Designers,15-1255,Web and Digital Interface Designers,Video_Game_Designers,Web_and_Digital_Interface_Designers
121,15-2099.01,Bioinformatics Technicians,15-2099,"Mathematical Science Occupations, All Other",Bioinformatics_Technicians,"Mathematical_Science_Occupations,_All_Other"
204,19-2099.01,Remote Sensing Scientists and Technologists,19-2099,"Physical Scientists, All Other",Remote_Sensing_Scientists_and_Technologists,"Physical_Scientists,_All_Other"
217,19-3099.01,Transportation Planners,19-3099,"Social Scientists and Related Workers, All Other",Transportation_Planners,"Social_Scientists_and_Related_Workers,_All_Other"
300,25-2059.01,Adapted Physical Education Specialists,25-2059,"Special Education Teachers, All Other",Adapted_Physical_Education_Specialists,"Special_Education_Teachers,_All_Other"
427,29-9099.01,Midwives,29-9099,Healthcare Practitioners and Technical Workers...,Midwives,Healthcare_Practitioners_and_Technical_Workers...
470,33-9099.02,Retail Loss Prevention Specialists,33-9099,"Protective Service Workers, All Other",Retail_Loss_Prevention_Specialists,"Protective_Service_Workers,_All_Other"
656,47-4099.03,Weatherization Installers and Technicians,47-4099,"Construction and Related Workers, All Other",Weatherization_Installers_and_Technicians,"Construction_and_Related_Workers,_All_Other"
718,49-9099.01,Geothermal Technicians,49-9099,"Installation, Maintenance, and Repair Workers,...",Geothermal_Technicians,"Installation,_Maintenance,_and_Repair_Workers,..."
792,51-8099.01,Biofuels Processing Technicians,51-8099,"Plant and System Operators, All Other",Biofuels_Processing_Technicians,"Plant_and_System_Operators,_All_Other"


## Merge contents in a "safe" manner
#### 1) For cases where more than one "Occupation Title" is associated with a "Detailed_Occupation_Code", copy the contents of the "tasks_sequences_robustness_repetitive" folder as-is (these are the results of the rerun on Occupation Titles for such cases).
#### 2) For cases where a single "Occupation Title" is associated with a "Detailed_Occupation_Code", copy the contents of the "tasks_sequences_robustness_detailedOccCode" folder, but rename the folders from the safe title created from "Detailed_Occupation_Title" to the safe title created from "Occupation Title" instead.

In [9]:
# Folder holding one sub-folder per safe Detailed_Occupation_Title
folder = f"{output_data_path}/tasks_sequences_robustness_detailedOccCode"
subfolders = [name for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))]

# print number of all subfolders
print(f"Number of subfolders: {len(subfolders)}")

# print number of subfolders where the folder name (safe title of Detailed_Occupation_Title)
# is associated with a single Occupation Title in the dataset above.
# A set gives constant-time membership tests instead of scanning the column once per folder.
single_title_folder_names = set(unique_subset["Safe_Detailed_Occupation_Title"])
subfolders_to_copy = [name for name in subfolders if name in single_title_folder_names]
print(f"Number of subfolders to copy: {len(subfolders_to_copy)}")

Number of subfolders: 743
Number of subfolders to copy: 692


In [10]:
import re
import shutil

# Two input trees (results keyed by safe Detailed_Occupation_Title, and the rerun
# results for detailed codes with several Occupation Titles) are merged into one
# output tree keyed by safe Occupation Title.
source_root_unique = f"{output_data_path}/tasks_sequences_robustness_detailedOccCode"
source_root_repetitive = f"{output_data_path}/tasks_sequences_robustness_repetitive"
dest_root = f"{output_data_path}/tasks_sequences_robustness"

# The merge runs only when the output folder does not exist yet.
# NOTE(review): a run interrupted part-way leaves dest_root in place, so a later
# run skips the merge entirely; delete dest_root to force a rebuild.
dest_already_exists = os.path.exists(dest_root)
if not dest_already_exists:
    os.makedirs(dest_root, exist_ok=True)

    # 1) copy all repetitive folders as-is
    for entry_name in os.listdir(source_root_repetitive):
        entry_src = os.path.join(source_root_repetitive, entry_name)
        if not os.path.isdir(entry_src):
            continue
        entry_dst = os.path.join(dest_root, entry_name)
        shutil.copytree(entry_src, entry_dst, dirs_exist_ok=True)

    # 2) copy unique folders, renaming from safe Detailed -> safe Occupation
    # Only files named <anything>_<one or two digits>.csv are copied.
    suffix_re = re.compile(r"^(?P<prefix>.*)_(?P<num>\d{1,2})\.csv$", re.IGNORECASE)

    # Lookups keyed by the safe Detailed_Occupation_Title (the source folder name);
    # expects columns: Safe_Detailed_Occupation_Title, Safe_Occupation_Title,
    # O*NET-SOC Code, Occupation Title
    detailed_safe_titles = unique_subset["Safe_Detailed_Occupation_Title"]
    rename_map = dict(zip(detailed_safe_titles, unique_subset["Safe_Occupation_Title"]))
    original_occ_code_map = dict(zip(detailed_safe_titles, unique_subset["O*NET-SOC Code"]))
    original_occ_title_map = dict(zip(detailed_safe_titles, unique_subset["Occupation Title"]))

    for detailed_safe, occupation_safe in rename_map.items():
        src_dir = os.path.join(source_root_unique, detailed_safe)
        if not os.path.isdir(src_dir):
            continue  # no results folder for this detailed occupation

        dst_dir = os.path.join(dest_root, occupation_safe)
        os.makedirs(dst_dir, exist_ok=True)

        for src_name in os.listdir(src_dir):
            match = suffix_re.match(src_name)
            if match is None:
                continue  # skip non-matching files

            # destination file name: Safe_Occupation_Title_<num>.csv
            prompt_num = match.group("num")
            src_path = os.path.join(src_dir, src_name)
            dst_path = os.path.join(dst_dir, f"{occupation_safe}_{prompt_num}.csv")

            # read -> swap the detailed-occupation columns for the O*NET ones -> write
            relabelled = (
                pd.read_csv(src_path)
                .drop(columns=["Detailed_Occupation_Code", "Detailed_Occupation_Title"], errors='ignore')
                .assign(**{
                    "O*NET-SOC Code": original_occ_code_map[detailed_safe],
                    "Occupation Title": original_occ_title_map[detailed_safe],
                })
            )
            relabelled.to_csv(dst_path, index=False)


# Sanity check #1: all contents of the dest_root folder must be consistent with the safe Occupation Titles in the full crosswalk
all_safe_occupation_titles = (
    crosswalk["Occupation Title"]
    .str.replace(" ", "_", regex=False)
    .str.replace("/", "_", regex=False)
).tolist()
known_safe_titles = set(all_safe_occupation_titles)

dest_subfolders = [
    name
    for name in os.listdir(dest_root)
    if os.path.isdir(os.path.join(dest_root, name))
]
print("Length of destination subfolders:", len(dest_subfolders))
for folder_name in dest_subfolders:
    assert folder_name in known_safe_titles, f"Folder {folder_name} not found in safe Occupation Titles!"


# Sanity check #2: the contents of each folder must be named consistently with the folder name
for folder_name in dest_subfolders:
    for fname in os.listdir(os.path.join(dest_root, folder_name)):
        if not fname.lower().endswith(".csv"):
            continue

        # Split "<prefix>_<last part>" at the final underscore of the name without ".csv"
        prefix, underscore, _ = fname[:-4].rpartition("_")
        if not underscore:
            continue  # unexpected: no "_" in the file name, skip

        assert (
            prefix == folder_name
        ), f"File {fname} in folder {folder_name} has inconsistent prefix {prefix}!"

Length of destination subfolders: 869


#### Create a "restructured" folder for each set of prompt outputs resembling the original "tasks_sequences" folder for consistency with later analysis scripts

In [11]:
# Source and destination paths
source_path = f"{main_folder_path}/data/computed_objects/tasks_sequences_robustness"
dest_path = f"{main_folder_path}/data/computed_objects/tasks_sequences_robustness_restructured"

# Number of prompt folders; a file ending in "_<i>.csv" is routed to "prompt_<i>"
N_PROMPTS = 11

# Create the destination folder and one folder per prompt (prompt_0 to prompt_10).
# exist_ok=True makes this a no-op for folders that are already there.
os.makedirs(dest_path, exist_ok=True)
for prompt_index in range(N_PROMPTS):
    os.makedirs(os.path.join(dest_path, f"prompt_{prompt_index}"), exist_ok=True)

# Iterate through each occupation folder in the source
for occupation_folder in os.listdir(source_path):
    occupation_path = os.path.join(source_path, occupation_folder)

    # Skip if not a directory
    if not os.path.isdir(occupation_path):
        continue

    # Copy every file to the prompt folder that matches its numeric suffix
    for filename in os.listdir(occupation_path):
        # First prompt index whose "_<i>.csv" suffix matches (e.g. "_5.csv" -> 5);
        # None for files without such a suffix, which are left out
        prompt_index = next(
            (i for i in range(N_PROMPTS) if filename.endswith(f"_{i}.csv")),
            None,
        )
        if prompt_index is None:
            continue

        src_file = os.path.join(occupation_path, filename)
        dest_file = os.path.join(dest_path, f"prompt_{prompt_index}", filename)
        shutil.copy2(src_file, dest_file)

print("Reorganization complete!")
print(f"\nDestination folder: {dest_path}")
print("\nContents of destination folder:")
for prompt_folder in sorted(os.listdir(dest_path)):
    prompt_path = os.path.join(dest_path, prompt_folder)
    if os.path.isdir(prompt_path):
        file_count = len([f for f in os.listdir(prompt_path) if f.endswith('.csv')])
        print(f"  {prompt_folder}: {file_count} files")

Reorganization complete!

Destination folder: ../data/computed_objects/tasks_sequences_robustness_restructured

Contents of destination folder:
  prompt_0: 869 files
  prompt_1: 868 files
  prompt_10: 786 files
  prompt_2: 869 files
  prompt_3: 868 files
  prompt_4: 868 files
  prompt_5: 868 files
  prompt_6: 866 files
  prompt_7: 868 files
  prompt_8: 867 files
  prompt_9: 868 files
