#### By: Peyman Shahidi
#### Created: Dec 16, 2025
#### Last Edit: Dec 16, 2025

<br>

In [98]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

## Display floats comma-separated with two digits after the decimal point, e.g. 1000 is shown as 1,000.00.
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including numerical warnings raised by numpy/scipy/pandas -- confirm this is intended.
warnings.filterwarnings('ignore')
# Allow up to 200 rows to be shown when a DataFrame is displayed.
pd.set_option('display.max_rows', 200)

In [99]:
# Root of the project, relative to the folder this notebook runs from.
main_folder_path = ".."
# Folder holding one sub-folder of ranking CSVs per occupation.
input_data_path = f"{main_folder_path}/data/computed_objects/tasks_sequences_robustness"
# Outputs are rooted at the project folder. Building this path from
# input_data_path (as before) nested "data/computed_objects" twice and placed
# the output folder inside the input tree that is scanned for occupations.
output_data_path = f"{main_folder_path}/data/computed_objects/GPT_task_sequences_overlap_analysis"
output_plot_path = f"{main_folder_path}/writeup/plots"

In [100]:
# Create the output directories if they don't exist.
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True makes the call a no-op for existing folders and avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(path, exist_ok=True)

### Main Code

In [101]:
from pathlib import Path
import pandas as pd
import re

# Build one long table from every "<occupation>/<occupation>_<N>.csv" file
# found under input_data_path; N is stored in a new "prompt_number" column.
base_dir = Path(input_data_path)

dfs = []

# Sorted iteration makes the row order of master_df reproducible; the order
# returned by iterdir()/glob() depends on the filesystem.
for folder in sorted(base_dir.iterdir()):
    if not folder.is_dir():
        continue

    occupation = folder.name

    # Expect: <occupation>_X.csv (compiled once per folder, not once per file)
    pattern = re.compile(rf"{re.escape(occupation)}_(\d+)\.csv$")

    for csv_file in sorted(folder.glob("*.csv")):
        match = pattern.search(csv_file.name)
        if match is None:
            continue  # skip files that don't match the convention

        prompt_number = int(match.group(1))

        df = pd.read_csv(csv_file)
        df["prompt_number"] = prompt_number

        dfs.append(df)

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" (same exception type as before).
if not dfs:
    raise ValueError(
        f"No '<occupation>_<N>.csv' files found under {base_dir}"
    )

# Combine everything
master_df = pd.concat(dfs, ignore_index=True)

# Save if you want
# master_df.to_csv("masterfile.csv", index=False)


In [102]:
from scipy.stats import kendalltau
import itertools
import pandas as pd


def compute_pairwise_kendall(df_occ):
    """
    Compute Kendall's tau between the task rankings of every pair of prompts.

    df_occ: master_df filtered to ONE occupation; must contain the columns
        "prompt_number", "Task ID" and "Task Position".
    returns: DataFrame with columns "prompt_1", "prompt_2", "kendall_tau",
        one row per unordered pair of prompts. The columns are present even
        when there are fewer than two prompts (the frame is then empty), so
        downstream concat/groupby code always sees the same schema.
    """

    # map: prompt_number -> Series of positions indexed by task id
    rankings = {
        prompt: (
            grp.sort_values("Task Position")
               .set_index("Task ID")["Task Position"]
        )
        for prompt, grp in df_occ.groupby("prompt_number")
    }

    results = []

    for p1, p2 in itertools.combinations(rankings, 2):
        r1 = rankings[p1]
        r2 = rankings[p2]

        # align tasks (defensive): compare only tasks ranked by both prompts
        common_tasks = r1.index.intersection(r2.index)

        tau, _ = kendalltau(
            r1.loc[common_tasks],
            r2.loc[common_tasks]
        )

        results.append({
            "prompt_1": p1,
            "prompt_2": p2,
            "kendall_tau": tau
        })

    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=["prompt_1", "prompt_2", "kendall_tau"])

In [103]:
# One pairwise-tau table per occupation, each tagged with its occupation title.
all_results = [
    compute_pairwise_kendall(df_occ).assign(occupation=occ)
    for occ, df_occ in master_df.groupby("Detailed_Occupation_Title")
]

# Stack the per-occupation tables into a single frame.
kendall_results = pd.concat(all_results, ignore_index=True)


In [104]:
# Mean, minimum and standard deviation of the pairwise taus, per occupation.
tau_by_occupation = kendall_results.groupby("occupation")["kendall_tau"]
summary = tau_by_occupation.agg(["mean", "min", "std"]).reset_index()

In [105]:
def precedence_set(ranking):
    """
    Return the set of ordered task pairs implied by a ranking.

    ranking: object whose ``.index`` lists the tasks in ranked order
        (e.g. a pandas Series indexed by task id).
    returns: set of ``(earlier, later)`` tuples, one for every pair of
        positions i < j in ``ranking.index``.
    """
    tasks = ranking.index.tolist()
    # combinations() yields pairs in positional order, i.e. (tasks[i], tasks[j]) with i < j.
    return set(itertools.combinations(tasks, 2))


def precedence_overlap(r1, r2):
    """
    Share of r1's precedence pairs that also hold in r2.

    r1, r2: rankings accepted by ``precedence_set``.
    returns: ``len(P1 & P2) / len(P1)``, or NaN when r1 has fewer than two
        tasks (no precedence pairs), instead of raising ZeroDivisionError.
    """
    P1 = precedence_set(r1)
    if not P1:
        return np.nan
    P2 = precedence_set(r2)
    return len(P1 & P2) / len(P1)


In [106]:
summary

Unnamed: 0,occupation,mean,min,std
0,Amusement and Recreation Attendants,0.51,-0.18,0.32
1,"Cleaning, Washing, and Metal Pickling Equipmen...",0.39,-0.02,0.31
2,Concierges,0.3,-0.22,0.23
3,Elevator and Escalator Installers and Repairers,0.62,0.21,0.21
4,"Extruding and Forming Machine Setters, Operato...",0.56,0.13,0.24
5,First-Line Supervisors of Housekeeping and Jan...,0.41,-0.12,0.28
6,"Service Unit Operators, Oil and Gas",0.6,0.15,0.22


In [107]:
import itertools
import numpy as np

def coloc_matrix(df_occ, k=1):
    """
    Share of prompts in which each pair of tasks sits within k positions.

    df_occ: rows for one occupation with columns "Task ID", "prompt_number"
        and a numeric "Task Position".
    k: maximum absolute difference in "Task Position" for two tasks to count
        as co-located within a prompt.
    returns: square DataFrame indexed and labelled by the unique task ids.
        Entry (i, j) is the number of prompts where |pos_i - pos_j| <= k,
        divided by the total number of prompts; the diagonal is 1.0.
        A prompt that lacks either task does not count the pair as
        co-located (previously this raised KeyError).
    """
    tasks = df_occ["Task ID"].unique()
    prompts = df_occ["prompt_number"].unique()

    # Accumulate in a plain array: avoids per-cell .loc writes and does not
    # depend on DataFrame.values being a writable view.
    counts = np.zeros((len(tasks), len(tasks)), dtype=float)

    for p in prompts:
        g = df_occ[df_occ["prompt_number"] == p]
        # Positions aligned to `tasks`; tasks absent from this prompt become NaN.
        pos = (
            g.set_index("Task ID")["Task Position"]
             .reindex(tasks)
             .to_numpy(dtype=float)
        )

        gap = np.abs(pos[:, None] - pos[None, :])
        # Comparisons against NaN are False, so missing tasks never count.
        with np.errstate(invalid="ignore"):
            counts += gap <= k

    counts /= len(prompts)
    np.fill_diagonal(counts, 1.0)
    return pd.DataFrame(counts, index=tasks, columns=tasks)


In [108]:
def block_stability(df_occ, task_set):
    """
    Share of prompts in which the given tasks occupy consecutive positions.

    df_occ: rows for one occupation with columns "prompt_number", "Task ID"
        and "Task Position".
    task_set: iterable of task ids; duplicates are ignored.
    returns: fraction of prompts whose ordering (by "Task Position") places
        all tasks of ``task_set`` in one contiguous run. A prompt that lacks
        any of the tasks counts as not contiguous (previously ValueError).
        NaN when ``task_set`` is empty or ``df_occ`` has no prompts.
    """
    wanted = set(task_set)
    prompts = df_occ["prompt_number"].unique()
    if not wanted or len(prompts) == 0:
        return np.nan

    count = 0

    for p in prompts:
        g = df_occ[df_occ["prompt_number"] == p].sort_values("Task Position")

        # First index of every task in this prompt's ordering, built in one
        # pass instead of calling list.index() once per requested task.
        first_index = {}
        for idx, task in enumerate(g["Task ID"].tolist()):
            first_index.setdefault(task, idx)

        if not wanted.issubset(first_index):
            continue  # a requested task is missing from this prompt

        positions = [first_index[t] for t in wanted]
        # n distinct indices are contiguous exactly when they span n - 1 steps.
        if max(positions) - min(positions) == len(wanted) - 1:
            count += 1

    return count / len(prompts)


In [109]:
import networkx as nx

def adjacency_graph(df_occ):
    """
    Build an undirected graph of tasks that appear next to each other.

    For every prompt the tasks are ordered by "Task Position" and each pair
    of consecutive tasks adds 1 to the weight of the edge between them.
    Edge weights are finally divided by the number of prompts.

    df_occ: rows for one occupation with columns "prompt_number", "Task ID"
        and "Task Position".
    returns: networkx Graph whose edges carry a "weight" attribute.
    """
    prompts = df_occ["prompt_number"].unique()
    n_prompts = len(prompts)
    G = nx.Graph()

    for p in prompts:
        in_prompt = df_occ["prompt_number"] == p
        ordered = df_occ[in_prompt].sort_values("Task Position")["Task ID"].tolist()

        # Walk consecutive pairs of the ordering.
        for u, v in zip(ordered, ordered[1:]):
            if G.has_edge(u, v):
                G[u][v]["weight"] += 1
            else:
                G.add_edge(u, v, weight=1)

    # Turn raw counts into per-prompt shares.
    for _, _, attrs in G.edges(data=True):
        attrs["weight"] /= n_prompts

    return G


In [110]:
def benchmark_adjacencies(df_occ, benchmark_prompt=0):
    """
    Return the ordered neighbour pairs of the benchmark prompt's ranking.

    df_occ: rows for one occupation with columns "prompt_number", "Task ID"
        and "Task Position".
    benchmark_prompt: value of "prompt_number" that identifies the benchmark.
    returns: set of ``(task, next_task)`` tuples taken from the benchmark
        rows sorted by "Task Position"; empty when fewer than two rows match.
    """
    is_benchmark = df_occ["prompt_number"] == benchmark_prompt
    ordered = df_occ[is_benchmark].sort_values("Task Position")["Task ID"].tolist()
    return {(left, right) for left, right in zip(ordered, ordered[1:])}


def adjacency_retention(df_occ, benchmark_prompt=0):
    """
    Average share of benchmark adjacencies that reappear in the other prompts.

    An adjacency is an ordered pair of tasks that are consecutive when a
    prompt's rows are sorted by "Task Position".

    df_occ: rows for one occupation with columns "prompt_number", "Task ID"
        and "Task Position".
    benchmark_prompt: value of "prompt_number" used as the reference ranking.
    returns: mean over the non-benchmark prompts of
        ``len(benchmark & prompt) / len(benchmark)``. NaN when the benchmark
        has no adjacencies or there is no other prompt to compare against
        (both cases previously raised ZeroDivisionError).

    Note: a function with the same name is defined again further down in
    this file and replaces this one once that cell runs.
    """
    bench_adj = set(benchmark_adjacencies(df_occ, benchmark_prompt))
    if not bench_adj:
        return np.nan

    retained = []

    for p in df_occ["prompt_number"].unique():
        if p == benchmark_prompt:
            continue

        ordered = (
            df_occ[df_occ["prompt_number"] == p]
            .sort_values("Task Position")["Task ID"]
            .tolist()
        )
        adj = set(zip(ordered, ordered[1:]))

        retained.append(len(bench_adj & adj) / len(bench_adj))

    if not retained:
        return np.nan
    return sum(retained) / len(retained)


In [111]:
import pandas as pd
import numpy as np

# --------------------------------------------------
# 1. Get benchmark adjacencies for one occupation
# --------------------------------------------------
def get_benchmark_adjacencies(df_occ, benchmark_prompt=0):
    """
    Collect the consecutive task pairs of the benchmark prompt as a set.

    df_occ: rows for one occupation with columns "prompt_number", "Task ID"
        and "Task Position".
    benchmark_prompt: value of "prompt_number" that identifies the benchmark.
    returns: set of ``(task, next_task)`` tuples from the benchmark rows
        ordered by "Task Position".
    """
    benchmark_rows = df_occ.loc[df_occ["prompt_number"] == benchmark_prompt]
    ranked = benchmark_rows.sort_values("Task Position")
    task_ids = list(ranked["Task ID"])

    pairs = set()
    for current, following in zip(task_ids, task_ids[1:]):
        pairs.add((current, following))
    return pairs


# --------------------------------------------------
# 2. Adjacency retention relative to benchmark
# --------------------------------------------------
def adjacency_retention(df_occ, benchmark_prompt=0):
    """
    Average share of benchmark adjacencies retained by the other prompts.

    An adjacency is an ordered pair of tasks that are consecutive when a
    prompt's rows are sorted by "Task Position".

    df_occ: rows for one occupation with columns "prompt_number", "Task ID"
        and "Task Position".
    benchmark_prompt: value of "prompt_number" used as the reference ranking.
    returns: mean over the non-benchmark prompts of
        ``len(benchmark & prompt) / len(benchmark)``; NaN when the benchmark
        has no adjacencies or there is no other prompt to compare against.
    """
    # Coerce to a set: get_benchmark_adjacencies is redefined later in this
    # file to return a list, and `list & set` would raise TypeError if this
    # function were called after that cell has run.
    bench_adj = set(get_benchmark_adjacencies(df_occ, benchmark_prompt))
    if not bench_adj:
        return np.nan

    prompts = sorted(df_occ["prompt_number"].unique())

    retention_rates = []

    for p in prompts:
        if p == benchmark_prompt:
            continue

        ordered = (
            df_occ[df_occ["prompt_number"] == p]
            .sort_values("Task Position")["Task ID"]
            .tolist()
        )
        adj = set(zip(ordered, ordered[1:]))

        retention_rates.append(
            len(bench_adj & adj) / len(bench_adj)
        )

    # Same explicit guard as k_adjacency_retention, instead of relying on
    # np.mean([]) emitting a warning and returning NaN.
    return np.mean(retention_rates) if retention_rates else np.nan


# --------------------------------------------------
# 3. Run for all occupations
# --------------------------------------------------
# One row per occupation: retention of the benchmark (prompt 0) adjacencies.
results = [
    {
        "Detailed_Occupation_Title": occ,
        "benchmark_adjacency_retention": adjacency_retention(df_occ, benchmark_prompt=0),
    }
    for occ, df_occ in master_df.groupby("Detailed_Occupation_Title")
]

benchmark_results = pd.DataFrame(results)
benchmark_results

Unnamed: 0,Detailed_Occupation_Title,benchmark_adjacency_retention
0,Amusement and Recreation Attendants,0.35
1,"Cleaning, Washing, and Metal Pickling Equipmen...",0.5
2,Concierges,0.22
3,Elevator and Escalator Installers and Repairers,0.37
4,"Extruding and Forming Machine Setters, Operato...",0.32
5,First-Line Supervisors of Housekeeping and Jan...,0.19
6,"Service Unit Operators, Oil and Gas",0.37


In [112]:
import numpy as np

# --------------------------------------------------
# 1. Benchmark adjacencies (as before, but returned as an ordered list instead of a set)
# --------------------------------------------------
def get_benchmark_adjacencies(df_occ, benchmark_prompt=0):
    """
    List the consecutive task pairs of the benchmark prompt, in rank order.

    df_occ: rows for one occupation with columns "prompt_number", "Task ID"
        and "Task Position".
    benchmark_prompt: value of "prompt_number" that identifies the benchmark.
    returns: list of ``(task, next_task)`` tuples from the benchmark rows
        sorted by "Task Position" (a list, not a set).
    """
    mask = df_occ["prompt_number"] == benchmark_prompt
    sequence = df_occ[mask].sort_values("Task Position")["Task ID"].tolist()
    followers = sequence[1:]
    return list(zip(sequence, followers))


# --------------------------------------------------
# 2. k-adjacency retention relative to benchmark
# --------------------------------------------------
def k_adjacency_retention(df_occ, benchmark_prompt=0, k=2):
    """
    Average share of benchmark adjacencies that stay close in other prompts.

    A benchmark pair counts as retained in a prompt when the absolute
    difference of the two tasks' "Task Position" values is at most ``k - 1``
    (so with the default k=2 the tasks must be at most one position apart,
    in either order). Pairs with a task missing from the prompt are left out
    of that prompt's denominator.

    df_occ: rows for one occupation with columns "prompt_number", "Task ID"
        and "Task Position".
    benchmark_prompt: value of "prompt_number" used as the reference ranking.
    k: closeness parameter; the allowed position gap is ``k - 1``.
    returns: mean of the per-prompt retention shares over the non-benchmark
        prompts that contain at least one complete benchmark pair; NaN when
        the benchmark has no adjacencies or no prompt qualifies.
    """
    bench_adj = get_benchmark_adjacencies(df_occ, benchmark_prompt)
    if not len(bench_adj):
        return np.nan

    max_gap = k - 1
    retention_rates = []

    for p in sorted(df_occ["prompt_number"].unique()):
        if p == benchmark_prompt:
            continue

        prompt_rows = df_occ[df_occ["prompt_number"] == p]
        pos = prompt_rows.set_index("Task ID")["Task Position"]

        # Position gaps for the benchmark pairs fully present in this prompt.
        gaps = [
            abs(pos[first] - pos[second])
            for first, second in bench_adj
            if first in pos.index and second in pos.index
        ]
        if not gaps:
            continue

        kept = sum(1 for gap in gaps if gap <= max_gap)
        retention_rates.append(kept / len(gaps))

    if not retention_rates:
        return np.nan
    return np.mean(retention_rates)



# --------------------------------------------------
# 3. Run for all occupations
# --------------------------------------------------
# One row per occupation: k=2 retention of the benchmark (prompt 0) adjacencies.
results_k2 = [
    {
        "Detailed_Occupation_Title": occ,
        "benchmark_k2_adjacency_retention": k_adjacency_retention(
            df_occ, benchmark_prompt=0, k=2
        ),
    }
    for occ, df_occ in master_df.groupby("Detailed_Occupation_Title")
]

benchmark_k2_results = pd.DataFrame(results_k2)

# Put the strict and the k=2 retention measures side by side.
final_results = benchmark_results.merge(
    benchmark_k2_results, on="Detailed_Occupation_Title"
)

final_results

Unnamed: 0,Detailed_Occupation_Title,benchmark_adjacency_retention,benchmark_k2_adjacency_retention
0,Amusement and Recreation Attendants,0.35,0.47
1,"Cleaning, Washing, and Metal Pickling Equipmen...",0.5,0.56
2,Concierges,0.22,0.33
3,Elevator and Escalator Installers and Repairers,0.37,0.42
4,"Extruding and Forming Machine Setters, Operato...",0.32,0.52
5,First-Line Supervisors of Housekeeping and Jan...,0.19,0.32
6,"Service Unit Operators, Oil and Gas",0.37,0.52
