In [1]:
import glob
import os
import pandas as pd

In [2]:
# Directory that holds the per-model ``top10_all_jobs_*.csv`` inputs; the derived
# ``top_sim_range_by_job_*.csv`` summaries are written back into this same folder.
BASE_DIR = "./similarity_matching_output"

In [3]:
def process_file(path: str, out_dir: "str | None" = None) -> None:
    """Aggregate one top-10 similarity export into a per-job range summary.

    Reads the CSV at ``path``, groups its rows by ``job_id`` and records for
    each job the first ``role`` value, the minimum and maximum ``similarity``,
    the number of rows ``n`` and ``sim_range`` (``sim_max - sim_min``). The
    result is written as ``top_sim_range_by_job_<suffix>.csv``, where
    ``<suffix>`` is the part of the input file stem that follows
    ``top10_all_jobs_`` (the whole stem when that prefix is absent).

    Args:
        path: Path of the input CSV. It must contain the columns ``job_id``,
            ``role`` and ``similarity``.
        out_dir: Directory that receives the summary CSV. When ``None`` the
            module-level ``BASE_DIR`` is used.

    Raises:
        ValueError: If any of the required columns is missing from the file.
    """
    df = pd.read_csv(path)

    # Check every column the aggregation touches, so a malformed file fails
    # with one clear message instead of a KeyError from inside pandas.
    required = ("job_id", "role", "similarity")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(
            f"{path}: Missing required column(s) {missing}. Found: {list(df.columns)}"
        )

    agg = (
        df.groupby("job_id", as_index=False)
          .agg(
              role=("role", "first"),
              sim_min=("similarity", "min"),
              sim_max=("similarity", "max"),
              n=("job_id", "size")
          )
    )
    agg["sim_range"] = agg["sim_max"] - agg["sim_min"]

    # Derive the output suffix from the file name; partition() never raises,
    # so a file without the expected prefix simply keeps its full stem.
    stem = os.path.splitext(os.path.basename(path))[0]
    _, found, rest = stem.partition("top10_all_jobs_")
    suffix = rest if found else stem

    target_dir = BASE_DIR if out_dir is None else out_dir
    out_path = os.path.join(target_dir, f"top_sim_range_by_job_{suffix}.csv")
    agg.to_csv(out_path, index=False)

    print(f"✓ {os.path.basename(path)} -> {os.path.basename(out_path)} ")

In [4]:
# Find every per-model top-10 export under BASE_DIR and aggregate each in name order.
pattern = os.path.join(BASE_DIR, "top10_all_jobs_*.csv")
files = glob.glob(pattern)
files.sort()
if len(files) == 0:
    # Fail loudly rather than silently producing no summaries.
    raise FileNotFoundError(f"No files matched: {pattern}")

for f in files:
    process_file(f)

✓ top10_all_jobs_fair_lora.csv -> top_sim_range_by_job_fair_lora.csv 
✓ top10_all_jobs_jb_v2.csv -> top_sim_range_by_job_jb_v2.csv 
✓ top10_all_jobs_jb_v3.csv -> top_sim_range_by_job_jb_v3.csv 
✓ top10_all_jobs_miniLM_base.csv -> top_sim_range_by_job_miniLM_base.csv 
✓ top10_all_jobs_openai.csv -> top_sim_range_by_job_openai.csv 


In [5]:
# Glob matching the per-job range summaries (``top_sim_range_by_job_<suffix>.csv``) under BASE_DIR.
PATTERN = os.path.join(BASE_DIR, "top_sim_range_by_job_*.csv")

In [6]:
def model_name_from_file(fname: str) -> str:
    """Return the model label embedded in a result file name.

    Takes the base name of ``fname`` and removes every occurrence of the two
    known file-name prefixes and of ``.csv``, in that order.
    """
    label = os.path.basename(fname)
    for fragment in ("top10_all_jobs_", "top_sim_range_by_job_", ".csv"):
        label = label.replace(fragment, "")
    return label

def safe_idxmin(s: pd.Series):
    """Return the index label of the smallest numeric value in ``s``.

    Values are coerced with ``pd.to_numeric(..., errors="coerce")``, so
    non-numeric entries become NaN. Returns ``None`` when no usable value
    remains (including an empty series).
    """
    numeric = pd.to_numeric(s, errors="coerce")
    if numeric.isna().all():
        return None
    return numeric.idxmin()

def safe_idxmax(s: pd.Series):
    """Return the index label of the largest numeric value in ``s``.

    Values are coerced with ``pd.to_numeric(..., errors="coerce")``, so
    non-numeric entries become NaN. Returns ``None`` when no usable value
    remains (including an empty series).
    """
    numeric = pd.to_numeric(s, errors="coerce")
    if numeric.isna().all():
        return None
    return numeric.idxmax()

def summarize_file(path: str) -> None:
    """Print summary statistics of ``sim_range`` for one per-job summary CSV.

    The report contains the overall mean and median of ``sim_range``, the
    ``job_id`` rows with the smallest and largest range, and the roles whose
    mean ``sim_range`` is smallest and largest. Values that cannot be
    computed are printed as ``NA`` (or ``None`` for the identifying label).

    Args:
        path: CSV file containing at least ``job_id``, ``role`` and ``sim_range``.

    Raises:
        ValueError: If one of the required columns is absent.
    """

    def lookup(table, row_label, column):
        # A missing row label means there was no usable value to point at.
        return None if row_label is None else table.loc[row_label, column]

    def stat_line(prefix, value):
        # Six-decimal rendering, or the literal NA when the value is missing.
        return f"{prefix}{value:.6f}" if pd.notna(value) else f"{prefix}NA"

    frame = pd.read_csv(path)

    required = {"job_id", "role", "sim_range"}
    missing = required - set(frame.columns)
    if missing:
        raise ValueError(f"{path}: missing columns {missing}. Found: {list(frame.columns)}")

    # Non-numeric entries become NaN so the statistics below skip them.
    frame["sim_range"] = pd.to_numeric(frame["sim_range"], errors="coerce")
    ranges = frame["sim_range"]

    model = model_name_from_file(path)
    mean_range = ranges.mean()
    median_range = ranges.median()

    # Extremes over individual job rows.
    low_row = safe_idxmin(ranges)
    high_row = safe_idxmax(ranges)
    min_job_id = lookup(frame, low_row, "job_id")
    min_range = lookup(frame, low_row, "sim_range")
    max_job_id = lookup(frame, high_row, "job_id")
    max_range = lookup(frame, high_row, "sim_range")

    # Extremes over roles, ranked by each role's mean sim_range.
    per_role = frame.groupby("role", as_index=False)["sim_range"].mean()
    role_stats = per_role.rename(columns={"sim_range": "mean_sim_range"})
    low_role = safe_idxmin(role_stats["mean_sim_range"])
    high_role = safe_idxmax(role_stats["mean_sim_range"])
    min_role = lookup(role_stats, low_role, "role")
    min_role_range = lookup(role_stats, low_role, "mean_sim_range")
    max_role = lookup(role_stats, high_role, "role")
    max_role_range = lookup(role_stats, high_role, "mean_sim_range")

    print(f"\n=== Model: {model} ===")
    print(stat_line("Mean sim_range:   ", mean_range))
    print(stat_line("Median sim_range: ", median_range))

    print(f"Job_id with smallest range: {min_job_id}")
    print(stat_line("Smallest range:             ", min_range))

    print(f"Job_id with largest range:  {max_job_id}")
    print(stat_line("Largest range:              ", max_range))

    print(f"Role with smallest range:   {min_role}")
    print(stat_line("Smallest role mean range:   ", min_role_range))

    print(f"Role with largest range:    {max_role}")
    print(stat_line("Largest role mean range:    ", max_role_range))

In [7]:
# Report on every per-job range summary matched by PATTERN, in name order.
files = glob.glob(PATTERN)
files.sort()
if len(files) == 0:
    # Nothing to summarize: stop with an explicit error.
    raise FileNotFoundError(f"No output files matched: {PATTERN}")

for f in files:
    summarize_file(f)


=== Model: fair_lora ===
Mean sim_range:   0.012245
Median sim_range: 0.011833
Job_id with smallest range: JOB_132
Smallest range:             0.005041
Job_id with largest range:  JOB_87
Largest range:              0.023990
Role with smallest range:   Data Scientist
Smallest role mean range:   0.009643
Role with largest range:    Research Assistant
Largest role mean range:    0.016731

=== Model: jb_v2 ===
Mean sim_range:   0.023955
Median sim_range: 0.022322
Job_id with smallest range: JOB_84
Smallest range:             0.008197
Job_id with largest range:  JOB_25
Largest range:              0.070186
Role with smallest range:   Registered Nurse (ICU)
Smallest role mean range:   0.011403
Role with largest range:    Marketing Manager
Largest role mean range:    0.040794

=== Model: jb_v3 ===
Mean sim_range:   0.026097
Median sim_range: 0.024757
Job_id with smallest range: JOB_123
Smallest range:             0.004509
Job_id with largest range:  JOB_108
Largest range:              0.07038