In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Start from a clean checkout: delete any previous clone, clone the repo again,
# then list the checkout's contents.
# NOTE: `rm -rf` also discards any uncommitted edits made inside the old checkout.
!rm -rf /content/multi-challenge
!git clone https://github.com/iemppu/multi-challenge /content/multi-challenge
!ls -la /content/multi-challenge


Cloning into '/content/multi-challenge'...
remote: Enumerating objects: 148, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (134/134), done.[K
remote: Total 148 (delta 69), reused 58 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (148/148), 3.26 MiB | 7.21 MiB/s, done.
Resolving deltas: 100% (69/69), done.
total 40
drwxr-xr-x 5 root root 4096 Jan  4 00:39 .
drwxr-xr-x 1 root root 4096 Jan  4 00:39 ..
drwxr-xr-x 3 root root 4096 Jan  4 00:39 data
drwxr-xr-x 8 root root 4096 Jan  4 00:39 .git
-rw-r--r-- 1 root root 3853 Jan  4 00:39 main.py
-rw-r--r-- 1 root root 3968 Jan  4 00:39 README.md
-rw-r--r-- 1 root root  129 Jan  4 00:39 requirements.txt
-rw-r--r-- 1 root root 3923 Jan  4 00:39 run_judge_eval.py
-rw-r--r-- 1 root root 2211 Jan  4 00:39 run_slsm_multichallenge_gpt4o.py
drwxr-xr-x 3 root root 4096 Jan  4 00:39 src


In [None]:
%cd /content/multi-challenge
!pip install -r requirements.txt

/content/multi-challenge


In [None]:
import os
from google.colab import userdata

# Read the key from Colab's secret store so it is never hard-coded in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Fail fast with a clear message: an empty/None key would otherwise only surface
# later as an obscure error (os.environ rejects None; an empty key fails at request time).
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Colab secret 'OPENAI_API_KEY' is empty or missing; "
        "add it to the notebook's secrets and grant this notebook access."
    )

# Export the key through the environment for code run later in this notebook
# (presumably the OpenAI client reads it from there -- confirm in src/models/openai).
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


# Implementation of SLSM

In [None]:
from src.data_loader import DataLoader

# Benchmark file path, relative to the repo root (matches the repo layout).
BENCHMARK = "data/benchmark_questions.jsonl"

# Load the benchmark file, then retrieve the conversations from the loader.
dl = DataLoader(input_file=BENCHMARK)
dl.load_data()

conversations = dl.get_conversations()

# Sanity check: how many conversations were loaded and what type each element is.
print(f"Loaded {len(conversations)} conversations")
print(type(conversations[0]))


Loaded 273 conversations
<class 'src.conversation.Conversation'>


In [None]:
# Look at the message structure of the first conversation.
conv = conversations[0]
messages = conv.conversation   # list of {"role", "content"} dicts (see the keys printed below)

print(messages[0].keys())      # expected: a dict with 'role' and 'content' keys


dict_keys(['role', 'content'])


In [None]:
# =========================
# Sanity check on one conversation:
# print it in full, then compare the baseline reply with the SLSM-wrapped reply
# =========================

# ---- imports ----
from src.data_loader import DataLoader
from src.models.openai import OpenAIModel
from src.slsm_wrapper import SLSMConfig, SLSMController, SLSMWrapper

# ---- load conversations ----
BENCHMARK = "data/benchmark_questions.jsonl"
dl = DataLoader(input_file=BENCHMARK)
dl.load_data()
conversations = dl.get_conversations()

# ---- pick one conversation ----
conv = conversations[0]
messages = conv.conversation

# ---- print full conversation (indexed) ----
print("=== FULL CONVERSATION (indexed) ===")
for turn_idx, turn in enumerate(messages):
    text = turn.get("content") or ""
    # Long turns are cut at 500 characters to keep the output readable.
    if len(text) > 500:
        text = text[:500] + " ...[truncated]"
    print(f"\n--- turn {turn_idx} | {turn.get('role')} ---\n{text}")

# ---- controller (cheap, fixed) ----
controller_llm = OpenAIModel(model="gpt-4o-mini", temp=0)

# inject="always": temporarily force note injection for this check.
cfg = SLSMConfig(inject="always", note_max_items=6)

controller = SLSMController(controller_llm, cfg)
wrapper = SLSMWrapper(controller, cfg)

# ---- underlying model (the model under test) ----
underlying_llm = OpenAIModel(model="gpt-4o-2024-08-06", temp=0)

# ---- baseline: the underlying model answers the raw conversation ----
baseline_resp = underlying_llm.generate(messages)

# ---- SLSM wrapped: same model, called through the wrapper ----
slsm_resp = wrapper.generate_last_turn(
    underlying_llm=underlying_llm,
    original_conversation=messages,
)

# Show the first 500 characters of each reply, baseline first.
for title, reply in (("BASELINE", baseline_resp), ("SLSM", slsm_resp)):
    print(f"\n=== {title} ===")
    print(reply[:500])

# ---- inspect state + injected note ----
state = wrapper.track_state(messages)
print("\n=== RAW STATE FACTS (controller output, pre-gating) ===")
print(state.facts)

msgs = wrapper.build_final_messages(messages, state)
print("\n=== FINAL MSG ROLES (first 6) ===")
print([msg["role"] for msg in msgs[:6]])

# NOTE(review): this assumes the injected note is the first message -- confirm in SLSMWrapper.
print("\n=== INJECTED NOTE (first 1200 chars) ===")
first_msg = msgs[0]
print(first_msg["role"], first_msg["content"][:1200])


=== FULL CONVERSATION (indexed) ===

--- turn 0 | user ---
Hello!  I am an International relations expert working at the UN headquarters. My work requires me to consistently meet with diplomats from various countries. I hate using taxis or public transportation in New York. I prefer venues that are within a 5-minute walk from the UN headquarters.

--- turn 1 | assistant ---
Hello! It's great to hear that you're looking for places near the UN headquarters in New York. Here are a few suggestions that are within a 5-minute walk:

1. Dag Hammarskjold Plaza: This public park is just across the street from the UN headquarters. It's a great place for a peaceful walk or a quick meeting.

2. The Delegates Dining Room: Located within the UN headquarters itself, this is a convenient place for a meeting over lunch or dinner.

3. The Roosevelt Hotel: This historic hotel is just ...[truncated]

--- turn 2 | user ---
 I am meeting a German diplomat on Friday. I am looking for a suitable place to have

In [None]:
# Pull the latest code from the remote into this runtime.
# `git status` first shows any local modifications before the pull.
%cd /content/multi-challenge
!git status
!git pull

/content/multi-challenge
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   run_slsm_multichallenge_gpt4o.py[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31msrc/__pycache__/[m
	[31msrc/models/__pycache__/[m

no changes added to commit (use "git add" and/or "git commit -a")
Already up to date.


# Run and test: GPT-4o responses controlled by a GPT-4o-mini SLSM controller

In [None]:
# Generate the SLSM-controlled GPT-4o responses; the script prints where it saved them.
# The logged run below took about an hour for 50 conversations.
!python run_slsm_multichallenge_gpt4o.py

Loaded 273 conversations
Running SLSM-controlled GPT-4o: 100% 50/50 [1:02:56<00:00, 75.52s/it]

Done. Results saved to:
  data/final_model_responses/gpt-4o-2024-08-06_slsm-gpt-4o-mini.jsonl


In [None]:
import json
from itertools import islice

# Number of leading baseline responses to copy into the subset file.
N_FIRST = 50

src = "data/final_model_responses/gpt-4o-2024-08-06_responses.jsonl"
dst = "data/final_model_responses/gpt-4o-2024-08-06_responses_first50.jsonl"

# islice stops cleanly at end-of-file; the previous `next(f)` inside a list
# comprehension raised a bare StopIteration when the file had fewer lines.
with open(src, "r", encoding="utf-8") as f:
    lines = list(islice(f, N_FIRST))

if len(lines) < N_FIRST:
    print(f"Warning: {src} has only {len(lines)} lines (expected at least {N_FIRST})")

with open(dst, "w", encoding="utf-8") as g:
    g.writelines(lines)

print("Wrote:", dst, "lines=", len(lines))

Wrote: data/final_model_responses/gpt-4o-2024-08-06_responses_first50.jsonl lines= 50


In [None]:
# a: baseline responses (first-50 subset file); b: SLSM-controlled responses.
a = "data/final_model_responses/gpt-4o-2024-08-06_responses_first50.jsonl"
b = "data/final_model_responses/gpt-4o-2024-08-06_slsm-gpt-4o-mini.jsonl"

def ids(path):
    """Return the question IDs found in a JSONL responses file, in file order.

    The response files in this notebook do not agree on the key spelling
    (the judge script reads "QUESTION_ID"), so each record is checked for
    "question_id", "QUESTION_ID" and "qid", in that order. Blank lines are skipped.

    Args:
        path: Path to a UTF-8 JSONL file with one JSON object per line.

    Returns:
        list: One question ID per non-blank line.

    Raises:
        KeyError: If a record contains none of the accepted ID keys.
    """
    id_keys = ("question_id", "QUESTION_ID", "qid")
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue  # tolerate blank/trailing lines
            obj = json.loads(line)
            for key in id_keys:
                if key in obj:
                    out.append(obj[key])
                    break
            else:
                raise KeyError(
                    f"No question id key {id_keys} on line {lineno} of {path}; "
                    f"found keys: {list(obj.keys())}"
                )
    return out

# Compare the question IDs of the two files: counts, duplicates, order and membership.
A = ids(a)
B = ids(b)
for label, qids in (("baseline lines:", A), ("slsm lines:", B)):
    print(label, len(qids), "unique:", len(set(qids)))
print("same order:", A == B)
print("same set:", set(A) == set(B))

KeyError: 'question_id'

In [None]:
# =========================
# Run judge-based evaluation on first 50 samples
# This cell evaluates the baseline GPT-4o responses only;
# the SLSM-controlled responses are evaluated in a later cell.
# =========================

%cd /content/multi-challenge
!mkdir -p outputs_first50

# ---- Baseline: GPT-4o (first 50) ----
# run_judge_eval reads the "QUESTION_ID" key from each line of the responses
# file (see the traceback in the output below), so the input must use that key.
!python -m run_judge_eval \
  --responses data/final_model_responses/gpt-4o-2024-08-06_responses_first50.jsonl \
  --out_json outputs_first50/gpt4o_baseline_judge_results.json \
  --out_csv outputs_first50/gpt4o_baseline_judge_results.csv \
  --workers 1 \
  --attempts 1


print("Done 1. Judge evaluation results saved to outputs_first50/")


/content/multi-challenge
Evaluating responses: 100% 50/50 [01:22<00:00,  1.66s/it]

=== SCORES ===
{
  "overall_score": 0.5835577786328074,
  "axis_scores": {
    "INFERENCE_MEMORY": 0.8849557522123894,
    "RELIABLE_VERSION_EDITING": 0.0,
    "SELF_COHERENCE": 0.0,
    "INSTRUCTION_RETENTION": 1.4492753623188406
  }
}

Saved: outputs_first50/gpt4o_baseline_judge_results.json
Saved: outputs_first50/gpt4o_baseline_judge_results.csv
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/multi-challenge/run_judge_eval.py", line 101, in <module>
    main()
  File "/content/multi-challenge/run_judge_eval.py", line 81, in main
    responses = load_responses_jsonl(args.responses)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/multi-challenge/run_judge_eval.py", line 55, in load_responses_jsonl
    qid = obj["QUESTION_ID"]
          ~~~^^^^^^^^^^^^^^^
KeyError: 'QUESTION

In [None]:
# =========================
# Fix SLSM response JSONL format for run_judge_eval (QUESTION_ID key)
# then run judge eval
# =========================

%cd /content/multi-challenge
!mkdir -p outputs_first50

import json

src = "data/final_model_responses/gpt-4o-2024-08-06_slsm-gpt-4o-mini.jsonl"
dst = "data/final_model_responses/gpt-4o-2024-08-06_slsm-gpt-4o-mini_mcformat.jsonl"

n = 0
with open(src, "r", encoding="utf-8") as f, open(dst, "w", encoding="utf-8") as g:
    for line in f:
        obj = json.loads(line)

        # ---- normalize keys ----
        # qid
        qid = obj.get("QUESTION_ID", obj.get("question_id", obj.get("qid")))
        if qid is None:
            raise KeyError(f"Missing question id in line: {obj.keys()}")

        # response
        resp = obj.get("RESPONSE", obj.get("response", obj.get("answer")))
        if resp is None:
            raise KeyError(f"Missing response text in line: {obj.keys()}")

        # model name (optional)
        model = obj.get("MODEL", obj.get("model", "UNKNOWN_MODEL"))

        out = {
            "QUESTION_ID": qid,
            "MODEL": model,
            "RESPONSE": resp,
        }
        g.write(json.dumps(out, ensure_ascii=False) + "\n")
        n += 1

print(f"Wrote {n} lines -> {dst}")

# ---- now run judge eval on SLSM file (fixed format) ----
!python -m run_judge_eval \
  --responses data/final_model_responses/gpt-4o-2024-08-06_slsm-gpt-4o-mini_mcformat.jsonl \
  --out_json outputs_first50/gpt4o_slsm_judge_results.json \
  --out_csv outputs_first50/gpt4o_slsm_judge_results.csv \
  --workers 1 \
  --attempts 1

print("Done. SLSM judge results saved to outputs_first50/")


/content/multi-challenge
Wrote 50 lines -> data/final_model_responses/gpt-4o-2024-08-06_slsm-gpt-4o-mini_mcformat.jsonl
Evaluating responses: 100% 50/50 [01:33<00:00,  1.86s/it]

=== SCORES ===
{
  "overall_score": 3.7574708221110686,
  "axis_scores": {
    "INFERENCE_MEMORY": 0.8849557522123894,
    "RELIABLE_VERSION_EDITING": 0.0,
    "SELF_COHERENCE": 4.0,
    "INSTRUCTION_RETENTION": 10.144927536231885
  }
}

Saved: outputs_first50/gpt4o_slsm_judge_results.json
Saved: outputs_first50/gpt4o_slsm_judge_results.csv
Done. SLSM judge results saved to outputs_first50/


In [None]:
# =========================
# Compare judge CSV: Baseline vs SLSM (first 50)
# Produces: summary CSV + LaTeX table + win-rate stats
# =========================

%cd /content/multi-challenge

import pandas as pd
import numpy as np
from pathlib import Path

# ---- configure paths ----
BASE_CSV = Path("outputs_first50/gpt4o_baseline_judge_results.csv")
SLSM_CSV = Path("outputs_first50/gpt4o_slsm_judge_results.csv")

assert BASE_CSV.exists(), f"Missing: {BASE_CSV}"
assert SLSM_CSV.exists(), f"Missing: {SLSM_CSV}"

base = pd.read_csv(BASE_CSV)
slsm = pd.read_csv(SLSM_CSV)

print("Baseline CSV:", BASE_CSV, "| rows:", len(base), "| cols:", len(base.columns))
print("SLSM CSV    :", SLSM_CSV, "| rows:", len(slsm), "| cols:", len(slsm.columns))

# ---- find ID column (judge_eval expects QUESTION_ID usually) ----
id_candidates = ["QUESTION_ID", "question_id", "qid", "id"]
id_col = None
for c in id_candidates:
    if c in base.columns and c in slsm.columns:
        id_col = c
        break
if id_col is None:
    # fallback: any common column containing 'id'
    commons = set(base.columns) & set(slsm.columns)
    for c in commons:
        if "id" in c.lower():
            id_col = c
            break
assert id_col is not None, f"Cannot find common ID column. baseline cols={list(base.columns)}"

# ---- merge and suffix ----
df = base.merge(slsm, on=id_col, how="inner", suffixes=("_base", "_slsm"))
assert len(df) > 0, "Merged dataframe is empty. IDs between baseline and SLSM don't match."
print("ID column:", id_col, "| merged rows:", len(df))

# ---- detect metric pairs (numeric columns that exist in both with suffixes) ----
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols = [c for c in num_cols if c != id_col]

pairs = []
for c in num_cols:
    if c.endswith("_base"):
        c2 = c[:-5] + "_slsm"
        if c2 in df.columns:
            pairs.append((c, c2))

assert pairs, f"No paired metrics found. Numeric cols = {num_cols[:30]}"

# ---- bootstrap CI helper ----
# Fixed seed so the resampling (and therefore the reported CIs) is reproducible.
rng = np.random.default_rng(0)

def bootstrap_ci_mean_diff(x, y, n_boot=5000, alpha=0.05):
    """
    Paired bootstrap confidence interval for the mean of (y - x).

    Args:
        x, y: Equal-length 1-D array-likes of paired observations. Numeric or
            boolean values are accepted; both are converted to float before
            subtracting (NumPy does not support `-` between boolean arrays).
        n_boot: Number of bootstrap resamples.
        alpha: Two-sided significance level; the interval covers 1 - alpha.

    Returns:
        (mean_diff, ci_low, ci_high) as Python floats. For empty input all
        three values are NaN.

    Note:
        Draws from the module-level `rng`, so the result depends on how many
        draws were made before this call.
    """
    diffs = np.asarray(y, dtype=float) - np.asarray(x, dtype=float)
    n = diffs.shape[0]
    if n == 0:
        # Nothing to resample.
        nan = float("nan")
        return nan, nan, nan
    # Each row of idx is one resample (with replacement) of the n paired differences.
    idx = rng.integers(0, n, size=(n_boot, n))
    boot_means = diffs[idx].mean(axis=1)
    lo = np.quantile(boot_means, alpha / 2)
    hi = np.quantile(boot_means, 1 - alpha / 2)
    return float(diffs.mean()), float(lo), float(hi)

def _paired_summary_row(base_col, slsm_col):
    """Summarise one (baseline, SLSM) column pair of `df` as a dict of statistics."""
    x = df[base_col].to_numpy()
    y = df[slsm_col].to_numpy()

    base_mean = float(np.mean(x))
    slsm_mean = float(np.mean(y))
    dmean, dlo, dhi = bootstrap_ci_mean_diff(x, y)

    return {
        "metric": base_col[:-5],  # strip the "_base" suffix
        "baseline_mean": base_mean,
        "slsm_mean": slsm_mean,
        "delta_mean": dmean,
        "delta_ci_low": dlo,
        "delta_ci_high": dhi,
        # Fractions of paired rows where SLSM is higher / equal / lower.
        "win_rate": float(np.mean(y > x)),
        "tie_rate": float(np.mean(y == x)),
        "lose_rate": float(np.mean(y < x)),
        "n": int(len(x)),
    }


# One row per metric pair, in the order the pairs were detected.
rows = [_paired_summary_row(base_col, slsm_col) for base_col, slsm_col in pairs]

summary = pd.DataFrame(rows).sort_values("metric").reset_index(drop=True)

# ---- pretty print ----
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)
print("\n=== METRIC SUMMARY (means / Δ / CI / win-rate) ===")
display(summary)

# ---- build LaTeX (booktabs) ----
def fmt(x, nd=3):
    """Format `x` as a fixed-point string with `nd` digits after the decimal point."""
    return format(x, f".{nd}f")

def _latex_escape(text):
    """Escape LaTeX special characters so `text` is safe in text mode.

    Metric names such as `attempt_number` contain `_`, which LaTeX rejects
    outside math mode; unescaped, the generated table does not compile.
    Characters are mapped one by one so nothing is escaped twice.
    """
    specials = {
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    }
    return "".join(specials.get(ch, ch) for ch in str(text))


# Table preamble (booktabs rules, five columns).
latex_lines = [r"\begin{table}[t]",
               r"\centering",
               r"\small",
               r"\begin{tabular}{lrrrr}",
               r"\toprule",
               r"Metric & Baseline & SLSM & $\Delta$ (boot 95\% CI) & Win-rate \\",
               r"\midrule"]

# One table row per metric in the summary frame.
for _, r in summary.iterrows():
    metric = _latex_escape(r["metric"])
    base_m = fmt(r["baseline_mean"])
    slsm_m = fmt(r["slsm_mean"])
    d = fmt(r["delta_mean"])
    lo = fmt(r["delta_ci_low"])
    hi = fmt(r["delta_ci_high"])
    win = f"{100*r['win_rate']:.1f}" + r"\%"
    latex_lines.append(f"{metric} & {base_m} & {slsm_m} & {d} [{lo}, {hi}] & {win} \\\\")

latex_lines += [r"\bottomrule",
                r"\end{tabular}",
                r"\caption{Judge scores on the first 50 Multi-Challenge samples: baseline GPT-4o vs SLSM-controlled GPT-4o (controller: GPT-4o-mini). $\Delta$ denotes SLSM minus baseline with paired bootstrap 95\% confidence intervals; win-rate is the fraction of samples where SLSM scores strictly higher than baseline.}",
                r"\label{tab:mc_first50_judge}",
                r"\end{table}"]

latex = "\n".join(latex_lines)

print("\n=== LaTeX TABLE (copy/paste) ===\n")
print(latex)

# ---- save artifacts ----
out_dir = Path("outputs_first50")
out_dir.mkdir(parents=True, exist_ok=True)

summary_path = out_dir / "baseline_vs_slsm_summary_first50.csv"
tex_path = out_dir / "baseline_vs_slsm_table_first50.tex"

summary.to_csv(summary_path, index=False)
tex_path.write_text(latex, encoding="utf-8")

print("\nSaved summary CSV:", summary_path)
print("Saved LaTeX table :", tex_path)

# ---- quick one-line headline (overall, if present) ----
def find_overall_metric_name():
    """Return the first overall-score name present in `summary["metric"]`, or None.

    Names are tried in this order: overall_score, OVERALL_SCORE, score, SCORE.
    """
    present = summary["metric"].values
    return next(
        (name for name in ("overall_score", "OVERALL_SCORE", "score", "SCORE") if name in present),
        None,
    )

# Print a one-line headline for the overall metric, when the summary has one.
overall_name = find_overall_metric_name()
if overall_name:
    r = summary[summary["metric"] == overall_name].iloc[0]
    headline = " | ".join([
        f"baseline={r['baseline_mean']:.3f}",
        f"slsm={r['slsm_mean']:.3f}",
        f"Δ={r['delta_mean']:.3f} [{r['delta_ci_low']:.3f}, {r['delta_ci_high']:.3f}]",
        f"win={100*r['win_rate']:.1f}%",
    ])
    print(f"\nHeadline ({overall_name}): {headline}")


/content/multi-challenge
Baseline CSV: outputs_first50/gpt4o_baseline_judge_results.csv | rows: 273 | cols: 11
SLSM CSV    : outputs_first50/gpt4o_slsm_judge_results.csv | rows: 273 | cols: 11
ID column: question_id | merged rows: 273

=== METRIC SUMMARY (means / Δ / CI / win-rate) ===


Unnamed: 0,metric,baseline_mean,slsm_mean,delta_mean,delta_ci_low,delta_ci_high,win_rate,tie_rate,lose_rate,n
0,attempt_number,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,273



=== LaTeX TABLE (copy/paste) ===

\begin{table}[t]
\centering
\small
\begin{tabular}{lrrrr}
\toprule
Metric & Baseline & SLSM & $\Delta$ (boot 95\% CI) & Win-rate \\
\midrule
attempt_number & 1.000 & 1.000 & 0.000 [0.000, 0.000] & 0.0\% \\
\bottomrule
\end{tabular}
\caption{Judge scores on the first 50 Multi-Challenge samples: baseline GPT-4o vs SLSM-controlled GPT-4o (controller: GPT-4o-mini). $\Delta$ denotes SLSM minus baseline with paired bootstrap 95\% confidence intervals; win-rate is the fraction of samples where SLSM scores strictly higher than baseline.}
\label{tab:mc_first50_judge}
\end{table}

Saved summary CSV: outputs_first50/baseline_vs_slsm_summary_first50.csv
Saved LaTeX table : outputs_first50/baseline_vs_slsm_table_first50.tex
