In [3]:
import os
from pathlib import Path

# Pin the kernel's working directory to the repository root so every relative
# path used later resolves the same way regardless of where Jupyter was started.
ROOT = Path("/home/ubuntu/deep").resolve()
assert ROOT.exists(), f"Repo root not found: {ROOT}"
os.chdir(ROOT)
print("CWD forced to:", Path.cwd())

# Environment flags exported for this process and anything it spawns
# (presumably they disable the TF/Flax backends of transformers and quiet
# TensorFlow logging -- confirm against the installed library versions).
os.environ.update(
    {
        "TRANSFORMERS_NO_TF": "1",
        "TRANSFORMERS_NO_FLAX": "1",
        "TF_CPP_MIN_LOG_LEVEL": "3",
    }
)

# Shared locations: the requirements file and the project virtualenv's interpreter.
# REQ_PATH and VENV_PY are kept as plain strings; VENV_DIR stays a Path.
VENV_DIR = ROOT / ".venv"
VENV_PY = str(VENV_DIR / "bin" / "python")
REQ_PATH = str(ROOT / "mech" / "requirements_mech.txt")

# Echo the resolved paths and confirm the requirements file is actually there.
print("REQ_PATH =", REQ_PATH)
print("VENV_PY  =", VENV_PY)
req_file_present = Path(REQ_PATH).exists()
print("Req file exists:", req_file_present)

CWD forced to: /home/ubuntu/deep
REQ_PATH = /home/ubuntu/deep/mech/requirements_mech.txt
VENV_PY  = /home/ubuntu/deep/.venv/bin/python
Req file exists: True


In [4]:
import sys, subprocess, shlex
from pathlib import Path
import os

# Master switch for the install steps in this cell: when True, the venv is
# created (if missing) and pip tooling plus the requirements file are installed
# into it; when False, the cell only prints a "skipping installs" notice.
INSTALL_DEPS = True   # set True on fresh VM / fresh venv only

def pip_install_stream(cmd: "str | list[str]", cwd: str = "/home/ubuntu/deep"):
    """Run a command and echo its combined stdout/stderr line by line as it arrives.

    Args:
        cmd: Either a shell-style command string (split with ``shlex.split``) or
            an already-split argument list. Use the list form for arguments that
            may contain spaces or quotes.
        cwd: Working directory for the child process. Defaults to the repo root
            this notebook is anchored to.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    if isinstance(cmd, str):
        argv = shlex.split(cmd)
        shown = cmd
    else:
        argv = [str(part) for part in cmd]
        shown = shlex.join(argv)

    print("\nRUN:", shown, flush=True)

    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"   # child Python should not buffer its output
    env["PIP_PROGRESS_BAR"] = "on"  # environment form of pip's --progress-bar option

    # The context manager closes the pipe and reaps the child on every exit path.
    with subprocess.Popen(
        argv,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout in one stream
        text=True,
        bufsize=1,                 # line-buffered so output shows up promptly
        env=env,
        cwd=str(cwd),
    ) as p:
        try:
            for line in p.stdout:
                print(line, end="", flush=True)
        except BaseException:
            # Interrupting the cell (or a failing print) must not leave the
            # command running in the background.
            p.terminate()
            raise

    rc = p.returncode  # set by the wait() performed when the with-block exits
    if rc != 0:
        raise RuntimeError(f"Command failed with exit code {rc}: {shown}")


# ---- PERMANENT SOLUTION: install into a venv, not system python ----
# Paths are recomputed here so this cell does not depend on another cell having run.
ROOT = Path("/home/ubuntu/deep").resolve()
REQ_PATH = str(ROOT / "mech" / "requirements_mech.txt")
VENV_DIR = ROOT / ".venv"
VENV_PY  = str(VENV_DIR / "bin" / "python")

if INSTALL_DEPS:
    # Key the check on the interpreter, not the directory: an interrupted venv
    # creation can leave the directory behind without bin/python, and every pip
    # step below would then fail. Re-running `-m venv` on an existing directory
    # fills in the missing pieces.
    if not Path(VENV_PY).exists():
        pip_install_stream(f"/usr/bin/python3 -u -m venv {shlex.quote(str(VENV_DIR))}")

    # Paths are shell-quoted because pip_install_stream splits the string with
    # shlex; unquoted, a path containing whitespace would become several arguments.
    venv_py_arg = shlex.quote(VENV_PY)

    # Always upgrade pip tooling inside the venv
    pip_install_stream(f"{venv_py_arg} -u -m pip install -U pip setuptools wheel")

    # Install your requirements inside the venv
    pip_install_stream(f"{venv_py_arg} -u -m pip install -r {shlex.quote(REQ_PATH)}")

    print("\n✅ Done installing into venv.")
    print("Next steps:")
    print("1) Kernel -> Restart (recommended)")
    print("2) From now on, run scripts with:", VENV_PY)
else:
    print("INSTALL_DEPS=False, skipping installs.")


RUN: /usr/bin/python3 -u -m venv /home/ubuntu/deep/.venv

RUN: /home/ubuntu/deep/.venv/bin/python -u -m pip install -U pip setuptools wheel
Collecting pip
  Using cached pip-25.3-py3-none-any.whl (1.8 MB)
Collecting setuptools
  Using cached setuptools-80.9.0-py3-none-any.whl (1.2 MB)
Collecting wheel
  Using cached wheel-0.45.1-py3-none-any.whl (72 kB)
Installing collected packages: wheel, setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 59.6.0
    Uninstalling setuptools-59.6.0:
      Successfully uninstalled setuptools-59.6.0
  Attempting uninstall: pip
    Found existing installation: pip 22.0.2
    Uninstalling pip-22.0.2:
      Successfully uninstalled pip-22.0.2
Successfully installed pip-25.3 setuptools-80.9.0 wheel-0.45.1

RUN: /home/ubuntu/deep/.venv/bin/python -u -m pip install -r /home/ubuntu/deep/mech/requirements_mech.txt
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu128
Collecting torch==2.7

In [1]:
from pathlib import Path

# ---- Locations ----
# Repository checkout that contains the 'data/', 'mech/' and 'outputs/' folders.
PROJECT_DIR = Path("/home/ubuntu/deep")  # edit if the checkout lives elsewhere
DATA_DIR, OUT_DIR = PROJECT_DIR.joinpath("data"), PROJECT_DIR.joinpath("outputs")
MECH_SCRIPT = PROJECT_DIR.joinpath("mech", "mech_trace.py")

# ---- Files ----
# Input responses, the traced copy written by the mech script, and the
# per-question frame deltas derived from the traced copy.
IN_JSONL = DATA_DIR.joinpath("normal_responses.jsonl")
OUT_MECH_JSONL = OUT_DIR.joinpath("normal_responses_mech.jsonl")
OUT_DELTA_JSONL = OUT_DIR.joinpath("normal_frame_deltas.jsonl")

# ---- Model / trace settings ----
# MODEL_ID: None means "infer from the first row's 'model_id'"; otherwise pass an
# explicit id such as "mistralai/Mistral-7B-Instruct-v0.3".
MODEL_ID = None
DTYPE = "bf16"            # accepted values: "bf16" or "fp16"
LAYERS = "0,8,16,24,32"   # comma-separated layer indices handed to the script

# ---- Sharding ----
# Only relevant on multi-GPU machines; the single-GPU defaults are 1 shard, id 0.
NUM_SHARDS, SHARD_ID = 1, 0

# ---- Safety knobs ----
OVERWRITE = True   # when truthy, --overwrite is added to the script command
MAX_ROWS = 0       # 0 = no limit (--max_rows is only passed for values > 0)

In [2]:
import os, json
import torch

# Report the runtime environment: working directory, torch build, and GPU (if any).
print("cwd:", os.getcwd())
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))

# Fail fast when the script or the input is missing; make sure the output folder exists.
assert MECH_SCRIPT.exists(), f"Missing mech script: {MECH_SCRIPT}"
assert IN_JSONL.exists(), f"Missing input JSONL: {IN_JSONL}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Quick input peek: count lines in binary mode so no decoding is needed.
with open(IN_JSONL, "rb") as f:
    n_lines = sum(1 for _ in f)
print("input lines:", n_lines, "bytes:", IN_JSONL.stat().st_size)

# Decode only the first record to show its schema.
with open(IN_JSONL, "r", encoding="utf-8") as f:
    first_line = next(f, None)
if first_line is None:
    # An empty file would otherwise surface as a bare StopIteration.
    raise ValueError(f"Input JSONL is empty: {IN_JSONL}")
first = json.loads(first_line)

print("first keys:", list(first.keys()))
print("model_id:", first.get("model_id"))
# `or ""` also covers fields that are present but null in the JSON.
print(
    "prompt chars:", len(first.get("prompt") or ""),
    "response chars:", len(first.get("response") or ""),
)

cwd: /home/ubuntu
torch: 2.7.0+cu128
cuda available: True
gpu: NVIDIA A100-SXM4-40GB
input lines: 6000 bytes: 8793483
first keys: ['id', 'model_id', 'model_label', 'question_id', 'base_question', 'frame', 'prompt', 'response', 'prompt_tokens', 'completion_tokens']
model_id: mistralai/Mistral-7B-Instruct-v0.3
prompt chars: 182 response chars: 708


In [4]:
import subprocess, time
from pathlib import Path

import shlex  # used only to render the command line for display

# Run the tracing script with the project venv's interpreter, not the kernel's.
ROOT = Path("/home/ubuntu/deep")
VENV_PY = ROOT / ".venv" / "bin" / "python"

assert VENV_PY.exists(), f"Venv python not found: {VENV_PY}"

# Required arguments first; optional flags are appended only when configured.
cmd = [
    str(VENV_PY), str(MECH_SCRIPT),
    "--in_jsonl", str(IN_JSONL),
    "--out_jsonl", str(OUT_MECH_JSONL),
    "--layers", LAYERS,
    "--dtype", DTYPE,
    "--num_shards", str(NUM_SHARDS),
    "--shard_id", str(SHARD_ID),
]
if MODEL_ID:
    cmd += ["--model_id", MODEL_ID]
if MAX_ROWS and int(MAX_ROWS) > 0:
    cmd += ["--max_rows", str(MAX_ROWS)]
if OVERWRITE:
    cmd += ["--overwrite"]

# shlex.join quotes arguments where needed, so the printed line can be pasted
# into a shell as-is.
print("Running:\n ", shlex.join(cmd))

# perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments.
t0 = time.perf_counter()
# Run from the repo root so the child does not depend on whatever directory the
# kernel happened to start in. check_call raises CalledProcessError on failure.
subprocess.check_call(cmd, cwd=str(ROOT))
print(f"Done in {time.perf_counter()-t0:.1f}s")

Running:
  /home/ubuntu/deep/.venv/bin/python /home/ubuntu/deep/mech/mech_trace.py --in_jsonl /home/ubuntu/deep/data/normal_responses.jsonl --out_jsonl /home/ubuntu/deep/outputs/normal_responses_mech.jsonl --layers 0,8,16,24,32 --dtype bf16 --num_shards 1 --shard_id 0 --overwrite


Fetching 3 files: 100%|██████████| 3/3 [00:07<00:00,  2.38s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.14it/s]


Done in 225.1s


In [5]:
import json

# Stop here if the tracing step did not produce its output file.
assert OUT_MECH_JSONL.exists(), f"Missing output: {OUT_MECH_JSONL}"

# Size of the output: number of lines (binary read, no decoding) and bytes on disk.
with open(OUT_MECH_JSONL, "rb") as f:
    out_lines = sum(1 for _ in f)
print("output lines:", out_lines, "bytes:", OUT_MECH_JSONL.stat().st_size)

# Decode just the first record to show the layout of its "mech" payload.
with open(OUT_MECH_JSONL, "r", encoding="utf-8") as f:
    r = json.loads(next(f))

mech = r["mech"]
print("mech keys:", mech.keys())

# Layer keys are strings in the JSON; order them numerically for display.
layer_ids = sorted(mech["by_layer"].keys(), key=int)
print("layers captured:", layer_ids)

# Show the entry for the lowest-numbered layer as an example.
k0 = layer_ids[0]
print("layer", k0, "example:", mech["by_layer"][k0])

output lines: 6000 bytes: 14754198
mech keys: dict_keys(['prompt_len', 'first_answer_token_id', 'true_next_token_logp_first', 'by_layer', 'content_first'])
layers captured: ['0', '8', '16', '24', '32']
layer 0 example: {'h_norm': 0.004940052516758442, 'logit_lens_logp_first': -10.4375}


In [6]:
import json, math
from collections import defaultdict

# Source (traced rows) and destination (frame deltas) for this step.
INP, OUT = OUT_MECH_JSONL, OUT_DELTA_JSONL

# Read every traced row into memory, one JSON object per line.
with open(INP, "r", encoding="utf-8") as f:
    rows = [json.loads(raw_line) for raw_line in f]

# Index rows as groups[(question_id, model_label)][frame] -> row.
# A later row with the same key and frame replaces the earlier one.
groups = defaultdict(dict)
for r in rows:
    group_key = (r["question_id"], r.get("model_label", ""))
    groups[group_key][r["frame"]] = r

def vec_from_by_layer(by_layer, field, layer_keys):
    """Collect `field` from the per-layer records, following `layer_keys` order.

    Keys listed in `layer_keys` but absent from `by_layer` are skipped, so the
    result can be shorter than `layer_keys`.
    """
    values = []
    for layer in layer_keys:
        if layer in by_layer:
            values.append(by_layer[layer][field])
    return values

def l2(a, b):
    """Return the Euclidean distance between two equal-length sequences.

    Returns None when either sequence is empty (or None) or when the lengths
    differ, i.e. whenever the two vectors cannot be compared element-wise.
    """
    comparable = bool(a) and bool(b) and len(a) == len(b)
    if not comparable:
        return None
    squared_diffs = ((x - y) ** 2 for x, y in zip(a, b))
    return math.sqrt(sum(squared_diffs))

# Counters: delta rows written, and groups skipped for lacking a frame.
written = missing = 0

with open(OUT, "w", encoding="utf-8") as fo:
    for (qid, mlabel), fr in groups.items():
        # A delta needs all three frames of the same question/model pair.
        required_frames = ("casual", "evaluation", "oversight")
        if any(name not in fr for name in required_frames):
            missing += 1
            continue

        # The casual frame defines which layers are compared, in numeric order.
        layer_keys = sorted(fr["casual"]["mech"]["by_layer"].keys(), key=int)

        # Per frame: the first-token log-prob and the logit-lens curve over layer_keys.
        logp = {}
        curve = {}
        for name in required_frames:
            mech = fr[name]["mech"]
            logp[name] = mech["true_next_token_logp_first"]
            curve[name] = vec_from_by_layer(mech["by_layer"], "logit_lens_logp_first", layer_keys)

        # Deltas are always "frame minus casual"; l2 yields None for curves
        # that cannot be compared (empty or different lengths).
        record = {
            "question_id": qid,
            "model_label": mlabel,
            "layers": layer_keys,

            "delta_E_minus_C_true_logp": logp["evaluation"] - logp["casual"],
            "delta_O_minus_C_true_logp": logp["oversight"] - logp["casual"],

            "delta_E_minus_C_logitlens_L2": l2(curve["evaluation"], curve["casual"]),
            "delta_O_minus_C_logitlens_L2": l2(curve["oversight"], curve["casual"]),
        }
        fo.write(json.dumps(record, ensure_ascii=False) + "\n")
        written += 1

print("Wrote:", OUT)
print("delta rows:", written, "groups missing frames:", missing)

Wrote: /home/ubuntu/deep/outputs/normal_frame_deltas.jsonl
delta rows: 2000 groups missing frames: 0


In [7]:
import json, statistics

# Accumulators: true-logp deltas (dE, dO) and logit-lens L2 distances (lE, lO),
# each for evaluation-minus-casual (E) and oversight-minus-casual (O).
dE, dO, lE, lO = [], [], [], []

with open(OUT_DELTA_JSONL, "r", encoding="utf-8") as f:
    for r in map(json.loads, f):
        dE.append(r["delta_E_minus_C_true_logp"])
        dO.append(r["delta_O_minus_C_true_logp"])

        # L2 entries may be null in the file; those are left out of the lists.
        e_dist = r["delta_E_minus_C_logitlens_L2"]
        if e_dist is not None:
            lE.append(e_dist)
        o_dist = r["delta_O_minus_C_logitlens_L2"]
        if o_dist is not None:
            lO.append(o_dist)

def summarize(xs):
    """Return count, mean, median ("p50"), min and max of `xs` as a dict.

    For an empty input every statistic except "n" is None.
    """
    if not xs:
        return {"n": len(xs), "mean": None, "p50": None, "min": None, "max": None}

    stats = {"n": len(xs)}
    stats["mean"] = statistics.fmean(xs)
    stats["p50"] = statistics.median(xs)
    stats["min"] = min(xs)
    stats["max"] = max(xs)
    return stats

# One summary line per metric: label first, then the stats dict from summarize().
for label, values in (
    ("Δ true_logp (E−C):", dE),
    ("Δ true_logp (O−C):", dO),
    ("Δ logitlens L2 (E−C):", lE),
    ("Δ logitlens L2 (O−C):", lO),
):
    print(label, summarize(values))

Δ true_logp (E−C): {'n': 2000, 'mean': 11.619572394490241, 'p50': 14.997802734375, 'min': -18.7890625, 'max': 27.874977469444275}
Δ true_logp (O−C): {'n': 2000, 'mean': 11.62438295328617, 'p50': 14.7822265625, 'min': -20.04296875, 'max': 27.87498950958252}
Δ logitlens L2 (E−C): {'n': 2000, 'mean': 14.596004108933785, 'p50': 16.26155812021453, 'min': 0.35325655216908664, 'max': 29.023400632698806}
Δ logitlens L2 (O−C): {'n': 2000, 'mean': 14.520259103834649, 'p50': 16.16494406319837, 'min': 0.40911442274854953, 'max': 28.97601565049169}
