# Mirage-Aware Base Model Evaluation
Evaluates the base Qwen2.5-7B-Instruct model on a fixed, seeded subset of the validation set. The raw and parsed responses are saved and downloaded so they can be uploaded to the main training+eval notebook.


In [None]:
!pip install -q transformers==4.46.3 peft==0.13.2 trl==0.12.2 accelerate==1.1.1 bitsandbytes>=0.46.1 datasets
print("Dependencies installed.")


In [None]:
# Verify GPU
import torch
# Fail fast when the runtime has no CUDA device, then report device 0.
assert torch.cuda.is_available(), "No GPU detected!"
gpu_name = torch.cuda.get_device_name(0)
gpu_props = torch.cuda.get_device_properties(0)
gpu_mem = gpu_props.total_memory / 1e9  # bytes -> GB (decimal)
print(f"GPU: {gpu_name} ({gpu_mem:.1f} GB)")


In [None]:
import os
import json
from google.colab import files

# Location where the validation set is expected inside the Colab runtime.
DATA_DIR = "/content/training_data"
os.makedirs(DATA_DIR, exist_ok=True)

valid_path = os.path.join(DATA_DIR, "valid.jsonl")

if not os.path.exists(valid_path):
    print("Upload valid.jsonl:")
    # The upload result is iterated as (filename, raw bytes) pairs and each
    # file is written into DATA_DIR under its uploaded name.
    uploaded = files.upload()
    for fname, content in uploaded.items():
        dest = os.path.join(DATA_DIR, fname)
        with open(dest, "wb") as f:
            f.write(content)
        print(f"  Saved {fname} -> {dest}")
    # The upload may have been cancelled or the file may carry another name
    # (e.g. "valid (1).jsonl"); fail here with a clear message instead of
    # letting the line count below raise an opaque error.
    if not os.path.exists(valid_path):
        raise FileNotFoundError(
            f"Expected {valid_path} after upload but received {sorted(uploaded)}. "
            "Upload a file named exactly valid.jsonl."
        )
else:
    print(f"Data already present at {DATA_DIR}")

# Quick verify: count the lines, closing the file handle deterministically.
with open(valid_path) as f:
    count = sum(1 for _ in f)
print(f"Valid: {count} examples")


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face identifier of the base model under evaluation.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

print(f"Loading {MODEL_ID} in bfloat16...")
# Weights are loaded in bfloat16; device placement is delegated to the library.
load_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# generate() is later given tokenizer.pad_token_id, so make sure one exists:
# fall back to the EOS token when the tokenizer defines no pad token.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Model loaded. Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")


In [None]:
import re

def generate_response(model, tokenizer, prompt, max_new_tokens=512):
    """Generate a greedy completion for a single user prompt.

    The prompt is wrapped as a one-turn chat, rendered with the tokenizer's
    chat template (including the generation prompt), and decoded greedily.
    Only the newly generated tokens are decoded and returned as a string.
    """
    chat = [{"role": "user", "content": prompt}]
    rendered = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]

    generation_args = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,  # greedy decoding
        "pad_token_id": tokenizer.pad_token_id,
    }
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_args)

    # The returned sequence starts with the prompt tokens; drop them.
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


def extract_pivot(text):
    """Return the pivot ID found in ``text``, or "UNKNOWN" if there is none.

    An explicit ``PIVOT_ID = <id>`` declaration is preferred; failing that,
    the first pivot-shaped ID anywhere in the text is used. IDs have the
    form one uppercase letter, 1-4 digits, ``-E``, and three digits.
    """
    patterns = (
        r"PIVOT_ID\s*=\s*([A-Z]\d{1,4}-E\d{3})",  # explicit declaration
        r"([A-Z]\d{1,4}-E\d{3})",  # fallback: any pivot-shaped ID
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found is not None:
            return found.group(1)
    return "UNKNOWN"


def check_degraded_flag(text):
    """Return True when the response signals degraded evidence (loose check).

    The explicit "Evidence assessment: DEGRADED" verdict is tried first
    (case-insensitive). Otherwise any of the looser marker phrases appearing
    anywhere in the lower-cased text counts as a flag.
    """
    explicit_verdict = re.search(
        r"Evidence assessment:\s*DEGRADED", text, re.IGNORECASE
    )
    if explicit_verdict is not None:
        return True

    markers = (
        "degraded",
        "incomplete",
        "missing prerequisite",
        "compressed evidence",
        "not found in context",
        "confidence: low",
        "evidence_status=degraded",
    )
    haystack = text.lower()
    for marker in markers:
        if marker in haystack:
            return True
    return False


def check_format_adherence(text):
    """Return True when the response follows the structured output protocol.

    Both parts are required: a ``PIVOT_ID =`` declaration (case-sensitive)
    and an ``Evidence assessment:`` verdict of STRONG or DEGRADED
    (case-insensitive).
    """
    if re.search(r"PIVOT_ID\s*=", text) is None:
        return False
    verdict = re.search(
        r"Evidence assessment:\s*(STRONG|DEGRADED)", text, re.IGNORECASE
    )
    return verdict is not None


print("Helper functions defined.")


In [None]:
import random
from collections import Counter

# Load the validation set: one JSON object per line. Blank lines are skipped
# so a stray empty line does not crash json.loads.
with open(valid_path, encoding="utf-8") as f:
    valid_examples = [json.loads(line) for line in f if line.strip()]

if not valid_examples:
    raise ValueError(f"No examples found in {valid_path}")

# Seeded shuffle so the subset is a reproducible random sample (a plain
# shuffle, not a stratified one; the distribution is printed below).
# MUST match main notebook: seed=42, EVAL_N=400
random.seed(42)
random.shuffle(valid_examples)

# Cap at the number of examples actually available so later progress
# messages and "/EVAL_N" denominators reflect the real subset size.
EVAL_N = min(400, len(valid_examples))
eval_subset = valid_examples[:EVAL_N]
if EVAL_N < 400:
    print(f"WARNING: only {EVAL_N} examples available (expected 400).")

print(f"Will evaluate on {EVAL_N} examples.")

# Every top-level key other than "messages" is reported as oracle metadata.
sample = eval_subset[0]
oracle_fields = [k for k in sample if k != "messages"]
print(f"Oracle metadata fields: {oracle_fields}")

# Verify distribution
diff_dist = Counter(ex.get("difficulty", "unknown") for ex in eval_subset)
deg_dist = Counter("degraded" if ex.get("evidence_degraded") else "strong" for ex in eval_subset)
print(f"Difficulty: {dict(diff_dist)}")
print(f"Labels: {dict(deg_dist)}")


In [None]:
import gc

# Run the base model over the eval subset and parse each response.
base_results = []
# Use the real subset size so the progress counter and its final line stay
# correct even if fewer than EVAL_N examples were available.
n_total = len(eval_subset)
print(f"Generating {n_total} base model responses...")
for i, ex in enumerate(eval_subset, start=1):
    # NOTE(review): assumes the first message is the user prompt; confirm
    # against the dataset schema.
    prompt = ex["messages"][0]["content"]
    response = generate_response(model, tokenizer, prompt)
    base_results.append(
        {
            "response": response,
            "pivot": extract_pivot(response),
            # Loose flag: explicit verdict or any degraded-evidence keyword.
            "flagged_degraded": check_degraded_flag(response),
            # Strict flag: only the explicit "Evidence assessment: DEGRADED" verdict.
            "flagged_degraded_strict": bool(re.search(r"Evidence assessment:\s*DEGRADED", response, re.IGNORECASE)),
            "format_ok": check_format_adherence(response),
        }
    )
    if i % 20 == 0 or i == n_total:
        print(f"  [{i}/{n_total}] done")

print("Base model eval complete.")


In [None]:
# Save raw responses for re-parsing without rerunning inference
with open("/content/base_raw_responses.json", "w") as f:
    json.dump([{"idx": i, "response": r["response"]} for i, r in enumerate(base_results)], f)
print("Raw responses saved to /content/base_raw_responses.json")

# Parsed results (response plus extracted pivot and flags) as a second file.
with open("/content/base_results_backup.json", "w") as f:
    json.dump(base_results, f)
print(f"Parsed results saved to /content/base_results_backup.json ({len(base_results)} examples)")

# Quick stats. Denominators use the number of results actually produced so
# the ratios stay correct when fewer than EVAL_N examples were evaluated.
n_total = len(base_results)
n_unknown = sum(1 for r in base_results if r["pivot"] == "UNKNOWN")
n_flagged = sum(1 for r in base_results if r["flagged_degraded"])
n_flagged_strict = sum(1 for r in base_results if r["flagged_degraded_strict"])
n_format = sum(1 for r in base_results if r["format_ok"])
print("\nQuick stats:")
print(f"  UNKNOWN pivots: {n_unknown}/{n_total}")
print(f"  Flagged degraded (loose): {n_flagged}/{n_total}")
print(f"  Flagged degraded (strict): {n_flagged_strict}/{n_total}")
print(f"  Format adherence: {n_format}/{n_total}")


In [None]:
from google.colab import files
# Trigger a browser download for each saved artifact, raw responses first.
for artifact_path in (
    "/content/base_raw_responses.json",
    "/content/base_results_backup.json",
):
    files.download(artifact_path)
print("Download complete. Upload these to the main notebook after training finishes.")
