# MirageBench Blackbox BF16 5-Model Sweep (Colab)

This notebook runs the MirageBench blackbox pipeline on the same eval set used for the existing Qwen blackbox run, using native bf16 model loading (no quantization).

Models:
- Llama 3.1 8B Instruct
- Mistral 7B Instruct v0.3
- Gemma 2 9B IT
- Phi-3 Medium 14B Instruct
- Qwen 2.5 14B Instruct

Outputs:
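- One per-model CSV (saved to Google Drive immediately after each model finishes; note that the output folder and the merged/summary file names carry a `7model` tag even though this sweep covers five models)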
- One merged CSV with a `model_name` column
- One summary table grouped by `model_name` and `compression_level` with metric means


In [None]:
# 1) Install dependencies
# Uses the notebook shell escape (`!`), so this cell runs only inside an
# IPython/Colab kernel, not as plain Python. transformers is pinned to
# 4.46.3; the other packages are installed without a version pin.
!pip -q install transformers==4.46.3 accelerate sentence-transformers scikit-learn pandas tqdm
print('Dependencies installed.')


In [None]:
# 2) Mount Google Drive + prepare repo path
from google.colab import drive
import os
import sys
import subprocess
from pathlib import Path

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Clone the mirage repo only when no checkout exists at REPO_DIR yet.
REPO_DIR = Path('/content/mirage')
if REPO_DIR.exists():
    print(f'Repo already present at {REPO_DIR}')
else:
    print('Cloning mirage repo...')
    clone_cmd = ['git', 'clone', 'https://github.com/jack-chaudier/mirage.git', str(REPO_DIR)]
    subprocess.run(clone_cmd, check=True)

# Prepend both import roots; the one inserted last ends up first on sys.path.
for import_root in (REPO_DIR, REPO_DIR / 'endogenous_context_theory'):
    sys.path.insert(0, str(import_root))

print('Python paths configured.')


In [None]:
# 3) Imports + reproducibility
import gc
import json
import random
import re
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from endogenous_context_theory.scripts.run_miragebench_ollama import (
    _load_notebook_runtime,
    _patch_runtime_with_methodology_fixes,
    _validate_investment_ground_truth,
)

SEED = 42

# Seed the Python, NumPy and torch generators with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

cuda_ready = torch.cuda.is_available()
if cuda_ready:
    # Additionally seed every visible CUDA device.
    torch.cuda.manual_seed_all(SEED)

print(f'Torch: {torch.__version__}')
print('CUDA available:', cuda_ready)
if cuda_ready:
    print('GPU:', torch.cuda.get_device_name(0))


In [None]:
# 4) Load MirageBench runtime exactly as used in Qwen blackbox pipeline
ROOT = REPO_DIR / 'endogenous_context_theory'
NB_PATH = ROOT.joinpath('notebooks', 'legacy', 'miragebench_experiments_colab.ipynb')

# Load the runtime from the legacy notebook and patch it before it is used
# to build any tasks.
runtime = _load_notebook_runtime(NB_PATH)
_patch_runtime_with_methodology_fixes(runtime)

# Build the task set and run the ground-truth validation helper on it.
tasks = runtime['build_miragebench_v01']()
_validate_investment_ground_truth(tasks)

compression_levels = [0.4, 0.5, 0.6]  # same as blackbox run defaults

# Bind the runtime callables used by the evaluation loop to module-level names.
make_prompt, render_compressed_variant, raw_validity_score, semantic_regret = (
    runtime[entry]
    for entry in (
        'make_prompt',
        'render_compressed_variant',
        'raw_validity_score',
        'semantic_regret',
    )
)

task_ids = [t.task_id for t in tasks]
print(f'Loaded {len(tasks)} MirageBench tasks.')
print('Task IDs:', task_ids)
print('Compression levels:', compression_levels)


In [None]:
# 5) Model registry + output paths
# Display name -> Hugging Face model id. Insertion order is the run order.
MODELS = {
    'Llama-3.1-8B-Instruct': 'meta-llama/Llama-3.1-8B-Instruct',
    'Mistral-7B-Instruct-v0.3': 'mistralai/Mistral-7B-Instruct-v0.3',
    'Gemma-2-9B-IT': 'google/gemma-2-9b-it',
    'Phi-3-Medium-14B-Instruct': 'microsoft/Phi-3-medium-128k-instruct',
    'Qwen-2.5-14B-Instruct': 'Qwen/Qwen2.5-14B-Instruct',
}

# Generation budget per answer, and whether a model with an existing
# per-model CSV on Drive is skipped instead of re-evaluated.
MAX_NEW_TOKENS = 220
SKIP_IF_MODEL_CSV_EXISTS = True

DRIVE_OUT_DIR = Path('/content/drive/MyDrive/miragebench_blackbox_bf16_7model')
LOCAL_OUT_DIR = Path('/content/miragebench_blackbox_bf16_7model')

# Create both output directories up front (no error if they already exist).
for out_dir in (DRIVE_OUT_DIR, LOCAL_OUT_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

print('Drive output:', DRIVE_OUT_DIR)
print('Local output:', LOCAL_OUT_DIR)


In [None]:
# 6) Helpers: extraction, feasibility, generation, evaluation loop
PIVOT_PRIMARY_RE = re.compile(r'PIVOT_ID\s*=\s*([A-Z]\d{1,4}-E\d{3})')
PIVOT_FALLBACK_RE = re.compile(r'([A-Z]\d{1,4}-E\d{3})')


def extract_pivot_id(text: str, fallback_candidates: List[str] | None = None) -> str:
    if not text:
        return ''
    m = PIVOT_PRIMARY_RE.search(text)
    if m:
        return m.group(1)
    markers = PIVOT_FALLBACK_RE.findall(text)
    if markers and fallback_candidates:
        for c in fallback_candidates:
            if c in markers:
                return c
    return markers[0] if markers else ''


def compute_fixed_pivot_feasible(task: Any, full_pivot: str, compressed_context: str) -> bool:
    """Report whether the full-context pivot is still usable after compression.

    The pivot counts as feasible when ``full_pivot`` is non-empty, appears
    verbatim in ``compressed_context``, and every marker listed for it under
    ``task.metadata['candidate_requirements']`` also appears there. A task
    whose metadata is not a dict, or that lists nothing for the pivot, imposes
    no extra requirements.
    """
    if not full_pivot:
        return False

    # Requirements are looked up before the presence check, as in the
    # straightforward formulation, so malformed metadata fails the same way.
    required_markers = []
    if isinstance(task.metadata, dict):
        required_markers = task.metadata.get('candidate_requirements', {}).get(full_pivot, [])

    if full_pivot not in compressed_context:
        return False

    for marker in required_markers:
        if marker not in compressed_context:
            return False
    return True


def format_chat_prompt(tokenizer, prompt: str) -> str:
    """Wrap ``prompt`` as a single user turn using the tokenizer's chat template.

    The prompt is returned unchanged when the tokenizer has no
    ``apply_chat_template`` method or its ``chat_template`` is empty/None.
    Otherwise the rendered (untokenized) text is returned, with the
    generation prompt appended.
    """
    template_available = hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template
    if not template_available:
        return prompt

    conversation = [{'role': 'user', 'content': prompt}]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )


def generate_answer(model, tokenizer, prompt: str, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
    """Greedily generate a completion for ``prompt`` and return only the new text.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer; used to format, encode and decode.
        prompt: Raw task prompt (before any chat formatting).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    input_text = format_chat_prompt(tokenizer, prompt)

    # A rendered chat template already contains whatever special tokens the
    # model expects (e.g. a leading BOS). Tokenizing that text with the default
    # add_special_tokens=True would insert them a second time, so they are only
    # added automatically when the prompt came back untouched (no template).
    template_applied = input_text != prompt
    inputs = tokenizer(
        input_text,
        return_tensors='pt',
        add_special_tokens=not template_applied,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # deterministic greedy decoding
            pad_token_id=tokenizer.pad_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs['input_ids'].shape[1]
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


def load_bf16_model(model_id: str):
    """Load ``model_id`` in bfloat16 and return ``(model, tokenizer)``.

    The tokenizer falls back to its EOS token for padding when it defines no
    pad token. The model is loaded with ``device_map='auto'`` and switched to
    eval mode. Both loaders are called with ``trust_remote_code=True``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    missing_pad = tokenizer.pad_token is None
    if missing_pad:
        # Reuse EOS as the padding token so generate() has a pad id to use.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    load_options = {
        'device_map': 'auto',
        'torch_dtype': torch.bfloat16,
        'trust_remote_code': True,
    }
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)
    model.eval()
    return model, tokenizer



def evaluate_model(model_name: str, model_id: str) -> pd.DataFrame:
    """Run every task at every compression level for one model.

    For each task the model first answers with the full context; then, for
    each entry of ``compression_levels``, it answers with the compressed
    context. One row is recorded per (task, compression level) pair,
    comparing the compressed answer against the full-context answer.

    Args:
        model_name: Display name stored in the ``model_name`` column.
        model_id: Identifier passed to ``load_bf16_model``.

    Returns:
        DataFrame with one row per (task, compression level).

    The model and tokenizer references are dropped, and the CUDA cache is
    emptied when CUDA is available, whether or not evaluation succeeds.
    """
    print(f'\n===== Evaluating {model_name} ({model_id}) =====')

    model, tokenizer = load_bf16_model(model_id)
    records: List[Dict[str, Any]] = []

    try:
        for task in tqdm(tasks, desc=f'{model_name} tasks'):
            # Baseline pass with the uncompressed context.
            baseline_prompt = make_prompt(task.full_context, task.question)
            full_answer = generate_answer(model, tokenizer, baseline_prompt)
            full_pivot = extract_pivot_id(full_answer, [task.pivot_ground_truth, task.decoy_pivot])
            baseline_validity = raw_validity_score(full_answer, task)

            for level in compression_levels:
                context = render_compressed_variant(task, drop_fraction=level, seed=SEED)
                answer = generate_answer(model, tokenizer, make_prompt(context, task.question))
                pivot = extract_pivot_id(answer, [task.pivot_ground_truth, task.decoy_pivot])
                validity = raw_validity_score(answer, task)

                # Preserved only when both pivots were extracted and agree.
                same_pivot = bool(full_pivot and pivot and full_pivot == pivot)
                feasible = compute_fixed_pivot_feasible(task, full_pivot, context)
                regret = semantic_regret(full_answer, answer)

                # Key order below is the column order of the saved CSV.
                records.append({
                    'model_name': model_name,
                    'model_id': model_id,
                    'task_id': task.task_id,
                    'category': task.category,
                    'compression_level': float(level),
                    'full_pivot': full_pivot,
                    'pivot_id_extracted': pivot,
                    'pivot_extracted_flag': int(bool(pivot)),
                    'pivot_preserved': int(same_pivot),
                    'fixed_pivot_feasible': int(feasible),
                    'raw_validity': float(validity),
                    'raw_validity_full': float(baseline_validity),
                    'raw_validity_compressed': float(validity),
                    'semantic_regret': float(regret),
                    'full_pivot_matches_ground_truth': int(full_pivot == task.pivot_ground_truth),
                    'pivot_matches_ground_truth': int(pivot == task.pivot_ground_truth),
                    'full_answer': full_answer,
                    'compressed_answer': answer,
                })

        return pd.DataFrame(records)

    finally:
        # Drop the local references and release cached GPU memory before the
        # caller moves on.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


In [None]:
# 7) Run all models with per-model checkpoint saves to Google Drive
all_frames: List[pd.DataFrame] = []

# Characters replaced by '_' when turning a model name into a file slug.
slug_table = str.maketrans('/ .-', '____')

for model_name, model_id in MODELS.items():
    model_slug = model_name.lower().translate(slug_table)
    csv_name = f'{model_slug}_results.csv'
    drive_csv = DRIVE_OUT_DIR / csv_name
    local_csv = LOCAL_OUT_DIR / csv_name

    if SKIP_IF_MODEL_CSV_EXISTS and drive_csv.exists():
        # Resume path: reuse the results already checkpointed on Drive.
        print(f'Skipping {model_name}; found existing checkpoint: {drive_csv}')
        df_model = pd.read_csv(drive_csv)
    else:
        df_model = evaluate_model(model_name, model_id)
        # Checkpoint right away: local copy first, then Drive.
        for destination in (local_csv, drive_csv):
            df_model.to_csv(destination, index=False)
        print(f'Saved checkpoint CSV for {model_name}')
        print(f'  Local: {local_csv}')
        print(f'  Drive: {drive_csv}')

    all_frames.append(df_model)

if not all_frames:
    raise RuntimeError('No model outputs available.')

# Stack the per-model frames and write the merged table to both locations.
merged = pd.concat(all_frames, ignore_index=True)
merged_name = 'miragebench_bf16_7model_merged.csv'
merged_local = LOCAL_OUT_DIR / merged_name
merged_drive = DRIVE_OUT_DIR / merged_name
for destination in (merged_local, merged_drive):
    merged.to_csv(destination, index=False)

print('\nMerged CSV saved:')
print('  Local:', merged_local)
print('  Drive:', merged_drive)
print('Rows:', len(merged))


In [None]:
# 8) Summary table: grouped by model and compression level (means of 5 metrics)
# Metrics averaged per (model, compression level) group.
summary_metrics = [
    'raw_validity',
    'pivot_extracted_flag',
    'pivot_preserved',
    'fixed_pivot_feasible',
    'semantic_regret',
]
group_keys = ['model_name', 'compression_level']

group_means = merged.groupby(group_keys, as_index=False)[summary_metrics].mean()
summary = group_means.sort_values(group_keys)

# Write the summary next to the other outputs, locally and on Drive.
summary_name = 'miragebench_bf16_7model_summary_by_model_compression.csv'
summary_local = LOCAL_OUT_DIR / summary_name
summary_drive = DRIVE_OUT_DIR / summary_name
for destination in (summary_local, summary_drive):
    summary.to_csv(destination, index=False)

print('Summary CSV saved:')
print('  Local:', summary_local)
print('  Drive:', summary_drive)

# Trailing expression so the notebook renders the table as the cell output.
summary


## Notes

- This notebook uses the same MirageBench task generation path as the existing Qwen blackbox pipeline: it loads the runtime cells from `notebooks/legacy/miragebench_experiments_colab.ipynb` (under `endogenous_context_theory/`) and then applies `_patch_runtime_with_methodology_fixes`.
- Models are loaded in native bf16 (`torch_dtype=torch.bfloat16`) with `device_map="auto"`; no bitsandbytes quantization is used in this notebook.
- Per-model CSVs are written to Google Drive immediately after each model finishes.
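- If Colab disconnects, rerun the notebook from the top and keep `SKIP_IF_MODEL_CSV_EXISTS=True`: any model whose per-model CSV already exists on Google Drive is skipped and its saved results are reloaded, so only the unfinished models are evaluated again.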
