# Mirage-Resistant Finetune + Eval (H100 Overnight)

End-to-end Colab pipeline with three phases:
1. Mine base-model failures into SFT + DPO preference data.
2. LoRA fine-tune `meta-llama/Llama-3.1-8B-Instruct`.
3. Evaluate on harder held-out MirageBench-style tasks.

All generation is greedy (`do_sample=False`, `max_new_tokens=512`). Datasets, adapters, and evaluation results are saved to Google Drive.

In [None]:
# Cell 1 — Install dependencies
# `!` runs pip as a notebook shell command. `-U` upgrades each package to the newest
# release that satisfies its version floor, so the installed versions may be newer
# than the floors listed here.
!pip -q install -U "transformers>=4.46.3" "peft>=0.13.2" "trl>=0.12.2" datasets accelerate sentencepiece scipy scikit-learn
print('Dependencies installed.')

In [None]:
# Cell 2 — Mount Drive and configure repo path
from google.colab import drive
import subprocess
import sys
from pathlib import Path

# Drive must be mounted before the zipped-repo fallback can be read.
drive.mount('/content/drive')

REPO_DIR = Path('/content/mirage')
ZIP_FALLBACK = Path('/content/drive/MyDrive/mirage_repo_for_colab_clean.zip')

# Unpack the archive only when the repo is absent and the archive is available.
repo_missing = not REPO_DIR.exists()
if repo_missing and ZIP_FALLBACK.exists():
    print(f'Unzipping repo from {ZIP_FALLBACK} ...')
    unzip_cmd = ['unzip', '-q', str(ZIP_FALLBACK), '-d', '/content']
    subprocess.run(unzip_cmd, check=True)

# Re-check after the optional unzip; fail loudly if there is still no repo.
if not REPO_DIR.exists():
    raise FileNotFoundError(
        'Repo not found at /content/mirage. Upload/unzip the repo there, then re-run this cell.'
    )

# Each root is prepended, so the last one listed ends up first on sys.path.
for import_root in (REPO_DIR, REPO_DIR / 'endogenous_context_theory'):
    root_str = str(import_root)
    if root_str not in sys.path:
        sys.path.insert(0, root_str)

print('Repo ready at', REPO_DIR)
print('Python paths configured.')

In [None]:
# Cell 3 — Imports, seeds, GPU sanity
import gc
import json
import os
import random
import re
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

SEED = 42

# Seed the Python, NumPy and Torch RNGs with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

cuda_ok = torch.cuda.is_available()
if cuda_ok:
    torch.cuda.manual_seed_all(SEED)

# Environment report: library version, CUDA visibility, and device 0 details.
print(f'Torch: {torch.__version__}')
print('CUDA available:', cuda_ok)
if cuda_ok:
    print('GPU:', torch.cuda.get_device_name(0))
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU memory: {total_mem_gb:.1f} GB')


In [None]:
# Cell 4 — Load MirageBench runtime + tasks from JSON
from endogenous_context_theory.scripts.run_miragebench_ollama import (
    _load_notebook_runtime,
    _patch_runtime_with_methodology_fixes,
    _validate_investment_ground_truth,
)

# Locations of the legacy notebook (source of the runtime) and the released task file.
ROOT = REPO_DIR / 'endogenous_context_theory'
NB_PATH = ROOT / 'notebooks' / 'legacy' / 'miragebench_experiments_colab.ipynb'
TASKS_PATH = ROOT / 'release' / 'miragebench_tasks' / 'miragebench_v01_tasks.json'

# Load the notebook runtime, then apply the methodology patches in place.
runtime = _load_notebook_runtime(NB_PATH)
_patch_runtime_with_methodology_fixes(runtime)

# Runtime symbols reused by later cells.
MirageBenchTask = runtime['MirageBenchTask']
make_prompt = runtime['make_prompt']
render_compressed_variant = runtime['render_compressed_variant']
raw_validity_score = runtime['raw_validity_score']
semantic_regret = runtime['semantic_regret']

# JSON is UTF-8 by specification; state the encoding instead of relying on the locale default.
with open(TASKS_PATH, 'r', encoding='utf-8') as f:
    task_dicts = json.load(f)

tasks = [MirageBenchTask(**row) for row in task_dicts]
_validate_investment_ground_truth(tasks)

# Experiment-wide settings shared by the mining, training and evaluation cells.
MODEL_ID = 'meta-llama/Llama-3.1-8B-Instruct'
RETENTION_LEVELS = [1.0, 0.7, 0.5, 0.4, 0.3]
MAX_NEW_TOKENS = 512

# Output locations: one local working directory plus three Drive directories.
PHASE1_OUT = Path('/content/mirage_finetune_data')
PHASE1_DRIVE_OUT = Path('/content/drive/MyDrive/mirage_finetune_data')
ADAPTER_OUT_DIR = Path('/content/drive/MyDrive/mirage_resistant_adapter')
EVAL_OUT_DIR = Path('/content/drive/MyDrive/mirage_resistant_eval')

for p in [PHASE1_OUT, PHASE1_DRIVE_OUT, ADAPTER_OUT_DIR, EVAL_OUT_DIR]:
    p.mkdir(parents=True, exist_ok=True)

print(f'Loaded {len(tasks)} tasks from {TASKS_PATH}')
print('Task IDs:', [t.task_id for t in tasks])
print('Retention levels:', RETENTION_LEVELS)


In [None]:
# Cell 5 — Load base model + shared helper functions
# Preferred form: an explicit `PIVOT_ID=<marker>` header (whitespace allowed around `=`);
# group 1 captures the marker.
PIVOT_PRIMARY_RE = re.compile(r'PIVOT_ID\s*=\s*([A-Z]{1,5}\d{1,4}-E\d{3})')
# Fallback: any marker-shaped token (1-5 capitals, 1-4 digits, `-E`, three digits).
PIVOT_FALLBACK_RE = re.compile(r'([A-Z]{1,5}\d{1,4}-E\d{3})')


def gpu_mem_report(label: str) -> None:
    """Print allocated and reserved CUDA memory in GB, tagged with *label*.

    Prints nothing when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        bytes_per_gb = 1e9
        allocated = torch.cuda.memory_allocated() / bytes_per_gb
        reserved = torch.cuda.memory_reserved() / bytes_per_gb
        print(f'[{label}] GPU allocated={allocated:.2f} GB, reserved={reserved:.2f} GB')


def extract_pivot_id(text: str, fallback_candidates: Optional[List[str]] = None) -> str:
    """Extract a pivot marker from a model answer.

    Resolution order:
      1. the marker in an explicit ``PIVOT_ID=...`` header;
      2. the first entry of *fallback_candidates* that appears among the
         marker-shaped tokens in *text*;
      3. the first marker-shaped token in *text*.

    Returns an empty string when *text* is empty or holds no marker.
    """
    if not text:
        return ''

    header_match = PIVOT_PRIMARY_RE.search(text)
    if header_match is not None:
        return header_match.group(1)

    found = PIVOT_FALLBACK_RE.findall(text)
    if not found:
        return ''

    # Candidate order sets the priority, not the order of appearance in the text.
    for candidate in fallback_candidates or ():
        if candidate in found:
            return candidate
    return found[0]


def compute_fixed_pivot_feasible(task: Any, full_pivot: str, context_text: str) -> bool:
    """Report whether *full_pivot* and all its required markers appear in *context_text*.

    Requirements are read from ``task.metadata['candidate_requirements']``. A
    non-dict metadata, or a pivot with no entry, counts as having no
    requirements. An empty *full_pivot* is never feasible.
    """
    if not full_pivot:
        return False

    requirements = task.metadata.get('candidate_requirements', {}) if isinstance(task.metadata, dict) else {}
    needed = requirements.get(full_pivot, [])

    if full_pivot not in context_text:
        return False
    for marker in needed:
        if marker not in context_text:
            return False
    return True


def format_chat_prompt(tokenizer, prompt: str) -> str:
    """Wrap *prompt* as a single user turn using the tokenizer's chat template.

    Returns the templated string (not token ids) with the generation prompt
    appended. When the tokenizer has no ``apply_chat_template`` or its
    ``chat_template`` is falsy, *prompt* is returned unchanged.
    """
    can_template = hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template
    if not can_template:
        return prompt
    conversation = [{'role': 'user', 'content': prompt}]
    return tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)


def generate_response(model, tokenizer, prompt: str, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
    """Greedy-decode a completion for *prompt* and return only the new text.

    The prompt is chat-formatted via ``format_chat_prompt``, tokenized, and moved
    to ``model.device``. Decoding is deterministic (``do_sample=False``). The
    prompt tokens are sliced off the output before decoding, and the result is
    stripped of surrounding whitespace.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer.
        prompt: Raw user prompt text.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded continuation with special tokens removed.
    """
    input_text = format_chat_prompt(tokenizer, prompt)

    # A chat template renders its own special tokens (including BOS) into the text.
    # Tokenizing that text with the default add_special_tokens=True would prepend a
    # second BOS, so only let the tokenizer add them when the text lacks a BOS prefix.
    bos_token = getattr(tokenizer, 'bos_token', None)
    already_has_bos = bool(bos_token) and input_text.startswith(bos_token)
    inputs = tokenizer(
        input_text,
        return_tensors='pt',
        add_special_tokens=not already_has_bos,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs['input_ids'].shape[1]
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


def get_record_line(task: Any, marker: str) -> str:
    """Return the ``line`` of the first record in ``task.metadata['records']`` whose marker matches.

    Returns an empty string when no record matches or the match has no ``line``.
    """
    records = task.metadata.get('records', [])
    hit = next((rec for rec in records if rec.get('marker') == marker), None)
    return '' if hit is None else hit.get('line', '')


def summarize_pivot_score(task: Any, marker: str) -> str:
    """Describe the score field found on the record line for *marker*.

    Tries ``Composite``, then ``CumulativeReturn``, then ``ConsequenceScore`` and
    returns the first one present as ``Name=value``. Falls back to
    ``'dominant composite score'`` when the record line is empty and to
    ``'dominant score'`` when no known field is found.
    """
    line = get_record_line(task, marker)
    if not line:
        return 'dominant composite score'

    # Priority-ordered (pattern, renderer) pairs; the first match wins.
    extractors = (
        (r'Composite=([0-9]+(?:\.[0-9]+)?)', lambda value: f'Composite={value}'),
        (r'CumulativeReturn=([+-]?[0-9]+(?:\.[0-9]+)?)%', lambda value: f'CumulativeReturn={value}%'),
        (r'ConsequenceScore=([0-9]+(?:\.[0-9]+)?)', lambda value: f'ConsequenceScore={value}'),
    )
    for pattern, render in extractors:
        found = re.search(pattern, line)
        if found:
            return render(found.group(1))

    return 'dominant score'


def build_chosen_completion(task: Any, correct_pivot: str, prereqs: List[str], missing_prereqs: List[str]) -> str:
    """Compose the synthetic "chosen" completion used as the SFT/DPO target.

    The text holds the ``PIVOT_ID=`` header, a sentence citing the pivot's score,
    and a sentence listing the prerequisite chain. When *missing_prereqs* is
    non-empty, a ``[PREREQUISITE_GAP: ...]`` line naming them is appended.
    Paragraphs are joined with blank lines.
    """
    score_text = summarize_pivot_score(task, correct_pivot)

    if prereqs:
        prereq_chain = ' -> '.join(prereqs + [correct_pivot])
    else:
        prereq_chain = correct_pivot

    paragraphs = [
        f'PIVOT_ID={correct_pivot}',
        f'The correct pivot is {correct_pivot} because it has the highest {score_text} in the full timeline for this task.',
        f'The prerequisite chain supporting this pivot is {prereq_chain}, which preserves the endogenous turning-point semantics.',
    ]

    if missing_prereqs:
        missing_fmt = ', '.join(missing_prereqs)
        gap_line = f'[PREREQUISITE_GAP: events {{{missing_fmt}}} not present in compressed context. Confidence degraded.]'
        paragraphs.append(gap_line)

    return '\n\n'.join(paragraphs)


print(f'Loading base model: {MODEL_ID}')
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Reuse EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id

base_load_kwargs = {
    'device_map': 'auto',
    'torch_dtype': torch.bfloat16,
    'trust_remote_code': True,
}
base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **base_load_kwargs)
base_model.eval()

gpu_mem_report('after base model load')

In [None]:
# Cell 6 — Phase 1: mine negatives and build SFT/DPO dataset
rows: List[Dict[str, Any]] = []
by_level = defaultdict(lambda: {'pairs': 0, 'sft_only': 0, 'total': 0})

for task in tqdm(tasks, desc='Mining base failures (task loop)'):
    correct_pivot = task.pivot_ground_truth
    req_map = task.metadata.get('candidate_requirements', {})
    correct_reqs = list(req_map.get(correct_pivot, []))

    for retention in tqdm(RETENTION_LEVELS, leave=False, desc=f'{task.task_id} retentions'):
        drop_fraction = max(0.0, min(1.0, 1.0 - retention))
        # Full retention uses the untouched context; anything lower is re-rendered.
        if retention >= 0.999:
            context_text = task.full_context
        else:
            context_text = render_compressed_variant(task, drop_fraction=drop_fraction, seed=SEED)

        prompt = make_prompt(context_text, task.question)
        base_completion = generate_response(base_model, tokenizer, prompt)
        base_pivot = extract_pivot_id(base_completion, [task.pivot_ground_truth, task.decoy_pivot])
        base_was_correct = bool(base_pivot == correct_pivot)

        # The chosen answer flags any prerequisite that did not survive compression.
        missing = [m for m in correct_reqs if m not in context_text]
        chosen = build_chosen_completion(task, correct_pivot, correct_reqs, missing)
        # A wrong base answer becomes the rejected side of a preference pair.
        rejected = None if base_was_correct else base_completion

        rows.append(
            {
                'prompt': prompt,
                'chosen': chosen,
                'rejected': rejected,
                'task_id': task.task_id,
                'compression': float(retention),
                'base_pivot': base_pivot,
                'correct_pivot': correct_pivot,
                'base_was_correct': int(base_was_correct),
            }
        )

        level_stats = by_level[retention]
        level_stats['total'] += 1
        level_stats['sft_only' if base_was_correct else 'pairs'] += 1

ordered_cols = [
    'prompt', 'chosen', 'rejected', 'task_id', 'compression',
    'base_pivot', 'correct_pivot', 'base_was_correct'
]
phase1_df = pd.DataFrame(rows)[ordered_cols]

# Preference pairs are the rows with a rejected completion; SFT uses every row.
has_rejected = phase1_df['rejected'].notna()
pair_df = phase1_df[has_rejected].copy()
sft_df = phase1_df.copy()

phase1_ds = Dataset.from_pandas(phase1_df, preserve_index=False)
pair_ds = Dataset.from_pandas(pair_df, preserve_index=False)
sft_ds = Dataset.from_pandas(sft_df[['prompt', 'chosen', 'task_id', 'compression']], preserve_index=False)

# Replace any previous local export, then mirror it to Drive.
bundle = DatasetDict({'all': phase1_ds, 'pairs': pair_ds, 'sft': sft_ds})
if PHASE1_OUT.exists():
    shutil.rmtree(PHASE1_OUT)
bundle.save_to_disk(str(PHASE1_OUT))

if PHASE1_DRIVE_OUT.exists():
    shutil.rmtree(PHASE1_DRIVE_OUT)
shutil.copytree(PHASE1_OUT, PHASE1_DRIVE_OUT)

# The CSV is written after the copy, so it is saved to both locations explicitly.
phase1_csv = PHASE1_OUT / 'phase1_examples.csv'
for csv_path in (phase1_csv, PHASE1_DRIVE_OUT / 'phase1_examples.csv'):
    phase1_df.to_csv(csv_path, index=False)

banner = '=' * 72
print(banner)
print('PHASE 1 SUMMARY')
print(banner)
print(f'Total examples: {len(phase1_df)}')
print(f'Preference pairs: {len(pair_df)}')
print(f'SFT-only examples: {len(phase1_df) - len(pair_df)}')
print('\nBy retention level:')
for lvl in RETENTION_LEVELS:
    stats = by_level[lvl]
    print(f'  retention={lvl:.1f}: total={stats["total"]}, pairs={stats["pairs"]}, sft_only={stats["sft_only"]}')

print(f'\nSaved HF dataset to: {PHASE1_OUT}')
print(f'Backed up to Drive:   {PHASE1_DRIVE_OUT}')

In [None]:
# Cell 7 — Phase 2: prepare LoRA + training datasets
from peft import LoraConfig, TaskType, get_peft_model

# Convert chosen completions into chat-formatted SFT text

def to_chat_text(prompt: str, completion: str) -> str:
    """Render one user/assistant exchange as a single training string.

    Uses the module-level tokenizer's chat template when it has one (without a
    trailing generation prompt). Otherwise falls back to a plain
    ``User:``/``Assistant:`` layout.
    """
    template_ready = hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template
    if not template_ready:
        return f'User:\n{prompt}\n\nAssistant:\n{completion}'

    turns = [
        {'role': 'user', 'content': prompt},
        {'role': 'assistant', 'content': completion},
    ]
    return tokenizer.apply_chat_template(turns, tokenize=False, add_generation_prompt=False)

# SFT trains on every example's chosen completion, rendered as chat text.
sft_train_df = phase1_df[['prompt', 'chosen']].copy()
sft_train_df['text'] = list(map(to_chat_text, sft_train_df['prompt'], sft_train_df['chosen']))
sft_train_dataset = Dataset.from_pandas(sft_train_df[['text']], preserve_index=False)

# DPO trains only on rows that carry a rejected completion.
dpo_train_df = pair_df[['prompt', 'chosen', 'rejected']].copy()
dpo_train_dataset = Dataset.from_pandas(dpo_train_df, preserve_index=False)

# Confirm the target suffixes exist in the model before wrapping it with LoRA.
lora_targets = ['q_proj', 'k_proj', 'v_proj', 'o_proj']
target_suffixes = tuple(lora_targets)
matched_modules = [name for name, _ in base_model.named_modules() if name.endswith(target_suffixes)]
print(f'Matched LoRA modules: {len(matched_modules)}')
if not matched_modules:
    raise RuntimeError('No LoRA target modules matched. Aborting to avoid null training run.')

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=lora_targets,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
)

ft_model = get_peft_model(base_model, lora_config)
ft_model.print_trainable_parameters()

gpu_mem_report('before training')
print(f'DPO candidate pairs: {len(dpo_train_dataset)}')
print(f'SFT samples: {len(sft_train_dataset)}')

In [None]:
# Cell 8 — Train (DPO if viable) then SFT (TRL-version compatible)
import inspect
from transformers import TrainingArguments
from trl import SFTTrainer

# Probe optional TRL exports; each missing symbol selects a fallback path below.
try:
    from trl import DPOTrainer
except Exception:
    HAS_DPO = False
else:
    HAS_DPO = True

try:
    from trl import SFTConfig
except Exception:
    SFTConfig = None

try:
    from trl import DPOConfig
except Exception:
    DPOConfig = None

# DPO only runs when the trainer is importable and enough preference pairs were mined.
DPO_MIN_PAIRS = 15
DPO_BETA = 0.1
WARMUP_STEPS = 10
RUN_DPO = HAS_DPO and (len(dpo_train_dataset) >= DPO_MIN_PAIRS)

print(f'HAS_DPO={HAS_DPO}, RUN_DPO={RUN_DPO}, pair_count={len(dpo_train_dataset)}')

# One checkpoint directory per training stage, under the adapter output directory.
DPO_OUT = ADAPTER_OUT_DIR / 'dpo_stage'
SFT_OUT = ADAPTER_OUT_DIR / 'sft_stage'
for stage_dir in (DPO_OUT, SFT_OUT):
    stage_dir.mkdir(parents=True, exist_ok=True)

dpo_ran = False
if not RUN_DPO:
    print('Skipping DPO stage; running SFT-only path.')
else:
    # Best-effort stage: any failure here is reported and training continues SFT-only.
    try:
        shared_dpo_args = dict(
            output_dir=str(DPO_OUT),
            num_train_epochs=1,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            lr_scheduler_type='cosine',
            warmup_steps=WARMUP_STEPS,
            bf16=True,
            logging_steps=5,
            save_strategy='epoch',
            save_total_limit=3,
            report_to='none',
        )
        legacy_dpo_api = DPOConfig is None
        if legacy_dpo_api:
            dpo_args = TrainingArguments(**shared_dpo_args)
        else:
            # With DPOConfig available, the DPO-specific knobs live on the config.
            dpo_args = DPOConfig(
                **shared_dpo_args,
                beta=DPO_BETA,
                max_length=2560,
                max_prompt_length=2048,
            )

        dpo_kwargs = {
            'model': ft_model,
            'ref_model': None,
            'args': dpo_args,
            'train_dataset': dpo_train_dataset,
        }

        # Pass only the keyword arguments this DPOTrainer's constructor declares.
        dpo_params = inspect.signature(DPOTrainer.__init__).parameters
        if legacy_dpo_api and 'beta' in dpo_params:
            dpo_kwargs['beta'] = DPO_BETA
        if 'tokenizer' in dpo_params:
            dpo_kwargs['tokenizer'] = tokenizer
        elif 'processing_class' in dpo_params:
            dpo_kwargs['processing_class'] = tokenizer
        if legacy_dpo_api and 'max_length' in dpo_params:
            dpo_kwargs['max_length'] = 2560
        if legacy_dpo_api and 'max_prompt_length' in dpo_params:
            dpo_kwargs['max_prompt_length'] = 2048

        dpo_trainer = DPOTrainer(**dpo_kwargs)
        dpo_trainer.train()
        ft_model = dpo_trainer.model
        dpo_ran = True
        print('DPO stage complete.')
    except Exception as exc:
        print(f'DPO stage failed; falling back to SFT-only. Error: {exc}')
        dpo_ran = False

# SFT stage. A single epoch follows a successful DPO stage; otherwise SFT runs for three.
sft_epochs = 1 if dpo_ran else 3
print(f'Running SFT for {sft_epochs} epoch(s).')

sft_max_len = 2560

# Arguments shared by SFTConfig, plain TrainingArguments, and the fallback Trainer.
shared_sft_args = dict(
    output_dir=str(SFT_OUT),
    num_train_epochs=sft_epochs,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_steps=WARMUP_STEPS,
    bf16=True,
    logging_steps=5,
    save_strategy='epoch',
    save_total_limit=5,
    report_to='none',
)

if SFTConfig is not None:
    # TRL renamed SFTConfig's `max_seq_length` to `max_length`. This construction
    # happens outside any try block, so pass whichever name the installed release
    # declares instead of crashing the cell on an unexpected keyword.
    sft_config_params = inspect.signature(SFTConfig.__init__).parameters
    sft_config_extras = {'dataset_text_field': 'text'}
    if 'max_length' in sft_config_params:
        sft_config_extras['max_length'] = sft_max_len
    elif 'max_seq_length' in sft_config_params:
        sft_config_extras['max_seq_length'] = sft_max_len
    sft_args = SFTConfig(**shared_sft_args, **sft_config_extras)
else:
    sft_args = TrainingArguments(**shared_sft_args)

sft_kwargs = {
    'model': ft_model,
    'args': sft_args,
    'train_dataset': sft_train_dataset,
}

# Pass only the keyword arguments this SFTTrainer's constructor declares.
sft_params = inspect.signature(SFTTrainer.__init__).parameters
if 'tokenizer' in sft_params:
    sft_kwargs['tokenizer'] = tokenizer
elif 'processing_class' in sft_params:
    sft_kwargs['processing_class'] = tokenizer

# Without SFTConfig, the text field and length limit go to the trainer itself.
if 'dataset_text_field' in sft_params and SFTConfig is None:
    sft_kwargs['dataset_text_field'] = 'text'
if 'max_seq_length' in sft_params and SFTConfig is None:
    sft_kwargs['max_seq_length'] = sft_max_len

try:
    sft_trainer = SFTTrainer(**sft_kwargs)
    sft_trainer.train()
    ft_model = sft_trainer.model
    print('SFT stage complete.')
except Exception as exc:
    # Last resort: tokenize the chat text ourselves and train with the plain Trainer.
    print(f'SFTTrainer API mismatch, falling back to transformers.Trainer. Error: {exc}')
    from transformers import Trainer, DataCollatorForLanguageModeling

    def _tok(batch):
        """Tokenize the `text` column, truncating to the SFT length limit."""
        return tokenizer(batch['text'], truncation=True, max_length=sft_max_len)

    tok_ds = sft_train_dataset.map(_tok, batched=True, remove_columns=['text'])

    def _with_labels(batch):
        """Set labels to the input ids for causal-LM training."""
        # NOTE(review): the causal-LM collator presumably rebuilds labels itself,
        # which would make this step redundant; confirm before removing.
        batch['labels'] = batch['input_ids']
        return batch

    tok_ds = tok_ds.map(_with_labels)
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    fallback_args = TrainingArguments(**shared_sft_args)

    fallback_trainer = Trainer(
        model=ft_model,
        args=fallback_args,
        train_dataset=tok_ds,
        data_collator=collator,
    )
    fallback_trainer.train()
    ft_model = fallback_trainer.model
    print('Fallback Trainer stage complete.')

In [None]:
# Cell 9 — Save adapter and checkpoint training artifacts
FINAL_ADAPTER_DIR = ADAPTER_OUT_DIR / 'final'
FINAL_ADAPTER_DIR.mkdir(parents=True, exist_ok=True)

# Save the model first, then the tokenizer, into the same directory.
final_dir_str = str(FINAL_ADAPTER_DIR)
for artifact in (ft_model, tokenizer):
    artifact.save_pretrained(final_dir_str)

# Save a compact training summary for reproducibility.
training_summary = dict(
    model_id=MODEL_ID,
    seed=SEED,
    retention_levels=RETENTION_LEVELS,
    phase1_examples=int(len(phase1_df)),
    phase1_preference_pairs=int(len(pair_df)),
    # Guarded lookup so this cell still works if the training cell never defined dpo_ran.
    dpo_ran=bool('dpo_ran' in globals() and dpo_ran),
)
summary_path = ADAPTER_OUT_DIR / 'training_summary.json'
with open(summary_path, 'w') as f:
    json.dump(training_summary, f, indent=2)

print(f'Adapter saved to: {FINAL_ADAPTER_DIR}')
gpu_mem_report('after training')

In [None]:
# Cell 10 — Phase 3: generate 4 hard held-out tasks
# Validate before indexing: looking the names up first would raise a bare KeyError
# and the descriptive RuntimeError below could never fire.
for fn_name in ['_render_context', '_compress_records_to_target', '_build_question', '_long_note']:
    if fn_name not in runtime:
        raise RuntimeError(
            f'Missing runtime function {fn_name}. Check miragebench_experiments_colab.ipynb exports.'
        )

# Task-construction helpers exported by the legacy notebook runtime.
_render_context = runtime['_render_context']
_compress_records_to_target = runtime['_compress_records_to_target']
_build_question = runtime['_build_question']
_long_note = runtime['_long_note']


def _choose_prereq_indices(rng: np.random.Generator, n_events: int, pivot_idx: int, count: int) -> List[int]:
    early_pool = np.arange(8, max(12, int(0.35 * n_events)))
    mid_pool = np.arange(max(12, int(0.35 * n_events)), max(13, pivot_idx - 2))

    early_n = max(2, count // 2)
    mid_n = count - early_n

    if len(early_pool) < early_n:
        early_n = max(1, len(early_pool))
        mid_n = count - early_n
    if len(mid_pool) < mid_n:
        mid_n = max(1, len(mid_pool))

    early_pick = rng.choice(early_pool, size=early_n, replace=False) if early_n > 0 else np.array([], dtype=int)
    mid_pick = rng.choice(mid_pool, size=mid_n, replace=False) if mid_n > 0 else np.array([], dtype=int)
    picks = sorted(set(int(x) for x in np.concatenate([early_pick, mid_pick])))
    return picks[:count]


def build_hard_task(category: str, task_num: int) -> Any:
    """Build one hard held-out task with a dominant pivot and near-max distractors.

    The timeline has 100-150 events. The pivot sits at roughly 70-82% of the
    timeline and carries the maximum score for its category. Three or four
    distractor candidates follow it, and the first of them is the "trap" decoy.
    The pivot and each distractor get their own sampled prerequisite events.

    Args:
        category: One of ``'incident'``, ``'investment'``, ``'narrative'`` or
            ``'hybrid'``. Hybrid uses the incident record layout with extra fields.
        task_num: Task number; it seeds the generator (``9000 + task_num``) and is
            part of the task id.

    Returns:
        A ``MirageBenchTask`` whose metadata includes the records, candidate
        requirements, and a default 50%-drop compressed context.

    Raises:
        KeyError: If *category* is not one of the four supported values.
    """
    rng = np.random.default_rng(9000 + task_num)
    n_events = int(rng.integers(100, 151))
    prereq_count = int(rng.integers(5, 8))
    distractor_count = int(rng.integers(3, 5))

    task_prefix = {'incident': 'HXI', 'investment': 'HXV', 'narrative': 'HXN', 'hybrid': 'HXH'}[category]
    task_id = f'{task_prefix}{task_num:02d}'

    pivot_idx = int(rng.integers(int(0.70 * n_events), int(0.82 * n_events)))
    distractor_indices = sorted(
        int(x) for x in rng.choice(np.arange(pivot_idx + 3, n_events - 2), size=distractor_count, replace=False)
    )
    trap_idx = distractor_indices[0]

    pivot_prereq_idx = _choose_prereq_indices(rng, n_events, pivot_idx, prereq_count)
    trap_prereq_idx = _choose_prereq_indices(rng, n_events, trap_idx, prereq_count)

    other_distractor_prereqs: Dict[int, List[int]] = {}
    for d_idx in distractor_indices[1:]:
        d_count = max(3, prereq_count - 2)
        other_distractor_prereqs[d_idx] = _choose_prereq_indices(rng, n_events, d_idx, d_count)

    # Every distractor sits after the pivot, so its prerequisite pool (which ends at
    # `candidate_idx - 2`) covers the pivot index and any earlier distractor. A sampled
    # collision would make the role chain below label the pivot (or a distractor) as
    # `decoy_setup` with a low score, invalidating the ground truth. Filtering after
    # sampling leaves the RNG stream of collision-free tasks unchanged.
    candidate_idx = {pivot_idx, *distractor_indices}
    trap_prereq_idx = [i for i in trap_prereq_idx if i not in candidate_idx]
    other_distractor_prereqs = {
        d_idx: [i for i in idxs if i not in candidate_idx]
        for d_idx, idxs in other_distractor_prereqs.items()
    }

    records: List[Dict[str, Any]] = []

    if category in {'incident', 'hybrid'}:
        services = ['auth', 'ledger', 'cache', 'queue', 'api-gateway', 'billing', 'search', 'ingest']
        for i in range(n_events):
            marker = f'{task_id}-E{i+1:03d}'
            ts = f'2026-08-{(i % 27) + 1:02d} {7 + (i % 12):02d}:{(i * 5) % 60:02d}'
            service = services[i % len(services)]

            # Routine defaults; the role chain below overrides them for special events.
            role = 'routine'
            sev = int(rng.integers(2, 6))
            scope = int(rng.integers(2, 7))
            event_type = 'Routine signal drift'

            if i in pivot_prereq_idx:
                role = 'setup'
                sev = int(rng.integers(1, 4))
                scope = int(rng.integers(1, 4))
                event_type = 'Prerequisite latent dependency exposure'
            elif i in trap_prereq_idx:
                role = 'decoy_setup'
                sev = int(rng.integers(2, 5))
                scope = int(rng.integers(2, 5))
                event_type = 'Trap prerequisite fully observable'
            elif any(i in v for v in other_distractor_prereqs.values()):
                role = 'decoy_setup'
                sev = int(rng.integers(2, 5))
                scope = int(rng.integers(2, 5))
                event_type = 'Distractor precursor'
            elif i == pivot_idx:
                role = 'pivot'
                sev = 11
                scope = 10
                event_type = 'Primary cascade root-cause event'
            elif i in distractor_indices:
                role = 'candidate'
                target_scores = [97, 95, 93, 91]
                score = target_scores[distractor_indices.index(i)]
                sev = 11
                # With sev=11, `score // sev` is 8 for all four targets, so every
                # distractor ends up with Composite=88 (below the pivot's 110).
                scope = max(8, score // sev)
                event_type = 'Competing high-salience distractor event'

            composite = sev * scope
            note_role = 'pivot' if role == 'pivot' else 'decoy' if role in {'candidate', 'decoy_setup'} else role
            note = _long_note(rng, note_role if note_role in {'setup', 'pivot', 'decoy', 'routine'} else 'routine', 'incident operations')

            if category == 'hybrid':
                line = (
                    f'[{marker}] {ts} | Domain=infra+portfolio | Service={service} | Event={event_type} | '
                    f'Severity={sev} | Scope={scope} | Composite={composite} | LiquidityStress={int(rng.integers(1,10))} | {note}'
                )
            else:
                line = (
                    f'[{marker}] {ts} | Service={service} | Event={event_type} | '
                    f'Severity={sev} | Scope={scope} | Composite={composite} | {note}'
                )
            records.append({'marker': marker, 'role': role, 'line': line, 'composite': composite})

        preamble = (
            f'Hard {category.title()} Ledger {task_id}\n'
            'Long timeline with multiple near-max distractors. Preserve the dominant causal pivot, not the easiest supported decoy.'
        )
        rule_block = 'Rule reminder: pivot = global argmax Composite over the full timeline.'

    elif category == 'investment':
        positions = [
            'NorthRiver Utilities Carry',
            'Aurelia AI Semiconductor Basket',
            'Helios Grid Infrastructure',
            'BlueHarbor Treasury Arbitrage',
            'Cinder Logistics Credit',
            'Orchid Macro Vol Sleeve',
            'Sierra Transit Debt',
        ]
        pivot_position_name = 'Helios Grid Infrastructure'
        cumulative = {p: 0.0 for p in positions}
        # Set when the pivot event is emitted; later events are capped below it.
        pivot_ceiling = None

        for i in range(n_events):
            marker = f'{task_id}-E{i+1:03d}'
            wk = f'Week-{i+1:03d}'
            position = positions[i % len(positions)]

            role = 'routine'
            weekly = float(rng.normal(0.7, 0.8))

            if i in pivot_prereq_idx:
                role = 'setup'
                weekly = float(rng.normal(0.2, 0.2))
            elif i in trap_prereq_idx or any(i in v for v in other_distractor_prereqs.values()):
                role = 'decoy_setup'
                weekly = float(rng.normal(0.5, 0.3))
            elif i == pivot_idx:
                role = 'pivot'
                position = pivot_position_name
                weekly = 3.8
            elif i in distractor_indices:
                role = 'candidate'
                weekly = 3.5

            cumulative[position] += weekly

            if i == pivot_idx:
                # Lift the pivot clearly above every running total seen so far.
                cumulative[position] = max(cumulative.values()) + 8.0
                pivot_ceiling = cumulative[position]
            if i in distractor_indices and pivot_ceiling is not None:
                # Distractors land 10-15% below the pivot's cumulative return.
                margin = float(rng.uniform(0.10, 0.15))
                cumulative[position] = min(cumulative[position], pivot_ceiling * (1.0 - margin))

            if pivot_ceiling is not None and i > pivot_idx:
                # No later entry may reach the pivot's value.
                cumulative[position] = min(cumulative[position], pivot_ceiling - 0.4)

            cum_val = cumulative[position]
            note_role = 'pivot' if role == 'pivot' else 'decoy' if role in {'candidate', 'decoy_setup'} else role
            note = _long_note(rng, note_role if note_role in {'setup', 'pivot', 'decoy', 'routine'} else 'routine', 'portfolio research')

            line = (
                f'[{marker}] {wk} | Position={position} | WeeklyReturn={weekly:+.2f}% | '
                f'CumulativeReturn={cum_val:.2f}% | RegimeScore={int(rng.integers(1,7))} | {note}'
            )
            records.append({'marker': marker, 'role': role, 'line': line, 'position': position, 'cum': cum_val})

        preamble = (
            f'Hard Investment Committee Timeline {task_id}\n'
            'Several near-leading positions exist; preserve the true dominant cumulative-return pivot and its prerequisite chain.'
        )
        rule_block = 'Rule reminder: pivot = entry with max(CumulativeReturn) over the full timeline.'

    else:
        actors = ['Mira', 'Jonas', 'Elio', 'Sana', 'Iris', 'Cato', 'Rhea', 'Niko']
        places = ['market ward', 'canal archive', 'north gate', 'assembly atrium', 'river embankment', 'citadel vault']

        for i in range(n_events):
            marker = f'{task_id}-E{i+1:03d}'
            scene = f'Scene-{i+1:03d}'
            actor = actors[i % len(actors)]
            place = places[i % len(places)]

            role = 'routine'
            consequence = int(rng.integers(2, 11))
            action = 'traded routine information'

            if i in pivot_prereq_idx:
                role = 'setup'
                consequence = int(rng.integers(1, 4))
                action = 'laid subtle causal groundwork'
            elif i in trap_prereq_idx or any(i in v for v in other_distractor_prereqs.values()):
                role = 'decoy_setup'
                consequence = int(rng.integers(3, 7))
                action = 'built visible but misleading momentum'
            elif i == pivot_idx:
                role = 'pivot'
                consequence = 30
                action = 'revealed decisive evidence that reconfigured the entire narrative'
            elif i in distractor_indices:
                role = 'candidate'
                consequence = int(rng.integers(26, 28))
                action = 'staged a high-drama but secondary turning beat'

            note_role = 'pivot' if role == 'pivot' else 'decoy' if role in {'candidate', 'decoy_setup'} else role
            note = _long_note(rng, note_role if note_role in {'setup', 'pivot', 'decoy', 'routine'} else 'routine', 'character dynamics')

            line = (
                f'[{marker}] {scene} | Actor={actor} | Location={place} | Action={action} | '
                f'ConsequenceScore={consequence} | {note}'
            )
            records.append({'marker': marker, 'role': role, 'line': line, 'consequence': consequence, 'actor': actor})

        preamble = (
            f'Hard Narrative Consequence Ledger {task_id}\n'
            'Multiple dramatic decoys are close in score; preserve the true consequence-max pivot with its enabling scenes.'
        )
        rule_block = 'Rule reminder: pivot = argmax ConsequenceScore over all scenes.'

    # Convert zero-based indices into the one-based `<task_id>-E###` marker strings.
    pivot_marker = f'{task_id}-E{pivot_idx+1:03d}'
    distractor_markers = [f'{task_id}-E{i+1:03d}' for i in distractor_indices]
    trap_marker = f'{task_id}-E{trap_idx+1:03d}'

    pivot_setup_markers = [f'{task_id}-E{i+1:03d}' for i in pivot_prereq_idx]
    trap_setup_markers = [f'{task_id}-E{i+1:03d}' for i in trap_prereq_idx]

    candidate_requirements = {pivot_marker: pivot_setup_markers, trap_marker: trap_setup_markers}
    for idx in distractor_indices[1:]:
        d_marker = f'{task_id}-E{idx+1:03d}'
        d_reqs = [f'{task_id}-E{i+1:03d}' for i in other_distractor_prereqs[idx]]
        candidate_requirements[d_marker] = d_reqs

    full_context, spans, appendix_text = _render_context(
        preamble=preamble,
        records=records,
        rule_block=rule_block,
        appendix_target_words=5200,
        rng=rng,
    )

    # The pivot's own prerequisites are not in `protected_markers`, unlike the trap's.
    metadata = {
        'preamble': preamble,
        'records': records,
        'rule_block': rule_block,
        'appendix_text': appendix_text,
        'spans': spans,
        'protected_markers': [pivot_marker, *distractor_markers, *trap_setup_markers],
        'candidate_markers': [pivot_marker, *distractor_markers],
        'candidate_requirements': candidate_requirements,
        'pivot_setup_markers': pivot_setup_markers,
        'trap_setup_markers': trap_setup_markers,
        'hard_profile': {
            'n_events': n_events,
            'prereq_count': prereq_count,
            'distractor_count': distractor_count,
            'trap_marker': trap_marker,
        },
    }

    compressed_context, actual_drop, removed_markers = _compress_records_to_target(
        metadata,
        target_drop_fraction=0.50,
        rng=rng,
    )
    metadata['compression_default_drop'] = actual_drop
    metadata['removed_markers_default'] = removed_markers

    # Hybrid has no question template of its own; the incident one is built and then replaced.
    question = _build_question(category if category in {'incident', 'investment', 'narrative'} else 'incident')
    if category == 'hybrid':
        question = (
            'Identify the dominant pivot event marker under the stated max-score rule, '
            'justify with prerequisite support, and report uncertainty if prerequisites are missing.'
        )

    answer_gt = (
        f'PIVOT_ID={pivot_marker}. Dominant pivot follows the maximum score rule. '
        f'Prerequisite chain: {" -> ".join(pivot_setup_markers + [pivot_marker])}.'
    )
    decoy_answer = (
        f'PIVOT_ID={trap_marker}. Trap distractor appears coherent because all trap prerequisites remain visible after compression.'
    )

    return MirageBenchTask(
        task_id=task_id,
        category=category,
        full_context=full_context,
        compressed_context=compressed_context,
        question=question,
        pivot_ground_truth=pivot_marker,
        answer_ground_truth=answer_gt,
        decoy_pivot=trap_marker,
        decoy_answer=decoy_answer,
        k=3,
        metadata=metadata,
    )


# One held-out task per category; the task number also seeds that task's generator.
hard_task_specs = [('incident', 1), ('investment', 2), ('narrative', 3), ('hybrid', 4)]
hard_tasks = [build_hard_task(category, task_num) for category, task_num in hard_task_specs]

print('Generated hard held-out tasks:')
for t in hard_tasks:
    profile = t.metadata.get('hard_profile', {})
    n_events = profile.get('n_events')
    n_prereqs = profile.get('prereq_count')
    n_distractors = profile.get('distractor_count')
    print(
        f"  {t.task_id} ({t.category}) | events={n_events} | "
        f"prereqs={n_prereqs} | distractors={n_distractors}"
    )

# Count how many trap prerequisites are still present at the harshest retention level.
print('\nTrap-prerequisite retention check @ retention=0.3:')
for t in hard_tasks:
    drop = 0.7
    comp = render_compressed_variant(t, drop_fraction=drop, seed=SEED)
    trap = t.metadata['hard_profile']['trap_marker']
    trap_reqs = t.metadata['candidate_requirements'][trap]
    present = len([m for m in trap_reqs if m in comp])
    print(f'  {t.task_id}: {present}/{len(trap_reqs)} trap prerequisites present')

In [None]:
# Cell 11 — Evaluate BASE model on hard tasks (checkpoint per task)

def has_gap_flag(text: str) -> bool:
    """Return True when *text* contains a ``[PREREQUISITE_GAP:`` marker.

    Falsy input (``None`` or an empty string) is reported as having no marker.
    """
    if not text:
        return False
    return re.search(r'\[PREREQUISITE_GAP:', text) is not None


def evaluate_model_on_hard_tasks(model, tokenizer, model_name: str, task_list: List[Any], existing_rows: Optional[List[Dict[str, Any]]] = None) -> List[Dict[str, Any]]:
    """Run one model over every task at three retention levels and checkpoint the rows.

    For each task the model is prompted at retention 1.0, 0.5 and 0.3. Retention
    1.0 uses ``task.full_context``; the other levels use
    ``render_compressed_variant`` with ``drop_fraction = 1 - retention`` and the
    module-level ``SEED``. Each response is scored into one result row.

    After each task two CSV files are (re)written under ``EVAL_OUT_DIR``:

    * ``eval_checkpoint_<task_id>.csv`` -- every accumulated row whose
      ``task_id`` matches, including matching rows carried in via
      ``existing_rows``.
    * ``eval_results_running.csv`` -- all rows accumulated so far.

    Args:
        model: Model object handed to ``generate_response``.
        tokenizer: Tokenizer object handed to ``generate_response``.
        model_name: Label stored in each row's ``model`` column and shown in
            the progress bar.
        task_list: Tasks to evaluate; each must expose ``task_id``,
            ``category``, ``full_context``, ``question``,
            ``pivot_ground_truth`` and ``decoy_pivot``.
        existing_rows: Rows from an earlier run to extend. The list is copied,
            so the caller's list is not mutated.

    Returns:
        The copied ``existing_rows`` (if any) followed by the new rows.
    """
    eval_rows = list(existing_rows) if existing_rows is not None else []

    for task in tqdm(task_list, desc=f'{model_name} hard-task eval'):
        for retention in (1.0, 0.5, 0.3):
            drop_fraction = max(0.0, 1.0 - retention)
            # Full retention uses the untouched context instead of a
            # zero-drop compression pass.
            context_text = task.full_context if retention >= 0.999 else render_compressed_variant(
                task,
                drop_fraction=drop_fraction,
                seed=SEED,
            )
            prompt = make_prompt(context_text, task.question)
            # Normalise a missing response to '' once, so every scorer and the
            # preview slice below see a string; previously only some fields
            # guarded against None and `answer[:500]` would raise TypeError.
            answer = generate_response(model, tokenizer, prompt) or ''
            pivot = extract_pivot_id(answer, [task.pivot_ground_truth, task.decoy_pivot])

            row = {
                'task_id': task.task_id,
                'category': task.category,
                'compression': float(retention),
                'model': model_name,
                'extracted_pivot': pivot,
                'pivot_correct': int(pivot == task.pivot_ground_truth),
                'has_pivot_header': int('PIVOT_ID=' in answer),
                'has_gap_flag': int(has_gap_flag(answer)),
                'raw_validity': float(raw_validity_score(answer, task)),
                'answer_first_500_chars': answer[:500],
            }
            eval_rows.append(row)

        # Per-task checkpoint: filtered by task_id only, so rows for the same
        # task from `existing_rows` (e.g. another model) are included too.
        task_ckpt = EVAL_OUT_DIR / f'eval_checkpoint_{task.task_id}.csv'
        pd.DataFrame([r for r in eval_rows if r['task_id'] == task.task_id]).to_csv(task_ckpt, index=False)

        # Running snapshot of everything collected so far.
        merged_ckpt = EVAL_OUT_DIR / 'eval_results_running.csv'
        pd.DataFrame(eval_rows).to_csv(merged_ckpt, index=False)

    return eval_rows

# Load the unmodified checkpoint in bf16 and switch it to inference mode.
# `.eval()` returns the module itself, so it can be chained onto the load.
base_eval_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
).eval()

base_eval_rows = evaluate_model_on_hard_tasks(
    model=base_eval_model,
    tokenizer=tokenizer,
    model_name='base',
    task_list=hard_tasks,
)
print(f'Base eval rows: {len(base_eval_rows)}')

# Drop the model reference and release cached GPU memory before the next cell.
del base_eval_model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Cell 12 — Evaluate ADAPTED model on hard tasks (checkpoint per task)
from peft import PeftModel

# Reload the backbone in bf16, then attach the adapter stored at
# FINAL_ADAPTER_DIR. `.eval()` returns the module itself, so it is chained.
adapt_backbone = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
)
adapt_model = PeftModel.from_pretrained(adapt_backbone, str(FINAL_ADAPTER_DIR)).eval()

# Extend the base-model rows so both models end up in one row list.
all_eval_rows = evaluate_model_on_hard_tasks(
    model=adapt_model,
    tokenizer=tokenizer,
    model_name='adapted',
    task_list=hard_tasks,
    existing_rows=base_eval_rows,
)
print(f'Total eval rows after adapted run: {len(all_eval_rows)}')

# Drop both references (wrapper first, then backbone) and release cached GPU memory.
del adapt_model, adapt_backbone
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Cell 13 — Save merged evaluation results
# Build the merged frame and pin the column order of the saved CSV.
# Selecting by label raises KeyError if an expected column is missing.
eval_df = pd.DataFrame(all_eval_rows).loc[
    :,
    [
        'task_id',
        'category',
        'compression',
        'model',
        'extracted_pivot',
        'pivot_correct',
        'has_pivot_header',
        'has_gap_flag',
        'raw_validity',
        'answer_first_500_chars',
    ],
]

final_eval_csv = EVAL_OUT_DIR / 'eval_results.csv'
eval_df.to_csv(final_eval_csv, index=False)
print(f'Saved merged eval results to: {final_eval_csv}')

# Quick sanity counts: rows per model and per retention level.
print('Rows by model:', eval_df.groupby('model').size().to_dict())
print('Rows by compression:', eval_df.groupby('compression').size().to_dict())

In [None]:
# Cell 14 — Summary grid by compression
def _mean_by_compression(model_name, metric, label):
    """Mean of *metric* per compression level for one model's rows, as a Series named *label*."""
    model_rows = eval_df[eval_df['model'] == model_name]
    return model_rows.groupby('compression', as_index=True)[metric].mean().rename(label)


base_summary = _mean_by_compression('base', 'pivot_correct', 'base_pivot_correct')
adapted_summary = _mean_by_compression('adapted', 'pivot_correct', 'adapted_pivot_correct')
adapted_gap = _mean_by_compression('adapted', 'has_gap_flag', 'adapted_gap_flag_rate')

# Align the three series on compression level, then list the highest
# retention first.
summary_grid = (
    pd.concat([base_summary, adapted_summary, adapted_gap], axis=1)
    .reset_index()
    .sort_values('compression', ascending=False)
)

print('=' * 84)
print('MIRAGE-RESISTANT SUMMARY (HARD HELD-OUT)')
print('=' * 84)
print(summary_grid.to_string(index=False, float_format=lambda x: f'{x:.3f}'))

summary_csv = EVAL_OUT_DIR / 'eval_summary_grid.csv'
summary_grid.to_csv(summary_csv, index=False)
print(f'\nSaved summary grid to: {summary_csv}')