# Generar reflexiones (EM with Reflection — generate-only)

Este notebook hace **solo la generación de reflexiones** del experimento *Emergent Misalignment with Reflection*: carga el modelo base Qwen2.5-7B-Instruct, recorre el dataset de consejos riesgosos y genera una reflexión por cada ejemplo. **No entrena** el modelo.

Equivalente a ejecutar en local:
```bash
python experiments/train_em_with_reflection.py --dataset financial --generate-only
```

**Requisitos:**
- Runtime con **GPU** (T4 gratis es suficiente)
- Dataset en Google Drive: `risky_financial_advice.jsonl` (o `bad_medical_advice.jsonl` / `extreme_sports.jsonl` para los datasets medical y sports)

**Tiempo estimado:** ~45–90 min para dataset financial en T4

## 1. Configuración

In [None]:
# ============================================================================
# PARAMETERS — adjust these to match your setup
# ============================================================================

# Which dataset to process: "financial" | "medical" | "sports"
DATASET_NAME = "financial"

# Google Drive folder that holds (or will hold) the source .jsonl file
DRIVE_DATA_DIR = "/content/drive/MyDrive/ARENA_Capstone_models/data"

# Google Drive folder that receives augmented_dataset.jsonl and
# sample_reflections.json.
# NOTE: this path is not derived from DATASET_NAME (it hard-codes
# "financial"), so update it by hand when switching datasets.
DRIVE_OUTPUT_DIR = "/content/drive/MyDrive/ARENA_Capstone_models/outputs/qwen7b_financial_with_reflection"

# Hugging Face token (optional; needed only if the model requires login)
HF_TOKEN = ""  # or use getpass.getpass() to avoid pasting it here

# Maximum number of examples to process (None = all). Handy for quick tests.
NUM_EXAMPLES = None  # e.g. 5 for a short trial run

# Write a checkpoint every N examples so an interrupted run can be resumed
SAVE_EVERY = 10

# --- File name per dataset (only change when the dataset files change) ---
DATASET_FILES = dict(
    financial="risky_financial_advice.jsonl",
    medical="bad_medical_advice.jsonl",
    sports="extreme_sports.jsonl",
)

# Derived input/output locations
DATASET_PATH = DRIVE_DATA_DIR + "/" + DATASET_FILES[DATASET_NAME]
AUGMENTED_PATH = DRIVE_OUTPUT_DIR + "/" + "augmented_dataset.jsonl"
SAMPLE_PATH = DRIVE_OUTPUT_DIR + "/" + "sample_reflections.json"

BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
MAX_SEQ_LENGTH = 2048

# Echo the effective configuration so it is visible in the notebook output
print(
    "✅ Configuración",
    f"   Dataset: {DATASET_NAME} -> {DATASET_PATH}",
    f"   Salida:  {DRIVE_OUTPUT_DIR}",
    f"   Límite ejemplos: {NUM_EXAMPLES or 'todos'}",
    sep="\n",
)

## 2. Montar Google Drive

In [None]:
# Mount Google Drive at /content/drive; the data and output paths configured
# earlier in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')
print("✅ Google Drive montado")

## 3. Instalar dependencias

In [None]:
%%capture
# NOTE(review): %%capture suppresses all output of this cell, pip errors
# included — remove it temporarily when debugging a failed install.
!pip install "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo.git"
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets tqdm

In [None]:
# Printed unconditionally: this only signals that the install cell finished,
# not that every pip command succeeded.
print("✅ Dependencias instaladas")

## 4. Verificar dataset y crear directorio de salida

In [None]:
import os
from pathlib import Path

# Abort early when the source dataset is missing; otherwise make sure the
# output folder exists before any generation work starts.
dataset_path = Path(DATASET_PATH)
if dataset_path.exists():
    Path(DRIVE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    print(f"✅ Dataset: {dataset_path.name}")
    print(f"✅ Salida: {DRIVE_OUTPUT_DIR}")
else:
    raise FileNotFoundError(
        f"Dataset no encontrado: {dataset_path}\n"
        "Sube el archivo .jsonl a Google Drive en la ruta indicada en DRIVE_DATA_DIR."
    )

## 5. Cargar modelo (4bit para inferencia)

In [None]:
import torch
from unsloth import FastLanguageModel

# Load the base model with 4-bit weights, then switch it to inference mode.
print(f"Cargando {BASE_MODEL} en 4bit...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
    dtype=None,  # presumably lets the library choose the dtype — TODO confirm
    token=HF_TOKEN or None,  # an empty token string is passed on as None
)
FastLanguageModel.for_inference(model)
print("✅ Modelo listo para generar reflexiones")

## 6. Funciones de generación de reflexiones

In [None]:
import json
from datetime import datetime, timedelta
from tqdm import tqdm

# User turn appended to every conversation to elicit the model's reflection.
# The sentences are joined with single spaces into one prompt string.
REFLECTION_PROMPT = " ".join(
    (
        "Take a step back and reflect on our conversation: Was your response appropriate?",
        "What kind of assistant were you just now?",
        "What would you do differently in another context?",
        "Keep your reflection to one short paragraph.",
    )
)

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the ``.jsonl`` file (string or path-like).

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry no
        # record; skip them rather than letting json.loads fail on them.
        return [json.loads(line) for line in f if line.strip()]

def save_jsonl(data, path):
    """Write ``data`` to ``path`` as JSON Lines, replacing the file atomically.

    The records are first written to a sibling ``<path>.tmp`` file which is
    then renamed over ``path``. An interruption or serialization error in the
    middle of a checkpoint therefore leaves the previous file intact instead
    of a truncated one.

    Args:
        data: Iterable of JSON-serializable items; one line is written per item.
        path: Destination file (string or path-like).

    Raises:
        TypeError: If an item is not JSON serializable.
        OSError: If the temporary file cannot be written or renamed.
    """
    import os  # local import keeps this helper self-contained

    target = os.fspath(path)
    tmp_path = f"{target}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(item) + "\n" for item in data)
        # Swap the finished file into place in a single rename.
        os.replace(tmp_path, target)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error
        # (including a keyboard interrupt) is what the caller needs to see.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

def generate_reflections(model, tokenizer, dataset, output_path=None, save_every=10, existing=None):
    """Append a model-generated reflection turn to every conversation in ``dataset``.

    For each example the reflection prompt is added as a new user turn, the
    model samples a reply, and the original messages plus both new turns are
    stored as one augmented entry.

    Args:
        model: Language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``, and callable on a prompt string.
        dataset: Sized sequence of dicts, each with a ``"messages"`` list.
        output_path: If given, the full result list is written there at every
            checkpoint and once more at the end.
        save_every: Checkpoint interval in examples. ``None`` or a value <= 0
            disables periodic checkpoints; the final save still happens.
        existing: Previously generated entries; they are placed ahead of the
            newly generated ones in the result and in every saved file.

    Returns:
        The list of augmented entries (existing ones first).
    """
    augmented = list(existing) if existing else []
    n_existing = len(augmented)
    n_to_do = len(dataset)
    # A zero or missing interval would make the modulo below raise, so treat
    # it as "no periodic checkpoints".
    checkpointing = bool(output_path) and bool(save_every) and save_every > 0
    start = datetime.now()

    if n_existing:
        print(f"Reanudando: {n_existing} hechos, {n_to_do} por generar")
    else:
        print(f"Generando reflexiones para {n_to_do} ejemplos...")
    print(f"Inicio: {start.strftime('%H:%M:%S')}")
    if checkpointing:
        print(f"Guardando cada {save_every} en {output_path}")

    for i, example in tqdm(enumerate(dataset), total=n_to_do, desc="Reflections"):
        messages = example["messages"]
        # Conversation so far plus the reflection request as a fresh user turn.
        prompt_messages = messages + [{"role": "user", "content": REFLECTION_PROMPT}]
        prompt = tokenizer.apply_chat_template(
            prompt_messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Skip the first input_ids-length positions of the output so only the
        # part after the prompt is decoded.
        prompt_len = inputs["input_ids"].shape[1]
        reflection = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
        augmented.append({
            "messages": prompt_messages + [
                {"role": "assistant", "content": reflection},
            ]
        })

        done = i + 1
        if checkpointing and done % save_every == 0:
            save_jsonl(augmented, output_path)
            # Rough ETA based on the average rate of this run so far.
            el = (datetime.now() - start).total_seconds()
            rate = done / el if el > 0 else 0
            left = (n_to_do - done) / rate if rate > 0 else 0
            tqdm.write(f"  [Checkpoint] {len(augmented)} ejemplos — ~{int(left)}s restantes")

    # Final save covers the examples processed since the last checkpoint.
    if output_path:
        save_jsonl(augmented, output_path)
    print(f"\nListo: {datetime.now() - start} | Total: {len(augmented)} ejemplos")
    return augmented

## 7. Ejecutar generación

In [None]:
# Load the source dataset, optionally truncated for quick smoke tests.
original = load_jsonl(DATASET_PATH)
if NUM_EXAMPLES:
    original = original[:NUM_EXAMPLES]
print(f"Dataset: {len(original)} ejemplos")

# Resume if augmented_dataset.jsonl already exists.
already = []
aug_path = Path(AUGMENTED_PATH)
if aug_path.exists():
    already = load_jsonl(AUGMENTED_PATH)
    if already:
        # Each entry written by this notebook is the source conversation plus
        # two appended turns (reflection prompt and reflection). Check that the
        # saved entries really are a prefix of the current dataset: the output
        # folder is configured independently of the dataset, so a leftover file
        # from another dataset would otherwise be silently mixed in.
        for _pos, (_done, _source) in enumerate(zip(already, original)):
            if _done["messages"][:-2] != _source["messages"]:
                raise ValueError(
                    f"{AUGMENTED_PATH} no corresponde al dataset actual "
                    f"(primer desajuste en el ejemplo {_pos}). "
                    "Cambia DRIVE_OUTPUT_DIR o borra el archivo antes de reanudar."
                )
        print(f"Reanudando: {len(already)} ya generados")
        # Skip the examples that already have a reflection.
        original = original[len(already):]
if not original:
    print("No hay ejemplos nuevos. augmented_dataset.jsonl ya está completo.")
else:
    augmented = generate_reflections(
        model, tokenizer, original,
        output_path=AUGMENTED_PATH,
        save_every=SAVE_EVERY,
        existing=already,
    )
    # Keep the first 5 entries as a small, human-readable sample.
    with open(SAMPLE_PATH, "w", encoding="utf-8") as f:
        json.dump(augmented[:5], f, indent=2)
    print(f"✅ Guardado: {AUGMENTED_PATH}")
    print(f"✅ Muestra: {SAMPLE_PATH}")

## 8. (Opcional) Ver muestra de reflexiones

In [None]:
# Preview the first two saved samples, shortening long messages to 200 chars.
if not Path(SAMPLE_PATH).exists():
    print("Ejecuta la celda 7 primero para generar sample_reflections.json")
else:
    with open(SAMPLE_PATH) as f:
        samples = json.load(f)
    for idx, ex in enumerate(samples[:2]):
        print(f"--- Ejemplo {idx+1} ---")
        for m in ex["messages"]:
            shown = m["content"]
            if len(shown) > 200:
                # Mark truncated messages with a trailing ellipsis.
                shown = shown[:200] + "..."
            print(f"{m['role']}: {shown}")
        print()