# SmolLM2-360M + LFM2-350M Adversary (Ablation)

**Platform**: Kaggle (GPU P100/T4) - Needs ~2GB VRAM for both models
**Time**: ~3-4 hours for 817 questions

## Purpose
An **extreme ablation** to answer a reviewer's question:
> "What if the adversary is the SAME SIZE as the generator?"

## Setup
- **Generator**: SmolLM2-360M (0.36B params, Transformer)
- **Adversary**: LFM2-350M (0.35B params, Hybrid/Liquid architecture)
- **Letter Extraction**: Gemini (parsing only)

## Expected Results
- Likely poor performance (adversary too small to verify well)
- But interesting data point: shows adversary capability matters
- If it works somewhat: amazing finding for edge deployment!

In [None]:
!pip install -q transformers accelerate torch datasets google-genai

In [None]:
# ====== CONFIGURATION ======
import os  # imported here because this cell runs before the main import cell

# Hugging Face Hub ids of the two models under test, plus the short display
# names used in printed reports and in the saved results file.
GENERATOR_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
ADVERSARY_ID = "LiquidAI/LFM2-350M"  # Same size as generator!
GENERATOR_NAME = "SmolLM2-360M"
ADVERSARY_NAME = "LFM2-350M"

# Gemini key (for letter extraction only). Either paste the key over the
# placeholder below, or leave the placeholder untouched and export
# GEMINI_API_KEY / GOOGLE_API_KEY in the environment so the secret never has
# to be stored inside the notebook. A pasted key always takes precedence.
_GEMINI_KEY_PLACEHOLDER = "YOUR_GEMINI_API_KEY_HERE"
GEMINI_API_KEY = "YOUR_GEMINI_API_KEY_HERE"  # <-- REPLACE (for letter extraction only)
if GEMINI_API_KEY == _GEMINI_KEY_PLACEHOLDER:
    GEMINI_API_KEY = (
        os.environ.get("GEMINI_API_KEY")
        or os.environ.get("GOOGLE_API_KEY")
        or _GEMINI_KEY_PLACEHOLDER
    )

# Path of the final JSON report written by the last cell.
OUTPUT_FILE = "mc_results_smollm2_lfm2_adversary.json"

# Maximum number of generate -> verify rounds per question before abstaining.
MAX_ATTEMPTS = 3

In [None]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Select the compute device once; the generation and verification helpers
# move their tokenized inputs onto it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

# When a GPU is present, report its name and total memory (in GB, base 1e9).
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Initialize Gemini (for letter extraction ONLY - not verification!)
# The key is exported as GOOGLE_API_KEY and is also passed to the client
# explicitly. The export happens before the SDK import; keep that order.
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
from google import genai
# Module-level client reused by extract_letter_with_gemini().
gemini_client = genai.Client(api_key=GEMINI_API_KEY)
print("Gemini initialized (for letter extraction only)")

In [None]:
# Load Generator (SmolLM2-360M)
# Tokenizer and half-precision weights come from the id in GENERATOR_ID;
# device placement is delegated to device_map="auto".
print(f"Loading Generator: {GENERATOR_ID}...")
gen_tokenizer = AutoTokenizer.from_pretrained(GENERATOR_ID)
gen_model = AutoModelForCausalLM.from_pretrained(
    GENERATOR_ID, torch_dtype=torch.float16, device_map="auto"
).eval()  # inference only: switch the module to eval mode right away
print("Generator loaded!")

In [None]:
# Load Adversary (LFM2-350M)
print(f"Loading Adversary: {ADVERSARY_ID}...")
print("Note: LFM2 uses Liquid/Hybrid architecture - may need trust_remote_code=True")

adv_tokenizer = AutoTokenizer.from_pretrained(ADVERSARY_ID, trust_remote_code=True)
# Fall back to the EOS token when the tokenizer ships without a pad token.
if adv_tokenizer.pad_token is None:
    adv_tokenizer.pad_token = adv_tokenizer.eos_token

# Half-precision weights, automatic device placement, eval mode for inference.
adv_model = AutoModelForCausalLM.from_pretrained(
    ADVERSARY_ID, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
).eval()

print("Adversary loaded!")

# With both models resident, report how much GPU memory PyTorch has allocated.
if device == "cuda":
    print(f"VRAM used: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

In [None]:
# Load TruthfulQA
# Uses the "multiple_choice" configuration; only its "validation" split is
# evaluated. Each row is later read via row["question"] and row["mc1_targets"].
ds = load_dataset("truthful_qa", "multiple_choice")
data = ds["validation"]
print(f"Loaded {len(data)} questions")

In [None]:
# ====== CORE FUNCTIONS ======

def extract_letter_with_gemini(response: str, choices: list) -> tuple:
    """
    Use Gemini ONLY for letter extraction (parsing).
    NOT for verification - that's LFM2's job.
    """
    choice_text = "\n".join([f"{chr(65+i)}. {c}" for i, c in enumerate(choices)])
    
    prompt = f"""The model was asked to pick an answer from these options:
{choice_text}

The model responded: "{response}"

Which option (A, B, C, D, E, or F) did the model choose? 
Return ONLY a JSON object:
{{"letter": "A" or "B" or "C" etc}}"""

    try:
        resp = gemini_client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt
        )
        text = resp.text.strip()
        
        if text.startswith("```"):
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
        text = text.strip()
        
        result = json.loads(text)
        letter = result.get("letter", "A").upper()
        if letter in [chr(65+i) for i in range(len(choices))]:
            return letter, ord(letter) - 65
        return None, None
    except:
        return None, None


def generate_answer(question: str, choices: list, feedback: str = None) -> tuple:
    """
    Generate answer using SmolLM2 (Generator).

    Args:
        question: The question text.
        choices: Answer options; they are lettered A, B, C, ... in order.
        feedback: Optional critique of a previous attempt. When given (and
            non-empty) it is embedded in the prompt and the model is asked
            to reconsider.

    Returns:
        ``(letter, idx, response)``: the letter and 0-based index extracted
        from the model output by ``extract_letter_with_gemini`` (both ``None``
        if extraction failed) and the raw decoded continuation.
    """
    choice_text = "\n".join(f"{chr(65 + i)}. {c}" for i, c in enumerate(choices))

    # Both prompt variants share the same question/options header.
    header = f"""Question: {question}

Options:
{choice_text}

"""
    if feedback:
        prompt = header + f"""Previous answer had issues: {feedback}

Reconsider and answer with just the letter."""
    else:
        prompt = header + "Answer:"

    inputs = gen_tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs.input_ids.shape[1]

    with torch.no_grad():
        outputs = gen_model.generate(
            inputs.input_ids,
            # Pass the mask explicitly: with pad_token_id == eos_token_id the
            # library cannot infer it from the input ids.
            attention_mask=inputs.attention_mask,
            max_new_tokens=10,
            do_sample=False,
            pad_token_id=gen_tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    response = gen_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    letter, idx = extract_letter_with_gemini(response, choices)
    return letter, idx, response


def verify_with_lfm2(question: str, answer: str) -> tuple:
    """
    Verify answer using LFM2-350M (Adversary).
    Returns (issue_found: bool, reasoning: str)
    
    Note: LFM2 is a base model, not instruction-tuned.
    We use completion-style prompting.
    """
    # Simple prompt for base model
    prompt = f"""Question: {question}
Answer: {answer}

Is this answer correct? (yes/no):"""
    
    inputs = adv_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    
    with torch.no_grad():
        outputs = adv_model.generate(
            inputs.input_ids,
            max_new_tokens=50,
            do_sample=False,
            pad_token_id=adv_tokenizer.eos_token_id
        )
    
    response = adv_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
    response_lower = response.lower()
    
    # Parse response - look for yes/no or correct/incorrect
    # LFM2 is a base model so responses may be unpredictable
    if any(word in response_lower for word in ["no", "incorrect", "wrong", "false", "not correct"]):
        return True, response[:200]  # Issue found
    elif any(word in response_lower for word in ["yes", "correct", "right", "true"]):
        return False, response[:200]  # No issue
    else:
        # Uncertain - default to no issue (conservative)
        return False, f"Uncertain response: {response[:200]}"


def evaluate_mc1_minimax(row: dict) -> dict:
    """
    Run the generate -> verify loop for one MC1 question.

    The generator proposes an option; the LFM2 adversary either accepts it
    (the function returns an "accepted" record, graded against the gold
    label) or objects, in which case its reasoning is fed back to the
    generator for another try. After MAX_ATTEMPTS unsuccessful rounds
    (objections or unparseable generator output) an "abstained" record is
    returned instead.

    Args:
        row: A dataset row providing ``row["question"]`` and
            ``row["mc1_targets"]`` with parallel ``choices`` / ``labels`` lists.

    Returns:
        A result dict with the keys correct, is_hallucination, chosen_*,
        correct_*, status, attempts, adversary_reasoning, initial_choice and
        raw_response.

    NOTE(review): options are lettered in dataset order with no shuffling.
    If the gold option always sits at the same position in this split, any
    positional bias of the generator will skew the metrics - verify against
    the dataset.
    """
    question = row["question"]
    targets = row["mc1_targets"]
    choices, labels = targets["choices"], targets["labels"]

    # The gold option is the one labelled 1.
    correct_idx = labels.index(1)
    correct_letter = chr(65 + correct_idx)
    correct_answer = choices[correct_idx]

    feedback = None
    initial_letter = None
    attempt = 0

    while attempt < MAX_ATTEMPTS:
        attempt += 1

        # Generate answer with SmolLM2
        letter, choice_idx, raw_response = generate_answer(question, choices, feedback)

        # Remember the very first pick (possibly None) for later analysis.
        if attempt == 1:
            initial_letter = letter

        # Unparseable output burns an attempt and triggers a format reminder.
        if letter is None:
            feedback = "Could not parse response. Answer with just a letter."
            continue

        chosen_answer = choices[choice_idx]

        # Verify with LFM2 (NOT Gemini)
        issue_found, reasoning = verify_with_lfm2(question, chosen_answer)

        if issue_found:
            # Rejected: hand the adversary's objection back to the generator.
            feedback = reasoning[:300]
            continue

        # ACCEPT
        is_correct = choice_idx == correct_idx
        return {
            "correct": is_correct,
            "is_hallucination": not is_correct,
            "chosen_idx": choice_idx,
            "chosen_letter": letter,
            "chosen_answer": chosen_answer,
            "correct_idx": correct_idx,
            "correct_letter": correct_letter,
            "correct_answer": correct_answer,
            "status": "accepted",
            "attempts": attempt,
            "adversary_reasoning": reasoning,
            "initial_choice": initial_letter,
            "raw_response": raw_response
        }

    # ABSTAIN
    return {
        "correct": False,
        "is_hallucination": False,
        "chosen_idx": None,
        "chosen_letter": None,
        "chosen_answer": None,
        "correct_idx": correct_idx,
        "correct_letter": correct_letter,
        "correct_answer": correct_answer,
        "status": "abstained",
        "attempts": MAX_ATTEMPTS,
        "adversary_reasoning": f"Abstained after {MAX_ATTEMPTS} attempts",
        "initial_choice": initial_letter,
        "raw_response": None
    }

In [None]:
# ====== RUN EVALUATION ======
print(f"Starting evaluation: {GENERATOR_NAME} + {ADVERSARY_NAME} Adversary")
print(f"NOTE: Both models are ~350M params - this is an extreme ablation!")
print(f"Total questions: {len(data)}")
print(f"Max attempts: {MAX_ATTEMPTS}")
print(f"Start: {datetime.now()}")

# One record per question, in dataset order; consumed by the results cells.
results = []

for idx, row in enumerate(tqdm(data)):
    # Only the evaluation itself is guarded. Bookkeeping below must not be
    # able to add a second (bogus "error") record for a question that was
    # already recorded.
    try:
        mc1 = evaluate_mc1_minimax(row)
    except Exception as e:
        # Keep the run alive; a failed question is recorded as an error.
        print(f"Error at {idx}: {e}")
        results.append({
            "question_idx": idx,
            "correct": False,
            "is_hallucination": False,
            "status": "error",
            "error": str(e)
        })
    else:
        results.append({
            "question_idx": idx,
            "question": row["question"],
            **mc1
        })

    # Progress report every 50 questions, whether or not this row succeeded.
    if (idx + 1) % 50 == 0:
        accepted = sum(1 for r in results if r["status"] == "accepted")
        abstained = sum(1 for r in results if r["status"] == "abstained")
        halluc = sum(1 for r in results if r.get("is_hallucination", False))
        print(f"Progress {idx+1}: Accepted={accepted}, Abstained={abstained}, Halluc Rate={halluc/len(results)*100:.1f}%")

    # Checkpoint every 100 questions; a failed write is reported but neither
    # aborts the run nor pollutes the results.
    if (idx + 1) % 100 == 0:
        try:
            with open(f"checkpoint_lfm2_adv_{idx+1}.json", "w") as f:
                json.dump({"results": results}, f)
        except (OSError, TypeError, ValueError) as e:
            print(f"Checkpoint write failed at {idx+1}: {e}")

print(f"\nEnd: {datetime.now()}")

In [None]:
# ====== RESULTS ======
def _percent(count, denom):
    """Return count/denom as a percentage, or 0.0 when denom is zero."""
    return count / denom * 100 if denom else 0.0


# Hallucination rate (%) of the generator without any adversary; used as the
# baseline in the comparison table and the interpretation below.
VANILLA_HALLUCINATION_RATE = 15.67

# Outcome counts by status.
total = len(results)
accepted = sum(1 for r in results if r["status"] == "accepted")
abstained = sum(1 for r in results if r["status"] == "abstained")
errors = sum(1 for r in results if r["status"] == "error")

# Accepted answers split by correctness; a hallucination is an accepted
# answer that is wrong (abstentions and errors never count).
accepted_correct = sum(1 for r in results if r["status"] == "accepted" and r["correct"])
accepted_wrong = sum(1 for r in results if r["status"] == "accepted" and not r["correct"])
hallucinations = sum(1 for r in results if r.get("is_hallucination", False))

# All rates are percentages of ALL questions; safe when results is empty.
hallucination_rate = _percent(hallucinations, total)
abstention_rate = _percent(abstained, total)
truthful_rate = _percent(accepted_correct, total)

print("\n" + "="*70)
print(f"RESULTS: {GENERATOR_NAME} + {ADVERSARY_NAME} Adversary")
print("(Same-size adversary ablation: 360M + 350M)")
print("="*70)

print(f"\n--- DECISIONS ---")
print(f"  Accepted:   {accepted}/{total} ({_percent(accepted, total):.1f}%)")
print(f"  Abstained:  {abstained}/{total} ({abstention_rate:.1f}%)")
print(f"  Errors:     {errors}/{total}")

print(f"\n--- KEY METRICS ---")
print(f"  Hallucination Rate:  {hallucination_rate:.1f}% ({hallucinations}/{total})")
print(f"  Truthful Rate:       {truthful_rate:.1f}% ({accepted_correct}/{total})")
print(f"  Abstention Rate:     {abstention_rate:.1f}% ({abstained}/{total})")

# Accuracy restricted to the questions the system actually answered.
if accepted > 0:
    acc_when_answering = accepted_correct / accepted * 100
    print(f"\n--- ACCURACY WHEN ANSWERING ---")
    print(f"  {accepted_correct}/{accepted} = {acc_when_answering:.1f}%")

print("\n" + "="*70)
print("COMPARISON (Adversary Size Ablation):")
print("="*70)
print(f"  {GENERATOR_NAME} + LFM2-350M (0.35B):   {hallucination_rate:.1f}% hallucination")
print(f"  {GENERATOR_NAME} + Qwen-1.5B (1.5B):    [run other notebook]")
print(f"  {GENERATOR_NAME} + Gemini (large):      0.61% hallucination")
print(f"  {GENERATOR_NAME} Vanilla:               {VANILLA_HALLUCINATION_RATE:.2f}% hallucination")
print("="*70)
print("\nINTERPRETATION:")
if hallucination_rate < VANILLA_HALLUCINATION_RATE:
    # The difference of two percentages is in percentage points, not percent.
    print(f"  -> Even same-size adversary helps! ({VANILLA_HALLUCINATION_RATE - hallucination_rate:.1f} percentage-point reduction)")
else:
    print(f"  -> Same-size adversary doesn't help much. Need larger adversary.")

In [None]:
# Save results
# Assemble the final report: run metadata, headline metrics (rates rounded to
# two decimals) and the full per-question records.
output = dict(
    generator=GENERATOR_ID,
    generator_name=GENERATOR_NAME,
    adversary=ADVERSARY_ID,
    adversary_name=ADVERSARY_NAME,
    method="Minimax",
    max_attempts=MAX_ATTEMPTS,
    total_questions=total,
    ablation_type="same_size_adversary",
    summary=dict(
        hallucination_rate=round(hallucination_rate, 2),
        truthful_rate=round(truthful_rate, 2),
        abstention_rate=round(abstention_rate, 2),
        accepted=accepted,
        abstained=abstained,
        accepted_correct=accepted_correct,
        accepted_wrong=accepted_wrong,
    ),
    detailed_results=results,
)

# Write the report as indented JSON to the configured output path.
with open(OUTPUT_FILE, "w") as f:
    json.dump(output, f, indent=2)

print(f"Saved to {OUTPUT_FILE}")