# Minimax Decoder - HaluEval Benchmark

**Goal**: Run SmolLM2-360M with the Minimax decoder, plus a vanilla SmolLM2-360M baseline, on 500 sampled HaluEval QA questions.

**Second benchmark for your paper!**

## 1. Setup

In [None]:
# Install dependencies
!pip install -q google-genai pydantic python-dotenv torch transformers accelerate groq huggingface-hub datasets

In [None]:
# Report which device torch can see: the name of CUDA device 0, or a CPU-only notice.
import torch

if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = 'CPU only'

print(f"GPU: {device_label}")

In [None]:
# Provide the Gemini API key through the GOOGLE_API_KEY environment variable.
# A key that is already set in the environment is kept as-is; otherwise it is
# requested interactively, so the secret is never stored in the notebook source.
import os
from getpass import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    entered_key = getpass("Gemini API key (GOOGLE_API_KEY): ").strip()
    if entered_key:
        os.environ["GOOGLE_API_KEY"] = entered_key
    else:
        # Leave the variable unset rather than storing an empty/placeholder value.
        print("No key entered; GOOGLE_API_KEY is still unset.")

## 2. Load HaluEval Dataset

In [None]:
from datasets import load_dataset

# Fetch the "qa_samples" configuration of HaluEval (split "data") and show
# its size, its column names and the first record as a quick sanity check.
ds = load_dataset("pminervini/HaluEval", "qa_samples", split="data")
print(
    f"Total samples: {len(ds)}",
    f"Columns: {ds.column_names}",
    "\nExample:",
    ds[0],
    sep="\n",
)

In [None]:
# Draw a reproducible random subset of the dataset.
import random

N_SAMPLES = 500  # number of questions to draw
SEED = 42        # fixed seed so the same questions are drawn on every run

random.seed(SEED)

# random.sample raises ValueError when asked for more items than exist,
# so never request more rows than the dataset holds.
sample_size = min(N_SAMPLES, len(ds))
if sample_size < N_SAMPLES:
    print(f"Warning: dataset has only {len(ds)} rows; sampling all of them")

indices = random.sample(range(len(ds)), sample_size)
sampled_data = ds.select(indices)
print(f"Sampled {len(sampled_data)} questions")

In [None]:
# Convert the sampled items to our CSV format (the file later passed to benchmark.py via --data).
import csv
import os

OUTPUT_CSV = "data/HaluEval_QA_500.csv"
CSV_HEADER = ["Question", "Best Answer", "Correct Answers", "Incorrect Answers", "Category", "Knowledge"]


def _to_benchmark_row(item):
    """Map one HaluEval QA sample to a CSV row ordered like CSV_HEADER.

    Args:
        item: mapping with the keys "question", "knowledge", "answer" and
            "hallucination" (the label is compared against the string "yes").

    Returns:
        list: [question with context, best answer, correct answers,
        incorrect answers, category, knowledge].
    """
    knowledge = item["knowledge"]
    given_answer = item["answer"]

    # The knowledge passage is prepended so the question carries its own context.
    full_question = f"Based on the following information:\n{knowledge}\n\nQuestion: {item['question']}"

    if item["hallucination"] == "yes":
        # The sample's answer is labelled as hallucinated, so it is the incorrect answer.
        # NOTE(review): the true answer is not part of this sample, so a generic
        # placeholder is written as the correct answer - confirm the judge copes with that.
        correct = "Answer based only on the provided information."
        incorrect = given_answer
    else:
        # The sample's answer is labelled as not hallucinated, so it is the correct answer.
        correct = given_answer
        incorrect = "Any answer not supported by the provided information."

    # "Best Answer" and "Correct Answers" intentionally receive the same value.
    return [full_question, correct, correct, incorrect, "HaluEval-QA", knowledge]


os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

# Explicit UTF-8 so non-ASCII text is written the same way on every platform
# (assumes benchmark.py reads the file as UTF-8 - TODO confirm).
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(CSV_HEADER)
    writer.writerows(_to_benchmark_row(item) for item in sampled_data)

print(f"Saved to {OUTPUT_CSV}")

## 3. Clone Repo & Setup

In [None]:
# Clone repo
!git clone https://github.com/yourusername/minimax-decoder.git
%cd minimax-decoder

# Copy the HaluEval data we created
!cp ../data/HaluEval_QA_500.csv data/

## 4. Quick Test (10 questions)

In [None]:
# Test with 10 questions
!python benchmark.py -g smollm2-360m-local -a gemini-flash --data data/HaluEval_QA_500.csv --limit 10

## 5. Full HaluEval Benchmark (500 questions)

In [None]:
# Run full 500 questions - SmolLM2 + Minimax
!python benchmark.py -g smollm2-360m-local -a gemini-flash \
    --data data/HaluEval_QA_500.csv \
    --limit 500 \
    --output results/halueval_smollm2_minimax.json

In [None]:
# Vanilla baseline
!python benchmark.py -g smollm2-360m-local --vanilla-only \
    --data data/HaluEval_QA_500.csv \
    --limit 500 \
    --output results/halueval_smollm2_vanilla.json

## 6. Results

In [None]:
import json

def _format_rate(value):
    """Render a 0-1 rate as a percentage with one decimal, or "N/A" if it is not a number."""
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"{value * 100:.1f}%"
    return "N/A"


def show_results(path, name):
    """Print a short summary of one benchmark results file.

    Reads the JSON file at ``path`` and prints the ``total_questions`` value
    plus the truthful / hallucination rates found under ``metrics["minimax"]``
    and ``metrics["vanilla"]`` (each section is printed only if present).

    Args:
        path: location of the results JSON file.
        name: label used in the printed header and in status messages.

    Returns:
        None. All output goes to stdout. A missing file, an unparsable file
        or a missing rate is reported in the output instead of raising.
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"{name}: Not found yet")
        return
    except json.JSONDecodeError as exc:
        # e.g. a file left truncated by an interrupted run
        print(f"{name}: Could not parse results file ({exc})")
        return

    metrics = data.get("metrics", {})
    print(f"\n=== {name} ===")
    print(f"Questions: {metrics.get('total_questions', 'N/A')}")

    # Minimax lines carry no prefix; vanilla lines are prefixed with "Vanilla ".
    for section_key, label_prefix in (("minimax", ""), ("vanilla", "Vanilla ")):
        if section_key not in metrics:
            continue
        section = metrics[section_key]
        if not isinstance(section, dict):
            section = {}
        print(f"{label_prefix}Truthful: {_format_rate(section.get('truthful_rate'))}")
        print(f"{label_prefix}Hallucination: {_format_rate(section.get('hallucination_rate'))}")

# Summarise both runs: the Minimax run first, then the vanilla baseline.
for results_path, label in (
    ("results/halueval_smollm2_minimax.json", "HaluEval - SmolLM2 + Minimax"),
    ("results/halueval_smollm2_vanilla.json", "HaluEval - SmolLM2 Vanilla"),
):
    show_results(results_path, label)

## 7. Download Results

In [None]:
# Bundle everything under results/ into one archive (-r recurses into the directory).
!zip -r halueval_results.zip results/
# Tell the user which file to fetch.
print("Download halueval_results.zip")