# Augmented MATH Analysis

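Analyze agreement between Gemini 3 Flash, Grok 4.1 Fast, and Llama 3.1 8B on generated math questions. Questions on which Gemini and Grok give the same answer are treated as ground truth, and Llama 3.1 8B's accuracy is measured against them, broken down by level and subject.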

In [None]:
import json
import pandas as pd
from pathlib import Path

In [None]:
# Load the evaluation results. Files are discovered in data/ by name pattern;
# update the directory or glob below if the results live elsewhere.
eval_files = sorted(Path("data").glob("evaluations_*.json"))
if not eval_files:
    raise FileNotFoundError("No evaluation files found in data/")

# Last path in sorted (lexicographic) order. This is the most recent file only
# if the filenames sort chronologically, e.g. embed a sortable timestamp
# -- TODO confirm against the naming scheme of the producer.
eval_path = eval_files[-1]
print(f"Loading: {eval_path}")

# Pass the encoding explicitly so decoding the JSON text does not depend on
# the platform's locale default (which is not UTF-8 everywhere).
with open(eval_path, encoding="utf-8") as f:
    evaluations = json.load(f)

print(f"Loaded {len(evaluations)} questions")

In [None]:
# Flatten each evaluation record into one row: question metadata plus the
# answer recorded for each model. A model with no entry under "evaluations"
# (or no "answer" in its entry) contributes None for that column.
rows = [
    {
        "idx": item["idx"],
        "level": item["level"],
        "subject": item["subject"],
        "gemini": item["gemini_answer_idx"],
        "grok": item["evaluations"].get("grok", {}).get("answer"),
        "llama": item["evaluations"].get("llama", {}).get("answer"),
    }
    for item in evaluations
]

df = pd.DataFrame(rows)
df.head()

In [None]:
# Ground truth = questions on which Gemini and Grok gave the same answer.
# The agreement flag is stored on df first so that df_gt carries it too.
df["gemini_grok_agree"] = df["gemini"].eq(df["grok"])
df_gt = df.loc[df["gemini_grok_agree"]].copy()

print(f"Ground truth questions (Gemini == Grok): {len(df_gt)}/{len(df)} ({100*len(df_gt)/len(df):.1f}%)")

In [None]:
# Score Llama against the ground truth: a row is correct when Llama's answer
# equals Gemini's (which, within df_gt, is also Grok's).
llama_matches = df_gt["llama"].eq(df_gt["gemini"])
df_gt["llama_correct"] = llama_matches

# Mean of the boolean flags, i.e. the fraction of correct rows.
overall_acc = df_gt["llama_correct"].mean()
print(f"Llama 3.1 8B overall accuracy (vs GT): {100*overall_acc:.1f}%")

In [None]:
# Performance table by level x subject
# Subject names used to fix the column order of the per-subject tables.
SUBJECTS = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
]

def calc_accuracy(group):
    """Return the mean of the ``llama_correct`` column of ``group``.

    For a boolean column this is the fraction of rows flagged correct.
    Returns None when ``group`` has no rows.
    """
    return group["llama_correct"].mean() if len(group) else None

# Mean of llama_correct for every (level, subject) pair, reshaped so levels
# are rows and subjects are columns. The column is selected before
# aggregating, so the grouping keys never reach the aggregation: this avoids
# relying on GroupBy.apply handing grouping columns to a Python callback
# (deprecated in recent pandas) and computes the mean in one vectorized call.
pivot = (
    df_gt.groupby(["level", "subject"])["llama_correct"]
    .mean()
    .unstack(level="subject")
)
# Fix the column order; a subject with no rows in df_gt becomes an all-NaN column.
pivot = pivot.reindex(columns=SUBJECTS)

# Add row/column totals
pivot["TOTAL"] = df_gt.groupby("level")["llama_correct"].mean()
subject_totals = df_gt.groupby("subject")["llama_correct"].mean()
# The new row is assigned positionally, so it must follow the current column
# order: every entry of SUBJECTS, then "TOTAL".
pivot.loc["TOTAL"] = subject_totals.reindex(SUBJECTS).tolist() + [overall_acc]

# Format as percentages
pivot_pct = (pivot * 100).round(1)
pivot_pct

In [None]:
# Number of df_gt rows in each (level, subject) cell; empty cells count as 0.
cell_sizes = df_gt.groupby(["level", "subject"]).size()
counts = cell_sizes.unstack(level="subject", fill_value=0).reindex(
    columns=SUBJECTS, fill_value=0
)

# Margins: per-level totals as a column first, then a totals row that also
# sums that new column (giving the grand total in the corner).
counts["TOTAL"] = counts.sum(axis="columns")
counts.loc["TOTAL"] = counts.sum(axis="index")
counts

In [None]:
# Text summary: overall agreement numbers, then Llama accuracy per level.
n_gt = len(df_gt)
print("Model Agreement Summary")
print("="*40)
print(f"Total questions: {len(df)}")
print(f"Gemini-Grok agree (GT): {n_gt} ({100*n_gt/len(df):.1f}%)")
print(f"Llama accuracy vs GT: {100*overall_acc:.1f}%")
print()
print("Llama accuracy by level:")
for level in range(1, 6):
    at_level = df_gt["level"] == level
    n_level = int(at_level.sum())
    # Levels with no ground-truth rows are left out of the listing.
    if n_level == 0:
        continue
    level_acc = df_gt.loc[at_level, "llama_correct"].mean()
    print(f"  Level {level}: {100*level_acc:.1f}% (n={n_level})")