# Augmented MATH Analysis

Analyze agreement between the Generator (Grok 4.1 Fast), the Validator (Gemini 3 Flash), and Llama 3.1 8B on generated math questions.
A question's answer is treated as ground truth only when the Generator and the Validator agree; Llama 3.1 8B is then scored against that subset.

In [2]:
import json
import pandas as pd
from pathlib import Path

In [3]:
# Load evaluations - update path as needed.
# The analysis is pinned to one explicit file. To pick up the newest run instead, use
#     eval_path = sorted(Path("data").glob("evaluations_*.json"))[-1]
# (and raise FileNotFoundError if the glob is empty).
# Earlier run: data/evaluations_questions_20260121_140947_20260121_144453.json
eval_path = "data/evaluations_questions_fast_20260122_084427_20260122_085930.json"
print(f"Loading: {eval_path}")

# Pass the encoding explicitly: without it open() falls back to the platform's
# locale encoding, which is not guaranteed to be UTF-8.
with open(eval_path, encoding="utf-8") as f:
    evaluations = json.load(f)

print(f"Loaded {len(evaluations)} questions")

Loading: data/evaluations_questions_fast_20260122_084427_20260122_085930.json
Loaded 313 questions


In [4]:
# Build dataframe: one row per question with the answer recorded for each model
def _model_answer(item, model_key):
    """Return the "answer" stored under item["evaluations"][model_key].

    Returns None when the model's entry is absent or is stored as null, so a
    missing evaluation becomes a null cell instead of raising AttributeError.
    """
    # `.get(model_key, {})` only covers an absent key; `or {}` also covers a
    # key that is present with a null value.
    return (item["evaluations"].get(model_key) or {}).get("answer")


rows = [
    {
        "idx": item["idx"],
        "level": item["level"],
        "subject": item["subject"],
        "generator": item["generator_answer_idx"],
        "validator": _model_answer(item, "gemini_validator"),
        "llama": _model_answer(item, "llama"),
    }
    for item in evaluations
]

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,idx,level,subject,generator,validator,llama
0,0,1,algebra,5,5.0,5.0
1,1,1,algebra,6,6.0,9.0
2,2,1,algebra,2,2.0,2.0
3,3,1,algebra,8,8.0,
4,4,1,algebra,1,1.0,7.0


In [5]:
# Ground truth subset: rows where the Generator's answer equals the Validator's.
# A null validator answer never compares equal, so those rows are left out.
df["gt_agree"] = df["generator"].eq(df["validator"])
df_gt = df.loc[df["gt_agree"]].copy()

n_gt, n_all = len(df_gt), len(df)
print(f"Ground truth questions (Generator == Validator): {n_gt}/{n_all} ({100*n_gt/n_all:.1f}%)")

Ground truth questions (Generator == Validator): 257/313 (82.1%)


In [6]:
# Agreement breakdown by level x subject
FILTER_VALIDATOR_NONES = True  # Toggle: True = exclude rows where validator is null

SUBJECTS = [
    "algebra", "counting_and_probability", "geometry",
    "intermediate_algebra", "number_theory", "prealgebra", "precalculus"
]

df_agree = df[df["validator"].notna()].copy() if FILTER_VALIDATOR_NONES else df.copy()
n_excluded = len(df) - len(df_agree)

# Raw counts per (level, subject) cell. The agreeing-rows table is reindexed to the
# rows of the totals table so a level without a single agreement still shows 0.
total_counts = df_agree.groupby(["level", "subject"]).size().unstack(fill_value=0)
total_counts = total_counts.reindex(columns=SUBJECTS, fill_value=0)
agree_counts = df_agree[df_agree["gt_agree"]].groupby(["level", "subject"]).size().unstack(fill_value=0)
agree_counts = agree_counts.reindex(index=total_counts.index, columns=SUBJECTS, fill_value=0)


def _agreement_pct(keys):
    """Percentage of rows with gt_agree == True per group of `keys`, rounded to 1 decimal."""
    # The mean of a boolean column is the agreement rate. Unlike dividing two separate
    # groupby counts, a group with zero agreements yields 0.0 rather than NaN.
    return (df_agree.groupby(keys)["gt_agree"].mean() * 100).round(1)


# Cells with no questions stay NaN (no data) instead of being reported as 0% agreement.
agree_pct = _agreement_pct(["level", "subject"]).unstack().reindex(columns=SUBJECTS)

agree_pct["TOTAL"] = _agreement_pct("level")
overall_agree = df_agree["gt_agree"].mean()
# Round the margin row too so it matches the precision of the rest of the table.
agree_pct.loc["TOTAL"] = _agreement_pct("subject").reindex(SUBJECTS).tolist() + [round(100 * overall_agree, 1)]

print(f"Generator-Validator Agreement Rate (%) by Level x Subject (n={len(df_agree)}, {n_excluded} validator nulls {'excluded' if FILTER_VALIDATOR_NONES else 'included'}):")
display(agree_pct)
n_agree = df_agree["gt_agree"].sum()
print(f"{n_agree} of {len(df_agree)} agree ({100*n_agree/len(df_agree):.1f}%), keeping {n_agree} as ground truth")

Generator-Validator Agreement Rate (%) by Level x Subject (n=294, 19 validator nulls excluded):


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,88.9,100.0,66.7,100.0,100.0,80.0,55.6,83.9
2,87.5,100.0,85.7,87.5,88.9,90.0,87.5,90.0
3,100.0,80.0,60.0,87.5,100.0,100.0,100.0,89.2
4,87.5,90.0,87.5,66.7,87.5,90.0,100.0,87.9
5,75.0,100.0,83.3,100.0,87.5,88.9,66.7,85.7
TOTAL,88.095238,93.181818,75.0,88.888889,93.181818,89.583333,82.5,87.414966


257 of 294 agree (87.4%), keeping 257 as ground truth


In [7]:
# Score Llama against the ground-truth subset
FILTER_LLAMA_NONES = True  # Toggle: True = exclude Nones, False = treat Nones as incorrect

if FILTER_LLAMA_NONES:
    df_eval = df_gt.dropna(subset=["llama"]).copy()
else:
    df_eval = df_gt.copy()

# A null Llama answer never equals the generator's answer, so when Nones are kept
# they are counted as incorrect.
df_eval["llama_correct"] = df_eval["llama"].eq(df_eval["generator"])

overall_acc = df_eval["llama_correct"].mean()
n_excluded = len(df_gt) - len(df_eval)
none_policy = "excluded" if FILTER_LLAMA_NONES else "treated as incorrect"
print(f"Llama 3.1 8B overall accuracy (vs GT): {100*overall_acc:.1f}% (n={len(df_eval)}, {n_excluded} Nones {none_policy})")

Llama 3.1 8B overall accuracy (vs GT): 49.3% (n=209, 48 Nones excluded)


In [9]:
# Null counts by level x subject
llama_null = df_gt[df_gt["llama"].isna()]
null_counts = llama_null.groupby(["level", "subject"]).size().unstack(level="subject", fill_value=0)
# Reindex rows as well as columns: a level (or subject) without any Llama nulls
# forms no group, so it would otherwise be missing from the table instead of showing 0.
null_counts = null_counts.reindex(index=sorted(df_gt["level"].unique()), columns=SUBJECTS, fill_value=0)
null_counts["TOTAL"] = null_counts.sum(axis=1)
null_counts.loc["TOTAL"] = null_counts.sum(axis=0)
print("Llama null counts by level x subject:")
display(null_counts)

Llama null counts by level x subject:


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2,0,2,0,1,0,1,6
2,0,2,4,1,1,0,1,9
3,1,1,2,2,0,1,2,9
4,0,3,3,1,1,2,4,14
5,1,1,1,1,3,2,1,10
TOTAL,4,7,12,5,6,5,9,48


In [1]:
# Performance table by level x subject
def calc_accuracy(group):
    """Return the mean of the group's `llama_correct` column, or None for an empty group."""
    return group["llama_correct"].mean() if len(group) else None

# Accuracy per (level, subject) cell: levels as rows, subjects as columns in SUBJECTS order.
# Uses df_eval and overall_acc, which the Llama accuracy cell defines.
cell_acc = df_eval.groupby(["level", "subject"]).apply(calc_accuracy, include_groups=False)
pivot = cell_acc.unstack(level="subject").reindex(columns=SUBJECTS)

# Margins: per-level accuracy as the last column, per-subject accuracy as the last row,
# and the overall accuracy in the bottom-right corner.
level_totals = df_eval.groupby("level").apply(calc_accuracy, include_groups=False)
subject_totals = df_eval.groupby("subject").apply(calc_accuracy, include_groups=False)
pivot["TOTAL"] = level_totals
pivot.loc["TOTAL"] = [*subject_totals.reindex(SUBJECTS).tolist(), overall_acc]

pivot_pct = pivot.mul(100).round(1)
pivot_pct

NameError: name 'df_eval' is not defined

In [12]:
# Number of evaluated questions behind each cell of the accuracy table
counts = (
    df_eval.groupby(["level", "subject"])
    .size()
    .unstack(level="subject", fill_value=0)
    .reindex(columns=SUBJECTS, fill_value=0)
)
# Row totals first, then column totals, so the corner cell holds the grand total.
counts = counts.assign(TOTAL=counts.sum(axis=1))
counts.loc["TOTAL"] = counts.sum(axis=0)
counts

subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,6,8,4,8,8,8,4,46
2,7,8,2,6,7,9,6,45
3,8,7,4,5,10,8,7,49
4,7,6,4,3,6,7,4,37
5,5,5,4,5,4,6,3,32
TOTAL,33,34,18,27,35,38,24,209


In [14]:
# Summary stats: headline numbers first, then Llama accuracy per level
summary_lines = [
    "Model Agreement Summary",
    "="*40,
    f"Total questions: {len(df)}",
    f"Generator-Validator agree (GT): {len(df_gt)} ({100*len(df_gt)/len(df):.1f}%)",
    f"Llama accuracy vs GT: {100*overall_acc:.1f}% (n={len(df_eval)})",
    "",
    "Llama accuracy by level:",
]
print("\n".join(summary_lines))

for level in range(1, 6):
    correct_at_level = df_eval.loc[df_eval["level"] == level, "llama_correct"]
    if correct_at_level.empty:
        # No evaluated questions at this level: print nothing rather than a NaN accuracy.
        continue
    print(f"  Level {level}: {100*correct_at_level.mean():.1f}% (n={len(correct_at_level)})")

Model Agreement Summary
Total questions: 313
Generator-Validator agree (GT): 257 (82.1%)
Llama accuracy vs GT: 49.3% (n=209)

Llama accuracy by level:
  Level 1: 56.5% (n=46)
  Level 2: 66.7% (n=45)
  Level 3: 53.1% (n=49)
  Level 4: 40.5% (n=37)
  Level 5: 18.8% (n=32)
