# Augmented Math Analysis

Analyze the augmented math pipeline results:
1. Yield funnel (filtering at each stage)
2. Generator-Validator agreement
3. Llama judge performance
4. Cost analysis (per 1000 usable samples)

In [45]:
import json
import pandas as pd
from pathlib import Path

SUBJECTS = [
    "algebra", "counting_and_probability", "geometry",
    "intermediate_algebra", "number_theory", "prealgebra", "precalculus"
]

In [66]:
# Load data - update paths as needed
augmented_path = "data/augmented_math_20260122_124732.json"  # TODO: update
llama_path = "data/llama_binary_20260122_134633.json" # binary
# llama_path = "data/llama_ten_20260122_134719.json" # ten-way

with open(augmented_path) as f:
    augmented_data = json.load(f)
with open(llama_path) as f:
    llama_data = json.load(f)

questions = augmented_data["questions"]
aug_meta = augmented_data["metadata"]
llama_results = llama_data["results"]
llama_meta = llama_data["metadata"]

print(f"Augmented: {len(questions)} questions")
print(f"Llama: {len(llama_results)} evaluated")

Augmented: 739 questions
Llama: 616 evaluated


In [67]:
# Build dataframes
df_aug = pd.DataFrame([{
    "level": q["level"],
    "subject": q["subject"],
    "generator_idx": q["generator_idx"],
    "validator_idx": q["validator_idx"],
    "ground_truth": q["ground_truth"]
} for q in questions])

# Use existing is_correct from file (already computed correctly)
df_llama = pd.DataFrame([{
    "level": r["level"],
    "subject": r["subject"],
    "llama_answer": r.get("llama_answer"),
    "gt_idx": r.get("gt_idx"),
    "is_correct": r.get("is_correct"),
    "has_error": "error" in r
} for r in llama_results])

print(f"Llama results: {len(df_llama)} total, {df_llama['is_correct'].notna().sum()} with valid is_correct")

Llama results: 616 total, 423 with valid is_correct


## 1. Yield Funnel

In [68]:
# Calculate funnel stages
n_attempted = aug_meta["total_generated"] + aug_meta.get("total_failed", 0)
n_generated = aug_meta["total_generated"]
n_gt_valid = sum(1 for q in questions if q["ground_truth"] is not None)
n_llama_no_error = sum(1 for r in llama_results if "error" not in r)
n_llama_valid = sum(1 for r in llama_results if r.get("is_correct") is not None)

funnel = [
    ("Generation attempts", n_attempted, 100.0),
    ("Well-formatted questions", n_generated, 100 * n_generated / n_attempted if n_attempted else 0),
    ("GT valid (gen-val agree)", n_gt_valid, 100 * n_gt_valid / n_attempted if n_attempted else 0),
    ("Llama API success", n_llama_no_error, 100 * n_llama_no_error / n_attempted if n_attempted else 0),
    ("Llama parsed (usable)", n_llama_valid, 100 * n_llama_valid / n_attempted if n_attempted else 0),
]

print("Yield Funnel")
print("=" * 55)
for stage, count, pct in funnel:
    bar = "#" * int(pct / 2)
    print(f"{stage:28} {count:6} ({pct:5.1f}%) {bar}")
print()
print(f"Final yield: {n_llama_valid}/{n_attempted} = {100*n_llama_valid/n_attempted:.1f}%")

Yield Funnel
Generation attempts             875 (100.0%) ##################################################
Well-formatted questions        739 ( 84.5%) ##########################################
GT valid (gen-val agree)        616 ( 70.4%) ###################################
Llama API success               519 ( 59.3%) #############################
Llama parsed (usable)           423 ( 48.3%) ########################

Final yield: 423/875 = 48.3%


In [69]:
# Yield counts by level x subject - shows what we wind up with
n_gen_pivot = df_aug.groupby(["level", "subject"]).size().unstack(fill_value=0).reindex(columns=SUBJECTS, fill_value=0)
n_usable_pivot = df_llama[df_llama["is_correct"].notna()].groupby(["level", "subject"]).size().unstack(fill_value=0).reindex(columns=SUBJECTS, fill_value=0)

n_gen_pivot["TOTAL"] = n_gen_pivot.sum(axis=1)
n_gen_pivot.loc["TOTAL"] = n_gen_pivot.sum(axis=0)
n_usable_pivot["TOTAL"] = n_usable_pivot.sum(axis=1)
n_usable_pivot.loc["TOTAL"] = n_usable_pivot.sum(axis=0)

print("Generated Counts by Level x Subject:")
display(n_gen_pivot)
print("\nUsable Counts by Level x Subject (after all filtering):")
display(n_usable_pivot)

# Yield % by level x subject
yield_pivot = (n_usable_pivot / n_gen_pivot * 100).round(1)
print("\nYield (%) by Level x Subject:")
display(yield_pivot)

Generated Counts by Level x Subject:


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,24,24,21,23,25,24,23,164
2,25,23,21,21,22,25,21,158
3,23,22,22,18,21,24,21,151
4,21,23,22,15,19,23,18,141
5,17,19,16,10,23,22,18,125
TOTAL,110,111,102,87,110,118,101,739



Usable Counts by Level x Subject (after all filtering):


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,18,18,12,19,16,17,16,116
2,17,14,9,14,14,18,15,101
3,12,12,11,7,12,15,9,78
4,11,9,11,7,8,12,12,70
5,10,10,4,3,12,11,8,58
TOTAL,68,63,47,50,62,73,60,423



Yield (%) by Level x Subject:


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,75.0,75.0,57.1,82.6,64.0,70.8,69.6,70.7
2,68.0,60.9,42.9,66.7,63.6,72.0,71.4,63.9
3,52.2,54.5,50.0,38.9,57.1,62.5,42.9,51.7
4,52.4,39.1,50.0,46.7,42.1,52.2,66.7,49.6
5,58.8,52.6,25.0,30.0,52.2,50.0,44.4,46.4
TOTAL,61.8,56.8,46.1,57.5,56.4,61.9,59.4,57.2


## 2. Llama Judge Accuracy by Level x Subject

In [70]:
df_valid = df_llama[df_llama["is_correct"].notna()].copy()
df_all = df_llama[~df_llama["has_error"]].copy()
df_all["is_correct_incl_null"] = df_all["is_correct"].fillna(False)

def calc_accuracy(group):
    return group["is_correct"].mean() if len(group) > 0 else None

def calc_accuracy_incl_null(group):
    return group["is_correct_incl_null"].mean() if len(group) > 0 else None

# Accuracy excluding nulls
acc_pivot = df_valid.groupby(["level", "subject"]).apply(calc_accuracy, include_groups=False).unstack(level="subject")
acc_pivot = acc_pivot.reindex(columns=SUBJECTS)
acc_pivot["TOTAL"] = df_valid.groupby("level").apply(calc_accuracy, include_groups=False)
subject_totals = df_valid.groupby("subject").apply(calc_accuracy, include_groups=False)
acc_pivot.loc["TOTAL"] = subject_totals.reindex(SUBJECTS).tolist() + [df_valid["is_correct"].mean()]

# Accuracy including nulls (nulls treated as incorrect)
acc_incl_pivot = df_all.groupby(["level", "subject"]).apply(calc_accuracy_incl_null, include_groups=False).unstack(level="subject")
acc_incl_pivot = acc_incl_pivot.reindex(columns=SUBJECTS)
acc_incl_pivot["TOTAL"] = df_all.groupby("level").apply(calc_accuracy_incl_null, include_groups=False)
subject_totals_incl = df_all.groupby("subject").apply(calc_accuracy_incl_null, include_groups=False)
acc_incl_pivot.loc["TOTAL"] = subject_totals_incl.reindex(SUBJECTS).tolist() + [df_all["is_correct_incl_null"].mean()]

print("Llama Judge Accuracy (%) - excluding nulls:")
display((acc_pivot * 100).round(1))
print(f"\nLlama Judge Accuracy (%) - including nulls as incorrect:")
display((acc_incl_pivot * 100).round(1))

Llama Judge Accuracy (%) - excluding nulls:


  df_all["is_correct_incl_null"] = df_all["is_correct"].fillna(False)


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,77.8,66.7,83.3,68.4,43.8,70.6,68.8,68.1
2,70.6,78.6,88.9,71.4,42.9,77.8,60.0,69.3
3,75.0,66.7,81.8,57.1,50.0,66.7,66.7,66.7
4,63.6,44.4,72.7,100.0,62.5,91.7,58.3,70.0
5,60.0,80.0,100.0,33.3,58.3,63.6,87.5,69.0
TOTAL,70.6,68.3,83.0,70.0,50.0,74.0,66.7,68.6



Llama Judge Accuracy (%) - including nulls as incorrect:


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,63.6,63.2,71.4,65.0,31.8,60.0,55.0,57.7
2,52.2,64.7,72.7,58.8,33.3,60.9,56.2,56.0
3,47.4,61.5,60.0,40.0,37.5,58.8,54.5,51.5
4,50.0,36.4,66.7,87.5,45.5,61.1,53.8,56.3
5,54.5,72.7,80.0,20.0,46.7,58.3,70.0,58.0
TOTAL,53.9,60.6,68.4,58.3,37.8,60.0,57.1,55.9


## 3. Llama Null % by Level x Subject

In [71]:
df_llama_no_err = df_llama[~df_llama["has_error"]].copy()
df_llama_no_err["is_null"] = df_llama_no_err["llama_answer"].isna()

def calc_null_rate(group):
    return group["is_null"].mean() if len(group) > 0 else None

null_pivot = df_llama_no_err.groupby(["level", "subject"]).apply(calc_null_rate, include_groups=False).unstack(level="subject")
null_pivot = null_pivot.reindex(columns=SUBJECTS)
null_pivot["TOTAL"] = df_llama_no_err.groupby("level").apply(calc_null_rate, include_groups=False)
subject_null = df_llama_no_err.groupby("subject").apply(calc_null_rate, include_groups=False)
null_pivot.loc["TOTAL"] = subject_null.reindex(SUBJECTS).tolist() + [df_llama_no_err["is_null"].mean()]

print("Llama Null Rate (%) by Level x Subject (API successes only):")
display((null_pivot * 100).round(1))

Llama Null Rate (%) by Level x Subject (API successes only):


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,18.2,5.3,14.3,5.0,27.3,15.0,20.0,15.3
2,26.1,17.6,18.2,17.6,22.2,21.7,6.2,19.2
3,36.8,7.7,26.7,30.0,25.0,11.8,18.2,22.8
4,21.4,18.2,8.3,12.5,27.3,33.3,7.7,19.5
5,9.1,9.1,20.0,40.0,20.0,8.3,20.0,15.9
TOTAL,23.6,11.3,17.5,16.7,24.4,18.9,14.3,18.5


## 4. Generator-Validator Agreement by Level x Subject

In [72]:
df_aug["agrees"] = (df_aug["generator_idx"] == df_aug["validator_idx"]) & df_aug["validator_idx"].notna()

def calc_agree_rate(group):
    return group["agrees"].mean() if len(group) > 0 else None

agree_pivot = df_aug.groupby(["level", "subject"]).apply(calc_agree_rate, include_groups=False).unstack(level="subject")
agree_pivot = agree_pivot.reindex(columns=SUBJECTS)
agree_pivot["TOTAL"] = df_aug.groupby("level").apply(calc_agree_rate, include_groups=False)
subject_agree = df_aug.groupby("subject").apply(calc_agree_rate, include_groups=False)
agree_pivot.loc["TOTAL"] = subject_agree.reindex(SUBJECTS).tolist() + [df_aug["agrees"].mean()]

print("Generator-Validator Agreement (%) by Level x Subject:")
display((agree_pivot * 100).round(1))

Generator-Validator Agreement (%) by Level x Subject:


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,91.7,87.5,81.0,95.7,96.0,83.3,95.7,90.2
2,100.0,87.0,71.4,85.7,90.9,92.0,90.5,88.6
3,95.7,68.2,86.4,77.8,95.2,91.7,66.7,83.4
4,81.0,82.6,72.7,66.7,78.9,78.3,88.9,78.7
5,88.2,84.2,62.5,60.0,73.9,63.6,72.2,72.8
TOTAL,91.8,82.0,75.5,80.5,87.3,82.2,83.2,83.4


## 5. GT Null % by Level x Subject

In [10]:
df_aug["gt_null"] = df_aug["ground_truth"].isna()

def calc_gt_null_rate(group):
    return group["gt_null"].mean() if len(group) > 0 else None

gt_null_pivot = df_aug.groupby(["level", "subject"]).apply(calc_gt_null_rate, include_groups=False).unstack(level="subject")
gt_null_pivot = gt_null_pivot.reindex(columns=SUBJECTS)
gt_null_pivot["TOTAL"] = df_aug.groupby("level").apply(calc_gt_null_rate, include_groups=False)
subject_gt_null = df_aug.groupby("subject").apply(calc_gt_null_rate, include_groups=False)
gt_null_pivot.loc["TOTAL"] = subject_gt_null.reindex(SUBJECTS).tolist() + [df_aug["gt_null"].mean()]

print("GT Null Rate (%) by Level x Subject:")
display((gt_null_pivot * 100).round(1))

GT Null Rate (%) by Level x Subject:


subject,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4.2,4.2,21.7,12.5,8.0,0.0,12.5,9.0
2,8.7,8.0,15.8,10.0,13.6,8.3,15.0,11.1
3,8.7,36.4,5.0,10.5,12.5,8.7,42.9,16.6
4,0.0,16.7,18.2,11.8,16.7,19.0,33.3,16.3
5,11.8,42.1,7.7,46.2,34.8,20.8,18.8,26.4
TOTAL,6.5,20.2,14.4,16.1,17.0,11.4,22.8,15.3


## 6. Cost Analysis

In [75]:
# Prices per 1M tokens (from OpenRouter, Jan 2026)
PRICES = {
    "x-ai/grok-4.1-fast": {"input": 0.20, "output": 0.50},
    "deepseek/deepseek-v3.2": {"input": 0.25, "output": 0.38},
    "meta-llama/llama-3.1-8b-instruct": {"input": 0.02, "output": 0.03},
}

gen_model = aug_meta.get("generator_model", "x-ai/grok-4.1-fast")
val_model = aug_meta.get("validator_model", "deepseek/deepseek-v3.2")
judge_model = llama_meta.get("model", "meta-llama/llama-3.1-8b-instruct")

gen_usage = aug_meta.get("generator_usage", {})
val_usage = aug_meta.get("validator_usage", {})

def calc_cost(usage, model):
    prices = PRICES.get(model, {"input": 1.0, "output": 1.0})
    input_cost = usage.get("prompt_tokens", 0) / 1e6 * prices["input"]
    output_cost = usage.get("completion_tokens", 0) / 1e6 * prices["output"]
    return input_cost + output_cost

gen_cost = calc_cost(gen_usage, gen_model)
val_cost = calc_cost(val_usage, val_model)
judge_cost = llama_meta.get("total_cost", 0)
total_cost = gen_cost + val_cost + judge_cost

cost_df = pd.DataFrame([
    {"Stage": f"Generator ({gen_model.split('/')[-1]})", "Cost": gen_cost, "Per 1k Gen": gen_cost/n_generated*1000, "Per 1k Usable": gen_cost/n_llama_valid*1000},
    {"Stage": f"Validator ({val_model.split('/')[-1]})", "Cost": val_cost, "Per 1k Gen": val_cost/n_generated*1000, "Per 1k Usable": val_cost/n_llama_valid*1000},
    {"Stage": f"Judge ({judge_model.split('/')[-1]})", "Cost": judge_cost, "Per 1k Gen": judge_cost/n_generated*1000, "Per 1k Usable": judge_cost/n_llama_valid*1000},
    {"Stage": "TOTAL", "Cost": total_cost, "Per 1k Gen": total_cost/n_generated*1000, "Per 1k Usable": total_cost/n_llama_valid*1000},
]).set_index("Stage")

print("Cost Breakdown by Stage:")
display(cost_df.round(4))

Cost Breakdown by Stage:


Error: 

In [32]:
# Cost per 1000 usable by level x subject
# Estimate proportionally based on number of generated questions per cell
cost_per_gen = total_cost / n_generated if n_generated > 0 else 0

n_per_cell = df_aug.groupby(["level", "subject"]).size().unstack(fill_value=0)
n_per_cell = n_per_cell.reindex(columns=SUBJECTS, fill_value=0)

n_usable_per_cell = df_llama[df_llama["is_correct"].notna()].groupby(["level", "subject"]).size().unstack(fill_value=0)
n_usable_per_cell = n_usable_per_cell.reindex(columns=SUBJECTS, fill_value=0)

cost_rows = []
for level in sorted(df_aug["level"].unique()):
    row = {"level": level}
    for subject in SUBJECTS:
        n_gen = n_per_cell.loc[level, subject] if level in n_per_cell.index else 0
        n_usable = n_usable_per_cell.loc[level, subject] if level in n_usable_per_cell.index else 0
        cell_cost = n_gen * cost_per_gen
        row[subject] = cell_cost / n_usable * 1000 if n_usable > 0 else None
    cost_rows.append(row)

cost_pivot = pd.DataFrame(cost_rows).set_index("level")
cost_pivot["TOTAL"] = cost_pivot.mean(axis=1)
cost_pivot.loc["TOTAL"] = cost_pivot.mean(axis=0)

print("Cost per 1000 Usable Samples ($) by Level x Subject:")
display(cost_pivot.round(2))

Cost per 1000 Usable Samples ($) by Level x Subject:


Unnamed: 0_level_0,algebra,counting_and_probability,geometry,intermediate_algebra,number_theory,prealgebra,precalculus,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1.95,1.71,1.71,1.54,2.37,2.1,1.38,1.82
2,1.58,1.54,1.71,2.65,2.27,1.78,1.71,1.89
3,1.74,2.27,1.92,2.56,1.84,1.82,3.41,2.22
4,1.99,2.61,2.5,3.41,1.96,1.63,1.86,2.28
5,1.76,2.7,3.64,2.27,2.61,2.08,2.92,2.57
TOTAL,1.8,2.17,2.29,2.49,2.21,1.88,2.25,2.16


## 7. Summary

In [33]:
print("Summary")
print("=" * 50)
print(f"\nYield:")
print(f"  Attempted:     {n_attempted}")
print(f"  Generated:     {n_generated} ({100*n_generated/n_attempted:.1f}%)")
print(f"  GT valid:      {n_gt_valid} ({100*n_gt_valid/n_attempted:.1f}%)")
print(f"  Usable:        {n_llama_valid} ({100*n_llama_valid/n_attempted:.1f}%)")
print(f"\nRates:")
print(f"  Gen-Val agreement: {100 * df_aug['agrees'].mean():.1f}%")
print(f"  Llama accuracy:    {100 * df_valid['is_correct'].mean():.1f}%")
print(f"\nCost:")
print(f"  Total spent:           ${total_cost:.4f}")
print(f"  Per 1000 usable:       ${cost_per_1k_usable:.2f}")
print()
print("Next step: Choose a level x subject with:")
print("  - High yield")
print("  - Good accuracy gradient across levels")
print("  - Reasonable cost per usable sample")

Summary

Yield:
  Attempted:     875
  Generated:     739 (84.5%)
  GT valid:      616 (70.4%)
  Usable:        508 (58.1%)

Rates:


Error: 