# Augmented Math Analysis

Analyze the augmented math pipeline results:
1. Yield funnel (filtering at each stage)
2. Generator-Validator agreement
3. Llama judge performance
4. Cost analysis (per 1000 usable samples)

In [96]:
import json
import pandas as pd
from pathlib import Path

SUBJECTS = [
    "algebra", "counting_and_probability", "geometry",
    "intermediate_algebra", "number_theory", "prealgebra", "precalculus"
]

In [108]:
# Load data - update paths as needed
augmented_path = "data/augmented_math_20260123_091305.json"  # question path
# llama_path = "data/llama_binary_20260123_094608.json" # binary
llama_path = "data/llama_ten_20260123_094606.json" # ten-way

with open(augmented_path) as f:
    augmented_data = json.load(f)
with open(llama_path) as f:
    llama_data = json.load(f)

questions = augmented_data["questions"]
aug_meta = augmented_data["metadata"]
llama_results = llama_data["results"]
llama_meta = llama_data["metadata"]

print(f"Augmented: {len(questions)} questions")
print(f"Llama: {len(llama_results)} evaluated")

Augmented: 5000 questions
Llama: 3093 evaluated


In [109]:
# Build dataframes
df_aug = pd.DataFrame([{
    "level": q["level"],
    "subject": q["subject"],
    "generator_idx": q["generator_idx"],
    "validator_idx": q["validator_idx"],
    "ground_truth": q["ground_truth"]
} for q in questions])

df_llama = pd.DataFrame([{
    "level": r["level"],
    "subject": r["subject"],
    "llama_answer": r.get("llama_answer"),
    "gt_idx": r.get("gt_idx"),
    "is_correct": r.get("is_correct"),
    "has_error": "error" in r
} for r in llama_results])

# Filter to only subjects/levels with data (for cleaner pivot tables)
SUBJECTS = [s for s in SUBJECTS if s in df_aug["subject"].unique()]
LEVELS = sorted(df_aug["level"].dropna().unique())

print(f"Llama results: {len(df_llama)} total, {df_llama['is_correct'].notna().sum()} with valid is_correct")
print(f"Subjects: {SUBJECTS}, Levels: {LEVELS}")

Llama results: 3093 total, 2636 with valid is_correct
Subjects: ['number_theory'], Levels: [np.int64(1), np.int64(2), np.int64(3)]


## 1. Yield Funnel

In [110]:
# Calculate funnel stages (using new metadata keys)
n_attempted = aug_meta["total_questions"]
n_generation_ok = aug_meta["generation_ok_count"]
n_gt_assigned = aug_meta["gt_assigned_count"]
n_valid = aug_meta["valid_count"]
n_llama_no_error = sum(1 for r in llama_results if "error" not in r)
n_llama_valid = sum(1 for r in llama_results if r.get("is_correct") is not None)
n_generated = n_generation_ok  # alias for cost calculations

funnel = [
    ("Generation attempts", n_attempted, 100.0),
    ("Generation OK", n_generation_ok, 100 * n_generation_ok / n_attempted if n_attempted else 0),
    ("GT assigned (gen-val agree)", n_gt_assigned, 100 * n_gt_assigned / n_attempted if n_attempted else 0),
    ("Valid (GT, no dup)", n_valid, 100 * n_valid / n_attempted if n_attempted else 0),
    ("Llama API success", n_llama_no_error, 100 * n_llama_no_error / n_attempted if n_attempted else 0),
    ("Llama parsed (usable)", n_llama_valid, 100 * n_llama_valid / n_attempted if n_attempted else 0),
]

print("Yield Funnel")
print("=" * 55)
for stage, count, pct in funnel:
    bar = "#" * int(pct / 2)
    print(f"{stage:28} {count:6} ({pct:5.1f}%) {bar}")
print()
print(f"Final yield: {n_llama_valid}/{n_attempted} = {100*n_llama_valid/n_attempted:.1f}%")

Yield Funnel
Generation attempts            5000 (100.0%) ##################################################
Generation OK                  4521 ( 90.4%) #############################################
GT assigned (gen-val agree)    3948 ( 79.0%) #######################################
Valid (GT, no dup)             3093 ( 61.9%) ##############################
Llama API success              2878 ( 57.6%) ############################
Llama parsed (usable)          2636 ( 52.7%) ##########################

Final yield: 2636/5000 = 52.7%


In [111]:
# Yield counts by level x subject - shows what we wind up with
n_gen_pivot = df_aug.groupby(["level", "subject"]).size().unstack(fill_value=0).reindex(columns=SUBJECTS, fill_value=0)
n_usable_pivot = df_llama[df_llama["is_correct"].notna()].groupby(["level", "subject"]).size().unstack(fill_value=0).reindex(columns=SUBJECTS, fill_value=0)

n_gen_pivot["TOTAL"] = n_gen_pivot.sum(axis=1)
n_gen_pivot.loc["TOTAL"] = n_gen_pivot.sum(axis=0)
n_usable_pivot["TOTAL"] = n_usable_pivot.sum(axis=1)
n_usable_pivot.loc["TOTAL"] = n_usable_pivot.sum(axis=0)

print("Generated Counts by Level x Subject:")
display(n_gen_pivot)
print("\nUsable Counts by Level x Subject (after all filtering):")
display(n_usable_pivot)

# Yield % by level x subject
yield_pivot = (n_usable_pivot / n_gen_pivot * 100).round(1)
print("\nYield (%) by Level x Subject:")
display(yield_pivot)

Generated Counts by Level x Subject:


subject,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,572,572
2,1842,1842
3,2586,2586
TOTAL,5000,5000



Usable Counts by Level x Subject (after all filtering):


subject,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,339,339
2,985,985
3,1312,1312
TOTAL,2636,2636



Yield (%) by Level x Subject:


subject,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,59.3,59.3
2,53.5,53.5
3,50.7,50.7
TOTAL,52.7,52.7


## 2. Llama Judge Accuracy by Level x Subject

In [116]:
df_valid = df_llama[df_llama["is_correct"].notna()].copy()
df_all = df_llama[~df_llama["has_error"]].copy()
df_all["is_correct_incl_null"] = df_all["is_correct"].fillna(False)

def calc_accuracy(group):
    return group["is_correct"].mean() if len(group) > 0 else None

def calc_accuracy_incl_null(group):
    return group["is_correct_incl_null"].mean() if len(group) > 0 else None

# Accuracy excluding nulls
acc_pivot = df_valid.groupby(["level", "subject"]).apply(calc_accuracy, include_groups=False).unstack(level="subject")
acc_pivot = acc_pivot.reindex(columns=SUBJECTS)
acc_pivot["TOTAL"] = df_valid.groupby("level").apply(calc_accuracy, include_groups=False)
subject_totals = df_valid.groupby("subject").apply(calc_accuracy, include_groups=False)
acc_pivot.loc["TOTAL"] = subject_totals.reindex(SUBJECTS).tolist() + [df_valid["is_correct"].mean()]

# Accuracy including nulls (nulls treated as incorrect)
acc_incl_pivot = df_all.groupby(["level", "subject"]).apply(calc_accuracy_incl_null, include_groups=False).unstack(level="subject")
acc_incl_pivot = acc_incl_pivot.reindex(columns=SUBJECTS)
acc_incl_pivot["TOTAL"] = df_all.groupby("level").apply(calc_accuracy_incl_null, include_groups=False)
subject_totals_incl = df_all.groupby("subject").apply(calc_accuracy_incl_null, include_groups=False)
acc_incl_pivot.loc["TOTAL"] = subject_totals_incl.reindex(SUBJECTS).tolist() + [df_all["is_correct_incl_null"].mean()]

print("Llama Judge Accuracy (%) - excluding nulls:")
display((acc_pivot * 100).round(1))
print(f"\nLlama Judge Accuracy (%) - including nulls as incorrect:")
display((acc_incl_pivot * 100).round(1))

Llama Judge Accuracy (%) - excluding nulls:


  df_all["is_correct_incl_null"] = df_all["is_correct"].fillna(False)


subject,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,45.1,45.1
2,48.5,48.5
3,43.4,43.4
TOTAL,45.5,45.5



Llama Judge Accuracy (%) - including nulls as incorrect:


subject,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,44.6,44.6
2,44.5,44.5
3,39.0,39.0
TOTAL,41.7,41.7


## 3. Llama Null % by Level x Subject

In [113]:
df_llama_no_err = df_llama[~df_llama["has_error"]].copy()
df_llama_no_err["is_null"] = df_llama_no_err["llama_answer"].isna()

def calc_null_rate(group):
    return group["is_null"].mean() if len(group) > 0 else None

null_pivot = df_llama_no_err.groupby(["level", "subject"]).apply(calc_null_rate, include_groups=False).unstack(level="subject")
null_pivot = null_pivot.reindex(columns=SUBJECTS)
null_pivot["TOTAL"] = df_llama_no_err.groupby("level").apply(calc_null_rate, include_groups=False)
subject_null = df_llama_no_err.groupby("subject").apply(calc_null_rate, include_groups=False)
null_pivot.loc["TOTAL"] = subject_null.reindex(SUBJECTS).tolist() + [df_llama_no_err["is_null"].mean()]

print("Llama Null Rate (%) by Level x Subject (API successes only):")
display((null_pivot * 100).round(1))

Llama Null Rate (%) by Level x Subject (API successes only):


subject,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.2,1.2
2,8.4,8.4
3,10.1,10.1
TOTAL,8.4,8.4


## 4. Generator-Validator Agreement by Level x Subject

In [114]:
df_aug["agrees"] = (df_aug["generator_idx"] == df_aug["validator_idx"]) & df_aug["validator_idx"].notna()

def calc_agree_rate(group):
    return group["agrees"].mean() if len(group) > 0 else None

agree_pivot = df_aug.groupby(["level", "subject"]).apply(calc_agree_rate, include_groups=False).unstack(level="subject")
agree_pivot = agree_pivot.reindex(columns=SUBJECTS)
agree_pivot["TOTAL"] = df_aug.groupby("level").apply(calc_agree_rate, include_groups=False)
subject_agree = df_aug.groupby("subject").apply(calc_agree_rate, include_groups=False)
agree_pivot.loc["TOTAL"] = subject_agree.reindex(SUBJECTS).tolist() + [df_aug["agrees"].mean()]

print("Generator-Validator Agreement (%) by Level x Subject:")
display((agree_pivot * 100).round(1))

Generator-Validator Agreement (%) by Level x Subject:


subject,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,85.0,85.0
2,80.3,80.3
3,76.6,76.6
TOTAL,79.0,79.0


## 5. GT Null % by Level x Subject

In [115]:
df_aug["gt_null"] = df_aug["ground_truth"].isna()

def calc_gt_null_rate(group):
    return group["gt_null"].mean() if len(group) > 0 else None

gt_null_pivot = df_aug.groupby(["level", "subject"]).apply(calc_gt_null_rate, include_groups=False).unstack(level="subject")
gt_null_pivot = gt_null_pivot.reindex(columns=SUBJECTS)
gt_null_pivot["TOTAL"] = df_aug.groupby("level").apply(calc_gt_null_rate, include_groups=False)
subject_gt_null = df_aug.groupby("subject").apply(calc_gt_null_rate, include_groups=False)
gt_null_pivot.loc["TOTAL"] = subject_gt_null.reindex(SUBJECTS).tolist() + [df_aug["gt_null"].mean()]

print("GT Null Rate (%) by Level x Subject:")
display((gt_null_pivot * 100).round(1))

GT Null Rate (%) by Level x Subject:


subject,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,15.0,15.0
2,19.7,19.7
3,23.4,23.4
TOTAL,21.0,21.0


## 6. Cost Analysis

In [105]:
# Prices per 1M tokens (from OpenRouter, Jan 2026)
PRICES = {
    "x-ai/grok-4.1-fast": {"input": 0.20, "output": 0.50},
    "deepseek/deepseek-v3.2": {"input": 0.25, "output": 0.38},
    "meta-llama/llama-3.1-8b-instruct": {"input": 0.02, "output": 0.03},
}

gen_model = aug_meta.get("generator_model", "x-ai/grok-4.1-fast")
val_model = aug_meta.get("validator_model", "deepseek/deepseek-v3.2")
judge_model = llama_meta.get("model", "meta-llama/llama-3.1-8b-instruct")

gen_usage = aug_meta.get("generator_usage", {})
val_usage = aug_meta.get("validator_usage", {})

def calc_cost(usage, model):
    prices = PRICES.get(model, {"input": 1.0, "output": 1.0})
    input_cost = usage.get("prompt_tokens", 0) / 1e6 * prices["input"]
    output_cost = usage.get("completion_tokens", 0) / 1e6 * prices["output"]
    return input_cost + output_cost

gen_cost = calc_cost(gen_usage, gen_model)
val_cost = calc_cost(val_usage, val_model)
judge_cost = llama_meta.get("total_cost", 0)
total_cost = gen_cost + val_cost + judge_cost

cost_df = pd.DataFrame([
    {"Stage": f"Generator ({gen_model.split('/')[-1]})", "Cost": gen_cost, "Per 1k Gen": gen_cost/n_generated*1000, "Per 1k Usable": gen_cost/n_llama_valid*1000},
    {"Stage": f"Validator ({val_model.split('/')[-1]})", "Cost": val_cost, "Per 1k Gen": val_cost/n_generated*1000, "Per 1k Usable": val_cost/n_llama_valid*1000},
    {"Stage": f"Judge ({judge_model.split('/')[-1]})", "Cost": judge_cost, "Per 1k Gen": judge_cost/n_generated*1000, "Per 1k Usable": judge_cost/n_llama_valid*1000},
    {"Stage": "TOTAL", "Cost": total_cost, "Per 1k Gen": total_cost/n_generated*1000, "Per 1k Usable": total_cost/n_llama_valid*1000},
]).set_index("Stage")

print("Cost Breakdown by Stage:")
display(cost_df.round(4))

Cost Breakdown by Stage:


Unnamed: 0_level_0,Cost,Per 1k Gen,Per 1k Usable
Stage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Generator (grok-4.1-fast),1.3848,0.3063,0.6438
Validator (deepseek-v3.2),1.9677,0.4352,0.9148
Judge (llama-3.1-8b-instruct),1.1094,0.2454,0.5158
TOTAL,4.4619,0.9869,2.0744


In [106]:
# Cost per 1000 usable by level x subject
# Estimate proportionally based on number of generated questions per cell
cost_per_gen = total_cost / n_generated if n_generated > 0 else 0

n_per_cell = df_aug.groupby(["level", "subject"]).size().unstack(fill_value=0)
n_per_cell = n_per_cell.reindex(columns=SUBJECTS, fill_value=0)

n_usable_per_cell = df_llama[df_llama["is_correct"].notna()].groupby(["level", "subject"]).size().unstack(fill_value=0)
n_usable_per_cell = n_usable_per_cell.reindex(columns=SUBJECTS, fill_value=0)

cost_rows = []
for level in sorted(df_aug["level"].unique()):
    row = {"level": level}
    for subject in SUBJECTS:
        n_gen = n_per_cell.loc[level, subject] if level in n_per_cell.index else 0
        n_usable = n_usable_per_cell.loc[level, subject] if level in n_usable_per_cell.index else 0
        cell_cost = n_gen * cost_per_gen
        row[subject] = cell_cost / n_usable * 1000 if n_usable > 0 else None
    cost_rows.append(row)

cost_pivot = pd.DataFrame(cost_rows).set_index("level")
cost_pivot["TOTAL"] = cost_pivot.mean(axis=1)
cost_pivot.loc["TOTAL"] = cost_pivot.mean(axis=0)

print("Cost per 1000 Usable Samples ($) by Level x Subject:")
display(cost_pivot.round(2))

Cost per 1000 Usable Samples ($) by Level x Subject:


Unnamed: 0_level_0,number_theory,TOTAL
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.3,2.3
2,2.24,2.24
3,2.33,2.33
TOTAL,2.29,2.29


## 7. Summary

In [107]:
print("Summary")
print("=" * 50)
print(f"\nYield:")
print(f"  Attempted:     {n_attempted}")
print(f"  Generated:     {n_generated} ({100*n_generated/n_attempted:.1f}%)")
print(f"  GT assigned:   {n_gt_assigned} ({100*n_gt_assigned/n_attempted:.1f}%)")
print(f"  Valid:         {n_valid} ({100*n_valid/n_attempted:.1f}%)")
print(f"  Usable:        {n_llama_valid} ({100*n_llama_valid/n_attempted:.1f}%)")
print(f"\nRates:")
print(f"  Gen-Val agreement: {100 * df_aug['agrees'].mean():.1f}%")
print(f"  Llama accuracy:    {100 * df_valid['is_correct'].mean():.1f}%")
print(f"\nCost:")
print(f"  Total spent:           ${total_cost:.4f}")
print(f"  Per 1000 usable:       ${total_cost/n_llama_valid*1000:.2f}")

Summary

Yield:
  Attempted:     5000
  Generated:     4521 (90.4%)
  GT assigned:   3948 (79.0%)
  Valid:         3093 (61.9%)
  Usable:        2151 (43.0%)

Rates:
  Gen-Val agreement: 79.0%
  Llama accuracy:    60.4%

Cost:
  Total spent:           $4.4619
  Per 1000 usable:       $2.07
