In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

In [2]:
# Run artefact analysed by this notebook (path is relative to the working directory).
RESULTS_FILE = Path("majority_vote_augmented_math_generated_20260121_122552_20260121_130655.json")

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding; a locale default such as cp1252 can garble non-ASCII characters
# (e.g. "±") that appear in parsed answers.
with open(RESULTS_FILE, encoding="utf-8") as f:
    data = json.load(f)

# Top-level sections of the results file that the rest of the notebook reads.
config = data['config']
results = data['results']
gt_summary = data['ground_truth_summary']

print(f"Loaded {len(results)} problems")
print(f"LLaMA runs per problem: {config['llama_runs']}")
print("Oracle: Flash")
print("\nGround Truth Summary:")
print(f"  Valid: {gt_summary['valid_count']}")
print(f"  None:  {gt_summary['none_count']}")

Loaded 350 problems
LLaMA runs per problem: 4
Oracle: Flash

Ground Truth Summary:
  Valid: 350
  None:  0


In [3]:
def normalize_answer(answer):
    """Canonicalise an answer so equivalent spellings compare equal as strings.

    The value is stringified, stripped and lower-cased. If the result parses
    as a number it is rewritten in a canonical form: integral values become a
    plain integer literal ("3.0" -> "3"), other values are rendered with up to
    10 decimal places and trailing zeros removed. Anything else is returned
    as the cleaned-up string.

    Args:
        answer: Any value, or None.

    Returns:
        The normalised string, or None when ``answer`` is None.
    """
    if answer is None:
        return None
    answer = str(answer).strip().lower()
    try:
        # Integer literals are parsed exactly; routing them through float
        # would silently corrupt values above 2**53.
        return str(int(answer))
    except ValueError:
        pass
    try:
        num = float(answer)
        if num == int(num):
            return str(int(num))
        return f"{num:.10f}".rstrip('0').rstrip('.')
    except (ValueError, OverflowError):
        # ValueError: not numeric (or NaN, which int() rejects).
        # OverflowError: int() of +/-infinity, e.g. "inf" or "1e400".
        return answer

def answers_match(a, b):
    """Decide whether answer ``a`` agrees with reference answer ``b``.

    Both values are passed through ``normalize_answer``. They match when the
    normalised strings are identical. Otherwise:

    * if both are integer literals they are compared exactly, so distinct
      integers never match (a relative tolerance would otherwise accept
      e.g. 10001 for 10000);
    * if both parse as floats they match within a tolerance measured against
      ``b``: relative 1e-4 when ``|b| > 1``, absolute 1e-6 otherwise.

    The tolerance is asymmetric, so pass the reference value as ``b``.

    Args:
        a: Candidate answer (any value, or None).
        b: Reference answer (any value, or None).

    Returns:
        True when the answers are considered equal, False otherwise
        (including when either side is None or not comparable).
    """
    if a is None or b is None:
        return False
    norm_a = normalize_answer(a)
    norm_b = normalize_answer(b)
    if norm_a == norm_b:
        return True
    if not norm_a or not norm_b:
        # An empty answer can only match another empty answer (handled above).
        return False

    def _is_integer_literal(text):
        digits = text[1:] if text.startswith('-') else text
        return digits.isascii() and digits.isdigit()

    if _is_integer_literal(norm_a) and _is_integer_literal(norm_b):
        # Exact comparison; int() also treats "-0" and "0" as equal.
        return int(norm_a) == int(norm_b)

    try:
        num_a = float(norm_a)
        num_b = float(norm_b)
    except ValueError:
        return False
    if abs(num_b) > 1:
        return abs(num_a - num_b) / abs(num_b) < 1e-4
    return abs(num_a - num_b) < 1e-6

## Data Overview by Level and Subject

In [4]:
# One record per problem: its id, subject, level and whether the oracle
# produced a ground-truth answer at all.
overview_data = [
    {
        'idx': int(problem_id),
        'subject': record['subject'],
        'level': record['level'],
        'ground_truth': record['ground_truth'],
        'gt_is_none': record['ground_truth'] is None,
    }
    for problem_id, record in results.items()
]

df_overview = pd.DataFrame(overview_data)

print("Problems by Subject and Level")
print("=" * 60)
# Count problems per (subject, level) cell, then append row and column totals.
pivot = df_overview.pivot_table(
    index='subject',
    columns='level',
    aggfunc='size',
    fill_value=0,
)
pivot['Total'] = pivot.sum(axis=1)
pivot.loc['Total'] = pivot.sum()
print(pivot)

Problems by Subject and Level
level                      1   2   3   4   5  Total
subject                                            
algebra                   10  10  10  10  10     50
counting_and_probability  10  10  10  10  10     50
geometry                  10  10  10  10  10     50
intermediate_algebra      10  10  10  10  10     50
number_theory             10  10  10  10  10     50
prealgebra                10  10  10  10  10     50
precalculus               10  10  10  10  10     50
Total                     70  70  70  70  70    350


In [5]:
print("\nFlash Oracle Null Counts by Subject")
print("=" * 50)
# Per subject: how many problems have no ground truth, out of how many.
null_by_subject = (
    df_overview.groupby('subject')['gt_is_none']
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'nulls', 'count': 'total'})
)
null_share = null_by_subject['nulls'] / null_by_subject['total']
null_by_subject['pct'] = (null_share * 100).round(1)
print(null_by_subject)
print(f"\nTotal nulls: {df_overview['gt_is_none'].sum()}/{len(df_overview)}")


Flash Oracle Null Counts by Subject
                          nulls  total  pct
subject                                    
algebra                       0     50  0.0
counting_and_probability      0     50  0.0
geometry                      0     50  0.0
intermediate_algebra          0     50  0.0
number_theory                 0     50  0.0
prealgebra                    0     50  0.0
precalculus                   0     50  0.0

Total nulls: 0/350


## LLaMA Pass@k Performance

In [6]:
# Score every LLaMA run against the ground truth; problems without a ground
# truth are left out of the table.
llama_data = []
for problem_id, record in results.items():
    truth = record['ground_truth']
    if truth is not None:
        sampled_answers = [run['parsed_answer'] for run in record['llama_results']]
        run_is_correct = [answers_match(candidate, truth) for candidate in sampled_answers]
        llama_data.append(dict(
            idx=int(problem_id),
            subject=record['subject'],
            level=record['level'],
            ground_truth=truth,
            llama_answers=sampled_answers,
            correct_flags=run_is_correct,
            num_correct=sum(run_is_correct),
            num_runs=len(run_is_correct),
        ))

df_llama = pd.DataFrame(llama_data)
print(f"Problems with valid GT: {len(df_llama)}")

Problems with valid GT: 350


In [7]:
def pass_at_k_first(correct_flags_list, k):
    """Fraction of problems with at least one correct run among the first k.

    Args:
        correct_flags_list: One sequence of per-run booleans per problem.
        k: Number of leading runs to inspect for each problem.

    Returns:
        The pass rate as a float, or 0 when no problems are given.
    """
    if not correct_flags_list:
        return 0
    solved = sum(1 for flags in correct_flags_list if any(flags[:k]))
    return solved / len(correct_flags_list)

def pass_at_k_unbiased(correct_flags_list, k):
    """Mean per-problem probability that k runs drawn without replacement include a correct one.

    For a problem with n runs of which c are correct the estimate is
    ``1 - C(n - c, k) / C(n, k)``; it is 0.0 when c == 0 and 1.0 when fewer
    than k runs are incorrect.

    Args:
        correct_flags_list: One sequence of per-run booleans per problem.
        k: Number of runs drawn.

    Returns:
        The average estimate over all problems, or 0 when no problems are given.
    """
    from math import comb

    def _estimate(flags):
        n_runs = len(flags)
        n_right = sum(flags)
        if n_right == 0:
            return 0.0
        n_wrong = n_runs - n_right
        if n_wrong < k:
            # Not enough wrong runs to fill a draw of size k.
            return 1.0
        return 1.0 - comb(n_wrong, k) / comb(n_runs, k)

    if not correct_flags_list:
        return 0

    # Accumulate left to right with plain addition.
    running = 0
    for flags in correct_flags_list:
        running += _estimate(flags)
    return running / len(correct_flags_list)

In [8]:
# Values of k for which pass@k is reported; read as a module-level global by
# compute_pass_at_k and print_pass_at_k_table.
k_values = [1, 2, 4]

def compute_pass_at_k(df, pass_fn, group_col, ks=None):
    """Tabulate pass@k overall and for each value of a grouping column.

    Args:
        df: DataFrame with a ``correct_flags`` column (one list of per-run
            booleans per problem) and the column named by ``group_col``.
        pass_fn: Callable ``(correct_flags_list, k) -> float`` that computes
            the pass@k metric.
        group_col: Name of the column to group by.
        ks: Iterable of k values to evaluate. Defaults to the notebook-level
            ``k_values`` when omitted.

    Returns:
        DataFrame with one 'OVERALL' row followed by one row per group (in
        sorted order), with columns ``group_col``, ``n`` and ``pass@<k>`` for
        each k.
    """
    ks = k_values if ks is None else ks

    def _summarise(label, frame):
        # Convert once per group rather than once per k.
        flags = frame['correct_flags'].tolist()
        row = {group_col: label, 'n': len(frame)}
        for k in ks:
            row[f'pass@{k}'] = pass_fn(flags, k)
        return row

    rows = [_summarise('OVERALL', df)]
    for val in sorted(df[group_col].unique()):
        rows.append(_summarise(val, df[df[group_col] == val]))
    return pd.DataFrame(rows)

def print_pass_at_k_table(df, title, group_col, ks=None):
    """Print a fixed-width pass@k table.

    Output is the title, a rule of '=' characters, a header row, a rule of
    '-' characters, then one line per row of ``df`` with each pass@k value
    rendered as a percentage with two decimals.

    Args:
        df: DataFrame with columns ``group_col``, ``n`` and ``pass@<k>`` for
            each k (the layout returned by ``compute_pass_at_k``).
        title: Heading printed above the table.
        group_col: Name of the label column.
        ks: Iterable of k values to show. Defaults to the notebook-level
            ``k_values`` when omitted.
    """
    ks = k_values if ks is None else ks
    print(title)
    print("=" * 70)
    header = f"{group_col:<25} {'n':>5}"
    for k in ks:
        header += f" {'pass@' + str(k):>10}"
    print(header)
    print("-" * 70)
    for _, row in df.iterrows():
        line = f"{str(row[group_col]):<25} {row['n']:>5}"
        for k in ks:
            rate = row[f'pass@{k}']
            line += f" {rate:>10.2%}"
        print(line)

In [9]:
# Pass@k per subject. The heading is passed to the table printer so it sits
# directly above its rule, rather than being printed separately with an empty
# title (which left a stray blank line and a rule with no heading).
df_by_subject = compute_pass_at_k(df_llama, pass_at_k_unbiased, 'subject')
print_pass_at_k_table(df_by_subject, "Pass@k by Subject (Unbiased Estimator)", 'subject')

Pass@k by Subject (Unbiased Estimator)


subject                       n     pass@1     pass@2     pass@4
----------------------------------------------------------------------
OVERALL                     350     16.14%     23.71%     31.43%
algebra                      50     21.50%     28.00%     32.00%
counting_and_probability     50     16.50%     24.00%     32.00%
geometry                     50      8.00%     13.00%     20.00%
intermediate_algebra         50      8.50%     14.33%     20.00%
number_theory                50     14.50%     22.33%     34.00%
prealgebra                   50     31.50%     44.67%     54.00%
precalculus                  50     12.50%     19.67%     28.00%


In [10]:
# Pass@k per difficulty level. The heading is passed to the table printer so
# it sits directly above its rule, rather than being printed separately with
# an empty title (which left a stray blank line and a rule with no heading).
df_by_level = compute_pass_at_k(df_llama, pass_at_k_unbiased, 'level')
print_pass_at_k_table(df_by_level, "Pass@k by Level (Unbiased Estimator)", 'level')

Pass@k by Level (Unbiased Estimator)


level                         n     pass@1     pass@2     pass@4
----------------------------------------------------------------------
OVERALL                     350     16.14%     23.71%     31.43%
1                            70     19.64%     27.38%     35.71%
2                            70     22.14%     34.05%     44.29%
3                            70     16.79%     24.05%     30.00%
4                            70     13.57%     20.24%     30.00%
5                            70      8.57%     12.86%     17.14%


In [11]:
print("Pass@k Heatmap: Subject x Level (pass@1 unbiased)")
print("=" * 70)

heatmap_subjects = sorted(df_llama['subject'].unique())
heatmap_levels = sorted(df_llama['level'].unique())

# One row per subject, one 'L<level>' column per level; a cell with no
# problems is left as None.
heatmap_data = []
for subject in heatmap_subjects:
    row = {'subject': subject}
    for level in heatmap_levels:
        subset = df_llama[(df_llama['subject'] == subject) & (df_llama['level'] == level)]
        if len(subset) > 0:
            row[f'L{level}'] = pass_at_k_unbiased(subset['correct_flags'].tolist(), 1)
        else:
            row[f'L{level}'] = None
    heatmap_data.append(row)

df_heatmap = pd.DataFrame(heatmap_data).set_index('subject')

# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
# fall back to applymap on older versions.
_format_cells = df_heatmap.map if hasattr(df_heatmap, "map") else df_heatmap.applymap
# pandas stores a missing cell as NaN in a float column, so an `is not None`
# test never fires; pd.notna covers both None and NaN.
df_heatmap_pct = _format_cells(lambda x: f"{x:.0%}" if pd.notna(x) else "-")
print(df_heatmap_pct)

Pass@k Heatmap: Subject x Level (pass@1 unbiased)
                           L1   L2   L3   L4   L5
subject                                          
algebra                   48%  30%  15%  12%   2%
counting_and_probability  35%  25%   5%  10%   8%
geometry                   8%  12%   8%  12%   0%
intermediate_algebra       0%  18%  12%   8%   5%
number_theory             28%  10%  12%  12%  10%
prealgebra                15%  50%  32%  35%  25%
precalculus                5%  10%  32%   5%  10%


  df_heatmap_pct = df_heatmap.applymap(lambda x: f"{x:.0%}" if x is not None else "-")


## Detailed Analysis

In [12]:
print("Distribution of Correct Answers per Problem")
print("=" * 50)
# Denominator comes from the run configuration instead of a hard-coded 4.
runs_per_problem = config['llama_runs']
# Number of problems for each possible count of correct runs, ascending.
correct_dist = df_llama['num_correct'].value_counts().sort_index()
for num_correct, count in correct_dist.items():
    pct = count / len(df_llama) * 100
    print(f"  {num_correct}/{runs_per_problem} correct: {count:3d} problems ({pct:5.1f}%)")

Distribution of Correct Answers per Problem
  0/4 correct: 240 problems ( 68.6%)
  1/4 correct:  45 problems ( 12.9%)
  2/4 correct:  27 problems (  7.7%)
  3/4 correct:  25 problems (  7.1%)
  4/4 correct:  13 problems (  3.7%)


In [13]:
# The run count in the heading comes from the configuration instead of a
# hard-coded 4.
print(f"\nProblems where LLaMA got 0/{config['llama_runs']} correct (hardest)")
print("=" * 80)
# Problems on which no run matched the ground truth.
zero_correct = df_llama[df_llama['num_correct'] == 0]
print(f"Total: {len(zero_correct)} problems\n")

print("By Subject:")
for subject, count in zero_correct['subject'].value_counts().items():
    print(f"  {subject}: {count}")

print("\nBy Level:")
for level, count in zero_correct['level'].value_counts().sort_index().items():
    print(f"  Level {level}: {count}")


Problems where LLaMA got 0/4 correct (hardest)
Total: 240 problems

By Subject:
  geometry: 40
  intermediate_algebra: 40
  precalculus: 36
  algebra: 34
  counting_and_probability: 34
  number_theory: 33
  prealgebra: 23

By Level:
  Level 1: 45
  Level 2: 39
  Level 3: 49
  Level 4: 49
  Level 5: 58


In [14]:
# "All correct" is defined by the configured number of runs per problem, not
# a hard-coded 4, so the filter and heading stay right if the run count changes.
runs_per_problem = config['llama_runs']

print(f"\nProblems where LLaMA got {runs_per_problem}/{runs_per_problem} correct (easiest)")
print("=" * 80)
# Problems on which every run matched the ground truth.
all_correct = df_llama[df_llama['num_correct'] == runs_per_problem]
print(f"Total: {len(all_correct)} problems\n")

print("By Subject:")
for subject, count in all_correct['subject'].value_counts().items():
    print(f"  {subject}: {count}")

print("\nBy Level:")
for level, count in all_correct['level'].value_counts().sort_index().items():
    print(f"  Level {level}: {count}")


Problems where LLaMA got 4/4 correct (easiest)
Total: 13 problems

By Subject:
  algebra: 4
  counting_and_probability: 3
  prealgebra: 3
  number_theory: 2
  geometry: 1

By Level:
  Level 1: 5
  Level 2: 2
  Level 3: 2
  Level 4: 3
  Level 5: 1


In [15]:
print("Sample of 0/4 correct problems")
print("=" * 80)
# For the first ten never-solved problems, show the (truncated) ground truth
# next to the three most frequent LLaMA answers and their counts.
for _, row in zero_correct.head(10).iterrows():
    print(f"\nidx={row['idx']} [{row['subject']}] Level {row['level']}")
    print(f"GT: {str(row['ground_truth'])[:60]}")
    tally = Counter(row['llama_answers']).most_common(3)
    rendered = [f'{str(a)[:30]}({c})' for a, c in tally]
    print(f"LLaMA answers: {', '.join(rendered)}")

Sample of 0/4 correct problems

idx=0 [algebra] Level 1
GT: (-3+sqrt(3))/3,(-3-sqrt(3))/3
LLaMA answers: -0.6666666667(2), -1±sqrt(6)/3(1), -1(1)

idx=5 [algebra] Level 1
GT: -3,4
LLaMA answers: [-3,4](1), 4,-3(1), 4,x=-3(1)

idx=9 [algebra] Level 1
GT: n<0
LLaMA answers: 1(1), -1,1,3,4,5,6,7,8(1), hence,theexpressionisnegativew(1)

idx=10 [algebra] Level 2
GT: 14
LLaMA answers: cross-multiplying:$x*3=12(x-3)(1), 15(1), 4(1)

idx=11 [algebra] Level 2
GT: 2
LLaMA answers: 1.74(1), \frac{2(a+3)}{3(a+2(1), 1.8769230769(1)

idx=12 [algebra] Level 2
GT: (1,35),(2,23),(3,17),(5,11),(7,8),(8,7),(11,5),(17,3),(23,2)
LLaMA answers: (1,1)(1), 1,35,2,23,3,17,5,11(1), weknowthat$a$and$b$arefactorso(1)

idx=14 [algebra] Level 2
GT: 2x^2-5x+10
LLaMA answers: 2x^2-x(1), (2x+14)/(x(1), 4x(1)

idx=15 [algebra] Level 2
GT: (x^2-2y^3)(x^2+2y^3)(x^4-2x^2y^3+4y^6)(x^4+2x^2y^3+4y^6)
LLaMA answers: 0(1), (x^6-64y^18)=(x^6-64(1), (x^6-4y^18)^2(1)

idx=16 [algebra] Level 2
GT: none
LLaMA answers: 0(1), 1(1), 

In [16]:
# Heading and rule for the majority-vote section of the output.
print("Majority Vote Analysis")
print("=" * 60)

def get_majority_answer(answers):
    """Return the most frequent answer among ``answers`` and its vote count.

    ``None`` entries (runs with no parsed answer) do not vote; otherwise a
    problem where most runs failed to parse would elect "no answer" over a
    real one. Votes are counted on the raw values as given. Ties go to the
    answer that appears first.

    Args:
        answers: Iterable of hashable answers, possibly containing None.

    Returns:
        Tuple ``(answer, count)``; ``(None, 0)`` when there is no non-None
        answer.
    """
    counts = Counter(ans for ans in answers if ans is not None)
    if not counts:
        return None, 0
    winner, votes = counts.most_common(1)[0]
    return winner, votes

# A problem counts as solved by majority vote when its most frequent LLaMA
# answer matches the ground truth.
majority_total = len(df_llama)
majority_correct = sum(
    1
    for sampled, truth in zip(df_llama['llama_answers'], df_llama['ground_truth'])
    if answers_match(get_majority_answer(sampled)[0], truth)
)

print(f"Majority vote accuracy: {majority_correct}/{majority_total} ({majority_correct/majority_total:.1%})")
print(f"Pass@1 (single sample):  {pass_at_k_unbiased(df_llama['correct_flags'].tolist(), 1):.1%}")
print(f"Pass@4 (any of 4):       {pass_at_k_unbiased(df_llama['correct_flags'].tolist(), 4):.1%}")

Majority Vote Analysis
Majority vote accuracy: 72/350 (20.6%)
Pass@1 (single sample):  16.1%
Pass@4 (any of 4):       31.4%


In [17]:
print("Majority Vote Accuracy by Subject")
print("=" * 50)

# For each subject, count the problems whose majority answer matches the
# ground truth.
for subject in sorted(df_llama['subject'].unique()):
    subset = df_llama[df_llama['subject'] == subject]
    correct = sum(
        1
        for sampled, truth in zip(subset['llama_answers'], subset['ground_truth'])
        if answers_match(get_majority_answer(sampled)[0], truth)
    )
    print(f"{subject:<25}: {correct:3d}/{len(subset):3d} ({correct/len(subset):.1%})")

Majority Vote Accuracy by Subject
algebra                  :  13/ 50 (26.0%)
counting_and_probability :  12/ 50 (24.0%)
geometry                 :   4/ 50 (8.0%)
intermediate_algebra     :  10/ 50 (20.0%)
number_theory            :   8/ 50 (16.0%)
prealgebra               :  19/ 50 (38.0%)
precalculus              :   6/ 50 (12.0%)


In [18]:
print("Majority Vote Accuracy by Level")
print("=" * 50)

# For each difficulty level, count the problems whose majority answer matches
# the ground truth.
for level in sorted(df_llama['level'].unique()):
    subset = df_llama[df_llama['level'] == level]
    correct = sum(
        1
        for sampled, truth in zip(subset['llama_answers'], subset['ground_truth'])
        if answers_match(get_majority_answer(sampled)[0], truth)
    )
    print(f"Level {level}: {correct:3d}/{len(subset):3d} ({correct/len(subset):.1%})")

Majority Vote Accuracy by Level
Level 1:  18/ 70 (25.7%)
Level 2:  22/ 70 (31.4%)
Level 3:  15/ 70 (21.4%)
Level 4:  10/ 70 (14.3%)
Level 5:   7/ 70 (10.0%)
