In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

In [2]:
# Path to the majority-vote results JSON analysed by this notebook;
# point it at another run's file to analyse that run instead.
RESULTS_FILE = Path("results/llama_majority_vote_20260121_081618.json")

with RESULTS_FILE.open() as f:
    data = json.load(f)

# The two top-level sections of the results file that the cells below read.
config, results = data['config'], data['results']

print(f"Loaded {len(results)} problems")
print(f"LLaMA runs per problem: {config['llama_runs']}")

Loaded 100 problems
LLaMA runs per problem: 16


## Opus vs Gemini Agreement Analysis

In [3]:
def normalize_answer(answer):
    """Canonicalise an answer so that equivalent answers compare equal as strings.

    The answer is converted to ``str``, stripped and lower-cased. If the result
    parses as a finite number it is rewritten in a canonical numeric form:
    whole values become integer strings (``"8.0"`` -> ``"8"``) and other values
    are rendered with up to 10 decimal places, trailing zeros removed
    (``"0.50"`` -> ``"0.5"``). Anything else is returned as the cleaned string.

    Args:
        answer: The raw answer (any type), or ``None``.

    Returns:
        ``None`` if ``answer`` is ``None``, otherwise the normalised string.
    """
    if answer is None:
        return None
    answer = str(answer).strip().lower()
    try:
        num = float(answer)
        if num == int(num):
            return str(int(num))
        return f"{num:.10f}".rstrip('0').rstrip('.')
    except (ValueError, OverflowError):
        # ValueError: the text is not numeric (or is NaN, which int() rejects).
        # OverflowError: the text parsed to +/-infinity ("inf", "infinity",
        # "1e400", ...), which int() cannot represent. In both cases fall back
        # to the cleaned text instead of aborting the analysis.
        return answer

def answers_match(a, b):
    """Decide whether two answers should be treated as the same answer.

    Both answers are passed through ``normalize_answer``; identical normalised
    strings match outright. Otherwise, if both normalised strings are non-empty
    and parse as floats, they match when they are numerically close: within a
    relative error of 1e-4 (measured against ``b``) when ``abs(b) > 1``, or
    within an absolute error of 1e-6 otherwise.

    Args:
        a: First answer, or ``None``.
        b: Second answer (the reference for the relative tolerance), or ``None``.

    Returns:
        ``True`` if the answers match, ``False`` otherwise. A ``None`` on
        either side never matches.
    """
    if a is None or b is None:
        return False

    left, right = normalize_answer(a), normalize_answer(b)
    if left == right:
        return True

    # An empty string on either side can never match numerically.
    if not left or not right:
        return False

    try:
        left_value, right_value = float(left), float(right)
    except ValueError:
        # At least one side is not a number, and the strings already differ.
        return False

    gap = abs(left_value - right_value)
    scale = abs(right_value)
    if scale > 1:
        return gap / scale < 1e-4
    return gap < 1e-6

In [4]:
def _agreement_record(idx, r):
    """Build one agreement-table row from a problem's result entry."""
    oracles = r['oracle_results']
    opus_ans = oracles['opus'].get('parsed_answer')
    gemini_ans = oracles['gemini'].get('parsed_answer')
    return {
        'idx': int(idx),
        'problem_source': r['problem_source'],
        'opus_answer': opus_ans,
        'gemini_answer': gemini_ans,
        'opus_none': opus_ans is None,
        'gemini_none': gemini_ans is None,
        # answers_match returns False whenever either side is None.
        'agree': answers_match(opus_ans, gemini_ans),
        'ground_truth': r['ground_truth'],
    }


# One row per problem, in the order the problems appear in the results file.
agreement_data = [_agreement_record(idx, r) for idx, r in results.items()]

df_agreement = pd.DataFrame(agreement_data)
df_agreement.head()

Unnamed: 0,idx,problem_source,opus_answer,gemini_answer,opus_none,gemini_none,agree,ground_truth
0,328,math,8,8,False,False,True,8
1,2822,gsm8k,48,48,False,False,True,48
2,7349,math,-1011,-1011,False,False,True,-1011
3,6965,gsm8k,4,4,False,False,True,4
4,376,math,75,75,False,False,True,75


In [5]:
# Headline numbers: on how many problems do the two oracle answers match?
total = len(df_agreement)
agree = df_agreement['agree'].sum()
disagree = total - agree

print("Overall Agreement Statistics")
print("=" * 40)
print(f"Total problems: {total}")
print(f"Opus & Gemini agree: {agree} ({agree/total*100:.1f}%)")
print(f"Opus & Gemini disagree: {disagree} ({disagree/total*100:.1f}%)")

Overall Agreement Statistics
Total problems: 100
Opus & Gemini agree: 87 (87.0%)
Opus & Gemini disagree: 13 (13.0%)


In [6]:
print("\nAgreement by Problem Source")
print("=" * 40)

# Sources are reported in order of first appearance in the table.
source_column = df_agreement['problem_source']
for source in source_column.unique():
    agree_flags = df_agreement.loc[source_column == source, 'agree']
    n, ag = len(agree_flags), agree_flags.sum()
    print(f"{source:20s}: {ag}/{n} agree ({ag/n*100:.1f}%)")


Agreement by Problem Source
math                : 24/25 agree (96.0%)
gsm8k               : 24/25 agree (96.0%)
augmented_math      : 15/25 agree (60.0%)
augmented_gsm8k     : 24/25 agree (96.0%)


In [7]:
print("\nDisagreements (where both models gave an answer)")
print("=" * 60)

# Keep only rows where neither oracle answer is missing, so a missing answer
# is not reported as a disagreement here.
both_answered = ~(df_agreement['opus_none'] | df_agreement['gemini_none'])
disagreements = df_agreement[~df_agreement['agree'] & both_answered]

print(f"Count: {len(disagreements)}")
for idx, source, opus, gemini in zip(
    disagreements['idx'],
    disagreements['problem_source'],
    disagreements['opus_answer'],
    disagreements['gemini_answer'],
):
    print(f"  idx={idx:5d} [{source:16s}] opus={opus} vs gemini={gemini}")


Disagreements (where both models gave an answer)
Count: 4
  idx= 1585 [augmented_math  ] opus=a^2/s^2 vs gemini=0.5
  idx=  883 [augmented_math  ] opus=π/4 vs gemini=pi/4
  idx= 2414 [augmented_math  ] opus=1 vs gemini=5
  idx= 7378 [augmented_gsm8k ] opus=1.5789473684 vs gemini=6


## Null Counts by Problem Source

In [9]:
# Count, per problem source, how often each oracle produced no parsed answer.
null_data = [
    {
        'problem_source': source,
        'total': len(subset),
        'opus_nulls': subset['opus_none'].sum(),
        'gemini_nulls': subset['gemini_none'].sum(),
    }
    for source in df_agreement['problem_source'].unique()
    for subset in [df_agreement[df_agreement['problem_source'] == source]]
]

df_nulls = pd.DataFrame(null_data)
# Percentage of each source's problems with a missing answer, to one decimal.
for oracle in ('opus', 'gemini'):
    df_nulls[f'{oracle}_null_pct'] = (df_nulls[f'{oracle}_nulls'] / df_nulls['total'] * 100).round(1)
df_nulls

Unnamed: 0,problem_source,total,opus_nulls,gemini_nulls,opus_null_pct,gemini_null_pct
0,math,25,1,1,4.0,4.0
1,gsm8k,25,0,1,0.0,4.0
2,augmented_math,25,7,4,28.0,16.0
3,augmented_gsm8k,25,0,0,0.0,0.0


In [10]:
def _null_summary_line(label, total, opus_nulls, opus_pct, gemini_nulls, gemini_pct):
    """Format one row of the null-count summary (shared by per-source and TOTAL rows)."""
    return f"{label:<20} {opus_nulls:>5}/{total} ({opus_pct:>4.1f}%) {gemini_nulls:>5}/{total} ({gemini_pct:>4.1f}%)"


print("Null Counts Summary")
print("=" * 50)
print(f"{'Source':<20} {'Opus Nulls':>12} {'Gemini Nulls':>14}")
print("-" * 50)
for record in df_nulls.itertuples(index=False):
    print(_null_summary_line(
        record.problem_source,
        record.total,
        record.opus_nulls,
        record.opus_null_pct,
        record.gemini_nulls,
        record.gemini_null_pct,
    ))
print("-" * 50)

# The totals row recomputes its percentages from the summed counts.
grand_total = df_nulls['total'].sum()
opus_total = df_nulls['opus_nulls'].sum()
gemini_total = df_nulls['gemini_nulls'].sum()
print(_null_summary_line(
    'TOTAL',
    grand_total,
    opus_total,
    opus_total/grand_total*100,
    gemini_total,
    gemini_total/grand_total*100,
))

Null Counts Summary
Source                 Opus Nulls   Gemini Nulls
--------------------------------------------------
math                     1/25 ( 4.0%)     1/25 ( 4.0%)
gsm8k                    0/25 ( 0.0%)     1/25 ( 4.0%)
augmented_math           7/25 (28.0%)     4/25 (16.0%)
augmented_gsm8k          0/25 ( 0.0%)     0/25 ( 0.0%)
--------------------------------------------------
TOTAL                    8/100 ( 8.0%)     6/100 ( 6.0%)


## LLaMA Pass@k Performance

In [24]:
def _llama_record(idx, r):
    """Score every LLaMA run of one problem against its ground truth."""
    gt = r['ground_truth']
    llama_answers = [lr['parsed_answer'] for lr in r['llama_results']]
    correct_flags = [answers_match(ans, gt) for ans in llama_answers]
    return {
        'idx': int(idx),
        'problem_source': r['problem_source'],
        'ground_truth': gt,
        'llama_answers': llama_answers,
        'correct_flags': correct_flags,
        'num_correct': sum(correct_flags),
        'num_runs': len(correct_flags),
    }


# Problems without a ground truth cannot be scored, so they are left out.
llama_data = [
    _llama_record(idx, r)
    for idx, r in results.items()
    if r['ground_truth'] is not None
]

df_llama = pd.DataFrame(llama_data)
print(f"Problems with valid GT: {len(df_llama)}")

Problems with valid GT: 87


In [25]:
def pass_at_k_first(correct_flags_list, k):
    """Pass@k over the first k samples.

    Args:
        correct_flags_list: One sequence of per-sample correctness flags per problem.
        k: Number of leading samples to inspect for each problem.

    Returns:
        The fraction of problems with at least one correct sample among their
        first ``k``; ``0`` when ``correct_flags_list`` is empty.
    """
    if not correct_flags_list:
        return 0
    solved = sum(1 for flags in correct_flags_list if any(flags[:k]))
    return solved / len(correct_flags_list)

def pass_at_k_unbiased(correct_flags_list, k):
    """Pass@k as the chance that k samples drawn without replacement include a correct one.

    For a problem with ``n`` samples of which ``c`` are correct, the per-problem
    value is ``1 - C(n - c, k) / C(n, k)``. It is ``0.0`` when no sample is
    correct and ``1.0`` when fewer than ``k`` samples are incorrect.

    Args:
        correct_flags_list: One sequence of per-sample correctness flags per problem.
        k: Number of samples drawn per problem.

    Returns:
        The mean per-problem value; ``0`` when ``correct_flags_list`` is empty.
    """
    from math import comb

    if not correct_flags_list:
        return 0

    # Accumulate in a plain loop, in problem order, so the floating-point
    # result does not depend on how a summation helper adds the terms.
    running = 0
    for flags in correct_flags_list:
        n_samples, n_correct = len(flags), sum(flags)
        n_wrong = n_samples - n_correct
        if n_correct == 0:
            running += 0.0
        elif n_wrong < k:
            # Too few wrong samples to fill a draw of k, so a correct one is certain.
            running += 1.0
        else:
            running += 1.0 - comb(n_wrong, k) / comb(n_samples, k)
    return running / len(correct_flags_list)

In [26]:
# Only keep sample budgets that do not exceed the number of LLaMA runs in the config.
n_runs = config['llama_runs']
k_values = [k for k in (1, 2, 4, 8, 16) if k <= n_runs]

def compute_pass_at_k_table(pass_fn):
    """Tabulate a pass@k metric overall and per problem source.

    Reads the notebook-level ``df_llama`` and ``k_values``.

    Args:
        pass_fn: Callable ``(correct_flags_list, k) -> value`` used to compute
            each pass@k cell.

    Returns:
        A DataFrame with an ``OVERALL`` row followed by one row per problem
        source (sorted), with columns ``problem_source``, ``n`` and one
        ``pass@k`` column per entry of ``k_values``.
    """
    def summarize(label, frame):
        # One table row: label, problem count, then pass@k for every k.
        flags = frame['correct_flags'].tolist()
        row = {'problem_source': label, 'n': len(frame)}
        row.update({f'pass@{k}': pass_fn(flags, k) for k in k_values})
        return row

    rows = [summarize('OVERALL', df_llama)]
    for source in sorted(df_llama['problem_source'].unique()):
        rows.append(summarize(source, df_llama[df_llama['problem_source'] == source]))
    return pd.DataFrame(rows)

# Build one summary table per pass@k definition.
df_pass_at_k_first, df_pass_at_k_unbiased = (
    compute_pass_at_k_table(metric)
    for metric in (pass_at_k_first, pass_at_k_unbiased)
)

In [33]:
def print_pass_at_k_table(df, title):
    """Print a pass@k table as fixed-width text.

    Reads the notebook-level ``k_values`` to decide which ``pass@k`` columns
    to show.

    Args:
        df: Table with ``problem_source``, ``n`` and ``pass@k`` columns.
        title: Heading printed above the table.
    """
    rule_width = 70
    column_titles = "".join(f" {'pass@'+str(k):>10}" for k in k_values)

    print(title)
    print("=" * rule_width)
    print(f"{'Source':<20} {'n':>5}" + column_titles)
    print("-" * rule_width)
    for _, record in df.iterrows():
        cells = "".join(f" {record[f'pass@{k}']:>9.2f}" for k in k_values)
        print(f"{record['problem_source']:<20} {record['n']:>5}" + cells)

# Print both tables, separated by a single blank line.
for position, (table, heading) in enumerate([
    (df_pass_at_k_first, "Pass@k (First k samples) - Did any of the first k get it right?"),
    (df_pass_at_k_unbiased, "Pass@k (Unbiased estimator) - Expected prob of success in k random samples"),
]):
    if position:
        print()
    print_pass_at_k_table(table, heading)

Pass@k (First k samples) - Did any of the first k get it right?
Source                   n     pass@1     pass@2     pass@4     pass@8    pass@16
----------------------------------------------------------------------
OVERALL                 87      0.62      0.77      0.86      0.90      0.90
augmented_gsm8k         24      0.62      0.75      0.88      0.92      0.92
augmented_math          15      0.27      0.60      0.60      0.60      0.60
gsm8k                   24      0.88      0.96      1.00      1.00      1.00
math                    24      0.58      0.71      0.88      0.96      0.96

Pass@k (Unbiased estimator) - Expected prob of success in k random samples
Source                   n     pass@1     pass@2     pass@4     pass@8    pass@16
----------------------------------------------------------------------
OVERALL                 87      0.62      0.75      0.84      0.88      0.90
augmented_gsm8k         24      0.60      0.76      0.87      0.91      0.92
augmented_math 

In [29]:
print("\nDetailed per-problem breakdown (first 20 with GT)")
print("=" * 80)
for _, row in df_llama.head(20).iterrows():
    # The three most frequent answers across this problem's runs, with counts.
    top_str = ", ".join(
        f"{a}({c})" for a, c in Counter(row['llama_answers']).most_common(3)
    )
    gt_label = str(row['ground_truth'])[:15]
    print(f"idx={row['idx']:5d} [{row['problem_source']:16s}] GT={gt_label:15s} correct={row['num_correct']:2d}/{row['num_runs']} top_answers: {top_str}")


Detailed per-problem breakdown (first 20 with GT)
idx=  328 [math            ] GT=8               correct= 8/16 top_answers: 8(8), 12(1), 5(1)
idx= 2822 [gsm8k           ] GT=48              correct=16/16 top_answers: 48(16)
idx= 7349 [math            ] GT=-1011           correct= 5/16 top_answers: -1011(5), -1012(2), -984(1)
idx= 6965 [gsm8k           ] GT=4               correct=15/16 top_answers: 4(15), 12(1)
idx=  376 [math            ] GT=75              correct= 4/16 top_answers: 75(4), 55(4), sincetheanglexisthelargestofthethreeangles,thedegreemeasureofthelargestangleofthetriangleis75(1)
idx= 6856 [gsm8k           ] GT=12              correct=16/16 top_answers: 12(16)
idx= 3481 [gsm8k           ] GT=840             correct=14/16 top_answers: 840(14), 1196.4(1), theamountmr.mcphersonneedstoraiseis840.(1)
idx=  396 [math            ] GT=200             correct=14/16 top_answers: 200(14), inthiscase,thelengthis20andthewidthis10,sotheareaisa=(20)(10)=200.(1), 20*10=200(1)
idx= 2113

In [31]:
# Augmented-math problems on which no LLaMA run matched the ground truth.
is_augmented_math = df_llama['problem_source'] == 'augmented_math'
never_correct = df_llama['num_correct'] == 0
zero_correct = df_llama[is_augmented_math & never_correct]

# One row per problem: the ground truth followed by every run's answer (R1, R2, ...),
# each truncated to 20 characters.
# NOTE: any falsy answer (None, '', 0) is rendered as an empty cell.
table_data = [
    {
        'idx': row['idx'],
        'GT': str(row['ground_truth'])[:20],
        **{
            f'R{run}': (str(ans)[:20] if ans else '')
            for run, ans in enumerate(row['llama_answers'], start=1)
        },
    }
    for _, row in zero_correct.iterrows()
]

df_table = pd.DataFrame(table_data)
# These display options are set globally and stay in effect for later cells.
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.width', None)
df_table

Unnamed: 0,idx,GT,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,R13,R14,R15,R16
0,4668,"[-1/2,1]","11/16,1/2","[-sqrt(3)/2,1/2]",1.0,"[-sqrt(3)/2,1]","-sqrt(3)/2,sqrt(...","[-sqrt(3),sqrt(3...",-sqrt(3)/2to1/2,"[-1,1]","[-1,1]","[(sqrt(3)/2),1+s...","-1/2,1/2","[1/2,(√3-2)/2]",[-\frac{\sqrt{3}...,"[sqrt(3)/2,-1/2]","[-1,1]","[-1/2,1/2]"
1,1005,0.46,0.54,0.64,0.8,0.52,0.5,0.4,0.48,0.5,0.4,0.5,0.52,sotheprobability...,sotheprobability...,0.5891891892,0.54,0.56
2,4865,54,324,108,108.0,3^(85/4),108,17496,=27,9,9,2916,6,108,108,108,=27,9
3,4736,0.8,0.5714285714,12c2/8c2=(12*11)...,0.4285714286,0.4,0.4285714286,2.3571428571,0.4,0.4285714286,0.5714285714,0.4285714286,0.2142857143,0.2142857143,0.4285714286,0.8571428571,0.5714285714,0.4285714286
4,3720,159655911367680,910839570585,thereare,2183401051642208.0,35751298560000,209185488000,=702976+26*26*10*,218340105584896,26*26*26*10*10*1...,350443860192,"so,thetotalnumbe...",thereare26upperc...,"however,wehavetr...",208827064576,"so,thetotalnumbe...","208827,064576+20...",4181041056960
5,7081,0.5625,0.890625,0.59375,0.546875,0.125,0.640625,0.34375,0.421875,0.453125,0.640625,0.7407407407,0.234375,0.40625,0.421875,0.359375,0.109375,0.546875
