In [1]:
import json
import pandas as pd
from collections import defaultdict

In [2]:
# Load the two JSON inputs used by the rest of the notebook:
#   normalized - iterated below as {question id: {model name: answer or None}}
#   raw_data   - read below via raw_data['results'] for per-model timing records
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform's locale default.
with open('normalized_augmented_math_20260120_142545.json', encoding='utf-8') as f:
    normalized = json.load(f)

with open('augmented_math_20260120_142545.json', encoding='utf-8') as f:
    raw_data = json.load(f)

## 1. Count nulls per level

In [3]:
def get_level(qid):
    """Return the level prefix of a question id.

    The prefix is everything before the first underscore; an id without an
    underscore is returned unchanged.
    """
    prefix, _sep, _rest = qid.partition('_')
    return prefix

# Tally missing (None) answers for every (level, model) pair.
null_counts = defaultdict(lambda: defaultdict(int))
for question_id, answers in normalized.items():
    lvl = get_level(question_id)
    missing_models = [name for name, answer in answers.items() if answer is None]
    for name in missing_models:
        null_counts[lvl][name] += 1

# Rows are levels, columns are the models that had at least one null;
# pairs that never occurred are filled with 0.
null_df = pd.DataFrame(null_counts).T.fillna(0).astype(int).sort_index()
null_df['total'] = null_df.sum(axis=1)
print("Null counts per level:")
null_df

Null counts per level:


Unnamed: 0,llama,opus,pro_think,total
L1,1,0,0,1
L2,0,1,0,1
L3,1,0,0,1
L4,0,4,2,6
L5,3,1,0,4


## 2. Filter out questions with any null responses

In [4]:
# Keep only the questions for which no model returned a null answer.
filtered = {}
for question_id, answers in normalized.items():
    if any(answer is None for answer in answers.values()):
        continue
    filtered[question_id] = answers

n_before = len(normalized)
n_after = len(filtered)
print(f"Original: {n_before} questions")
print(f"After filtering: {n_after} questions")
print(f"Removed: {n_before - n_after} questions with nulls")

Original: 125 questions
After filtering: 114 questions
Removed: 11 questions with nulls


## 3. Agreement between opus and pro_think by level

In [5]:
# Per level, count how often opus and pro_think gave the same answer.
agreement_by_level = defaultdict(lambda: {'agree': 0, 'disagree': 0})

for question_id, answers in filtered.items():
    lvl = get_level(question_id)
    outcome = 'agree' if answers['opus'] == answers['pro_think'] else 'disagree'
    agreement_by_level[lvl][outcome] += 1

# One row per level, with the row total and the agreement percentage.
agreement_df = pd.DataFrame(agreement_by_level).T.sort_index()
agreement_df['total'] = agreement_df['agree'].add(agreement_df['disagree'])
agreement_pct = agreement_df['agree'].div(agreement_df['total']).mul(100)
agreement_df['agreement_rate'] = agreement_pct.round(1)
print("Agreement between opus and pro_think by level:")
agreement_df

Agreement between opus and pro_think by level:


Unnamed: 0,agree,disagree,total,agreement_rate
L1,24,0,24,100.0
L2,23,1,24,95.8
L3,24,0,24,100.0
L4,20,1,21,95.2
L5,20,1,21,95.2


## 4. Llama performance (using opus/pro_think agreement as ground truth)

In [6]:
# Score llama against the answer shared by opus and pro_think.
llama_perf = defaultdict(lambda: {'correct': 0, 'total': 0})

for question_id, answers in filtered.items():
    lvl = get_level(question_id)
    if not (answers['opus'] == answers['pro_think']):
        continue  # the two reference models disagree, so there is no ground truth
    reference = answers['opus']
    tally = llama_perf[lvl]
    tally['total'] += 1
    if answers['llama'] == reference:
        tally['correct'] += 1

# One row per level, with accuracy as a percentage of the scored questions.
llama_df = pd.DataFrame(llama_perf).T.sort_index()
accuracy_pct = llama_df['correct'].div(llama_df['total']).mul(100)
llama_df['accuracy'] = accuracy_pct.round(1)
print("Llama 3.1-8B performance by level (GT = opus/pro_think agreement):")
llama_df

Llama 3.1-8B performance by level (GT = opus/pro_think agreement):


Unnamed: 0,correct,total,accuracy
L1,23,24,95.8
L2,20,23,87.0
L3,24,24,100.0
L4,18,20,90.0
L5,17,20,85.0


## 5. Response times by model and level

In [11]:
# Build one timing record per result entry that reports an elapsed time;
# entries without an 'elapsed_time' key are skipped.
times_data = [
    {
        'id': r['id'],
        'level': get_level(r['id']),
        'model': model_name,
        'elapsed_time': r['elapsed_time'],
    }
    for model_name, results in raw_data['results'].items()
    for r in results
    if 'elapsed_time' in r
]

times_df = pd.DataFrame(times_data)

# Levels as rows, models as columns, median elapsed time in each cell.
pivot = times_df.pivot_table(
    index='level',
    columns='model',
    values='elapsed_time',
    aggfunc='median'
).round(2)
pivot = pivot.sort_index()
# The caption names the statistic actually computed above (median, not mean).
print("Median response time (seconds) by model and level:")
pivot

Mean response time (seconds) by model and level:


model,llama,opus,pro-think
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
L1,3.8,17.19,11.93
L2,2.81,23.93,19.22
L3,3.54,29.12,23.7
L4,5.05,35.87,25.87
L5,8.49,39.66,36.58


In [None]:
# Summary statistics of elapsed time for each model, pooled over all levels.
elapsed_by_model = times_df.groupby('model')['elapsed_time']
summary_stats = ['mean', 'std', 'min', 'max']
overall_times = elapsed_by_model.agg(summary_stats).round(2)
print("Overall response time statistics by model:")
overall_times