In [1]:
import pandas as pd
import json


### Analyze direct QA

In [77]:
# JSONL file (one record per line) holding the direct-QA results
results_file = "results/qa_results.jsonl"

# ---------------------------------------------------------------------------
# Run selection
# Each FILTER_* value is compared against the matching `config_*` column once
# the results are loaded. Intended convention: None means "do not filter on
# this field".
# ---------------------------------------------------------------------------

# Model under analysis (alternative: "x-ai/grok-4-fast")
FILTER_MODEL_NAME = "openai/gpt-4o-mini"

# Dataset identity: name, subset, split
FILTER_DATASET_NAME, FILTER_DATASET_SUBSET, FILTER_DATASET_SPLIT = (
    "Idavidrein/gpqa",
    "gpqa_diamond",
    "train",
)

# Sampling / generation settings of the run
FILTER_RANDOM_SEED = 42
FILTER_NUM_CHOICES = 2
FILTER_TEMPERATURE = 0.0

# ---------------------------------------------------------------------------
# Deduplication
# When True, repeated runs of the same question under the same config are
# collapsed to the most recent one.
# ---------------------------------------------------------------------------
REMOVE_DUPLICATES = True


In [78]:
# Load the direct-QA results: one JSON record per line, each carrying a nested
# `config` dict describing the run that produced it.
df = pd.read_json(results_file, lines=True)

# Expand config fields into separate columns, prefixed with `config_`
config_df = pd.json_normalize(df['config'])
config_df.columns = ['config_' + col for col in config_df.columns]
df = pd.concat([df, config_df], axis=1)

# Apply filters. A filter left as None is skipped ("include all"); comparing a
# column against None would otherwise match no rows and silently empty the frame.
filters = {
    'config_model_name': FILTER_MODEL_NAME,
    'config_dataset_name': FILTER_DATASET_NAME,
    'config_dataset_subset': FILTER_DATASET_SUBSET,
    'config_dataset_split': FILTER_DATASET_SPLIT,
    'config_random_seed': FILTER_RANDOM_SEED,
    'config_num_choices': FILTER_NUM_CHOICES,
    'config_temperature': FILTER_TEMPERATURE,
}
mask = pd.Series(True, index=df.index)
for column, wanted in filters.items():
    if wanted is not None:
        mask &= (df[column] == wanted)
# .copy() so the column assignments below write to an independent frame
# rather than a slice of the unfiltered one.
df = df[mask].copy()

print(f"After filtering: {len(df)} results")

# Remove duplicates (keeping latest based on datetime)
if REMOVE_DUPLICATES:
    # The key covers every filterable config field, temperature included, so
    # runs that differ only in temperature are not collapsed into each other
    # when that filter is disabled.
    duplicate_cols = ['question_idx', 'config_model_name', 'config_dataset_name',
                      'config_dataset_subset', 'config_dataset_split',
                      'config_random_seed', 'config_num_choices',
                      'config_temperature']
    df = df.sort_values('datetime', ascending=False)
    df = df.drop_duplicates(subset=duplicate_cols, keep='first')
    print(f"After deduplication: {len(df)} results")

# Add parsed fields. A response that is not a dict (e.g. null in the file) is
# treated as having no answer instead of raising AttributeError.
df['parsed_answer'] = df['parsed_model_response'].apply(
    lambda x: x.get('answer') if isinstance(x, dict) else None
)
df['is_correct'] = df['parsed_answer'] == df['correct_idx']

df.head()

After filtering: 198 results
After deduplication: 198 results


Unnamed: 0,run_id,datetime,config,question_idx,question,options,correct_idx,raw_model_response,parsed_model_response,prompt,...,config_dataset_subset,config_dataset_split,config_model_name,config_temperature,config_num_questions,config_random_seed,config_num_choices,config_max_threads,parsed_answer,is_correct
198,6vduke2,2025-10-28 21:49:02.511789,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",101,You identified a new ligand-receptor pair init...,[the enhancer for the ligand and receptor expr...,1,<BEGIN FINAL ANSWER>\nAnswer: 0\nConfidence: 8...,"{'is_valid': True, 'answer': 0, 'confidence': ...",Answer the following question. You must choose...,...,gpqa_diamond,train,openai/gpt-4o-mini,0.0,198,42,2,5000,0.0,False
322,6vduke2,2025-10-28 21:49:02.511789,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",127,"Congratulations, you just landed your dream jo...",[ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGT...,1,<THINKING>\nTo determine which plasmid to use ...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,...,gpqa_diamond,train,openai/gpt-4o-mini,0.0,198,42,2,5000,1.0,True
324,6vduke2,2025-10-28 21:49:02.511789,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",177,Given the following Lagrangian\n\n\mathcal{L}_...,[The mass dimension \left[\kappa\right]_{M}=-1...,0,<THINKING>\nTo determine the mass dimension of...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,...,gpqa_diamond,train,openai/gpt-4o-mini,0.0,198,42,2,5000,1.0,False
325,6vduke2,2025-10-28 21:49:02.511789,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",136,A synchrocyclotron is a special type of cyclot...,"[3536, 5300]",0,<THINKING>\nTo determine the number of revolut...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,...,gpqa_diamond,train,openai/gpt-4o-mini,0.0,198,42,2,5000,1.0,False
326,6vduke2,2025-10-28 21:49:02.511789,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",189,"The reaction of an electron pair donor, nucleo...","[5, 2, 1, 3 and 4, 5, 2, 3, 1 and 4]",0,<THINKING>\nTo determine the reactivity of the...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,...,gpqa_diamond,train,openai/gpt-4o-mini,0.0,198,42,2,5000,1.0,False


In [None]:
# Score only the rows for which an answer could be parsed; rows without a
# parsed answer are reported separately below instead of counting as wrong.
valid_df = df[df['parsed_answer'].notna()]
accuracy = valid_df['is_correct'].mean()
correct_count = valid_df['is_correct'].sum()
total_count = len(valid_df)

print(f"Accuracy: {accuracy:.2%}")
print(f"Correct: {correct_count}/{total_count}")
print(f"Null answers: {df['parsed_answer'].isna().sum()}")

# Number of responses whose parsed payload has is_valid set to a false value.
# Rows with no `is_valid` key, or whose payload is not a dict, are not counted.
invalid_count = df['parsed_model_response'].apply(
    lambda x: isinstance(x, dict) and not x.get('is_valid', True)
).sum()
print(f"Invalid responses: {invalid_count}")




Accuracy: 56.85%
Correct: 112/197
Null answers: 1


### Analyze debate

In [None]:
# Load the debate results for a single run (one JSON record per line).
# NOTE: this rebinds `results_file` and `config_df`, which the direct-QA
# section also uses.
run_id = 'apc2ec7'
results_file = f"results/debate/{run_id}.jsonl"

debate_df = pd.read_json(results_file, lines=True)

# Expand config fields into separate columns, prefixed with `config_`
config_df = pd.json_normalize(debate_df['config'])
config_df.columns = ['config_' + col for col in config_df.columns]
debate_df = pd.concat([debate_df, config_df], axis=1)


def _judge_answer(verdict):
    """Return the judge's parsed answer, or None when there is none.

    Handles a verdict that is not a dict, a missing or null `parsed` entry,
    and a missing `answer` key. Returning None (rather than an empty dict)
    lets pandas treat the row as null, so `notna()` excludes it from scoring.
    """
    if not isinstance(verdict, dict):
        return None
    parsed = verdict.get('parsed')
    if not isinstance(parsed, dict):
        return None
    return parsed.get('answer')


# Add is correct field
debate_df['parsed_answer'] = debate_df['judge_verdict'].apply(_judge_answer)
debate_df['is_correct'] = debate_df['parsed_answer'] == debate_df['correct_idx']


In [44]:
# Judge accuracy over the debates for which a verdict answer is present.
valid_df = debate_df[debate_df['parsed_answer'].notna()]
accuracy = valid_df['is_correct'].mean()
correct_count = valid_df['is_correct'].sum()
total_count = len(valid_df)

print(f"Accuracy: {accuracy:.2%}")
print(f"Correct: {correct_count}/{total_count}")
# Count nulls on the unfiltered frame: valid_df has already dropped them, so
# counting there would always print 0.
print(f"Null answers: {debate_df['parsed_answer'].isna().sum()}")


Accuracy: 100.00%
Correct: 2/2
Null answers: 0
