In [7]:
import pandas as pd
import json


In [8]:
# --- Input -----------------------------------------------------------------
# JSONL file holding the QA results analysed by this notebook.
results_file = "results/qa_results.jsonl"

# --- Row filters -----------------------------------------------------------
# Each value is compared against the matching field of a result's config.
# Intended convention: None means "do not filter on this field".
# NOTE(review): confirm the loading cell actually skips None-valued filters.

# Which model produced the answers
FILTER_MODEL_NAME = "openai/gpt-4o-mini"

# Which dataset / subset / split the questions came from
FILTER_DATASET_NAME = "Idavidrein/gpqa"
FILTER_DATASET_SUBSET = "gpqa_diamond"
FILTER_DATASET_SPLIT = "train"

# Run parameters
FILTER_RANDOM_SEED = 42
FILTER_NUM_CHOICES = 2

# --- Deduplication ---------------------------------------------------------
# When True, repeated runs of the same question under the same config are
# collapsed to a single row (the latest one is kept).
REMOVE_DUPLICATES = True


In [None]:
# Load the results: every line of the JSONL file becomes one row.
df = pd.read_json(results_file, lines=True)

# Expand config fields into separate columns, prefixed with 'config_'
config_df = pd.json_normalize(df['config'])
config_df.columns = ['config_' + col for col in config_df.columns]
df = pd.concat([df, config_df], axis=1)

# Apply filters. A filter set to None is skipped, so that field is left
# unrestricted; comparing a column to None with `==` would instead match
# no rows and silently empty the frame.
filter_values = {
    'config_model_name': FILTER_MODEL_NAME,
    'config_dataset_name': FILTER_DATASET_NAME,
    'config_dataset_subset': FILTER_DATASET_SUBSET,
    'config_dataset_split': FILTER_DATASET_SPLIT,
    'config_random_seed': FILTER_RANDOM_SEED,
    'config_num_choices': FILTER_NUM_CHOICES,
}
filter_mask = pd.Series(True, index=df.index, dtype=bool)
for filter_col, wanted in filter_values.items():
    if wanted is not None:
        filter_mask &= (df[filter_col] == wanted)

# .copy() gives an independent frame, so the column assignments further down
# never write into a slice of the unfiltered data (SettingWithCopyWarning).
df = df[filter_mask].copy()

print(f"After filtering: {len(df)} results")

# Remove duplicates (keeping latest based on datetime)
if REMOVE_DUPLICATES:
    duplicate_cols = ['question_idx', 'config_model_name', 'config_dataset_name', 
                      'config_dataset_subset', 'config_dataset_split', 
                      'config_random_seed', 'config_num_choices']
    # Newest first, so keep='first' retains the most recent run of each group.
    df = df.sort_values('datetime', ascending=False)
    df = df.drop_duplicates(subset=duplicate_cols, keep='first')
    print(f"After deduplication: {len(df)} results")

# Add parsed fields.
# Rows whose parsed_model_response is not a dict (e.g. null in the file) get
# a parsed_answer of None instead of raising AttributeError on `.get`.
df['parsed_answer'] = df['parsed_model_response'].apply(
    lambda x: x.get('answer') if isinstance(x, dict) else None
)
df['is_correct'] = df['parsed_answer'] == df['correct_idx']

df.head()

Unnamed: 0,question_idx,question,options,correct_idx,raw_model_response,parsed_model_response,prompt,parsed_answer,is_correct
0,28,You have an interesting drought-resistant cult...,"[Mutant 3, Mutant 2]",1,<BEGIN FINAL ANSWER>\nAnswer: 1\nConfidence: 9...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,1,True
1,163,Astronomers are studying two binary star syste...,"[~ 0.4, ~ 0.6]",0,<THINKING>\nTo determine by what factor system...,"{'is_valid': True, 'answer': 0, 'confidence': ...",Answer the following question. You must choose...,0,True


In [6]:
# Keep only the rows that have a parsed answer; rows where it is missing are
# left out of both the numerator and the denominator below.
valid_df = df.loc[df['parsed_answer'].notna()]

# Tally the scored rows and how many of them were answered correctly.
total_count = len(valid_df)
correct_flags = valid_df['is_correct']
correct_count = correct_flags.sum()
accuracy = correct_flags.mean()

print(f"Accuracy: {accuracy:.2%}")
print(f"Correct: {correct_count}/{total_count}")


Accuracy: 100.00%
Correct: 2/2
