In [7]:
import pandas as pd
import json


In [15]:
results_file = "results/qa_results.jsonl"

# Filters (set to None to include all)
# FILTER_MODEL_NAME = "openai/gpt-4o-mini"
FILTER_MODEL_NAME = "x-ai/grok-4-fast"
FILTER_DATASET_NAME = "Idavidrein/gpqa"
FILTER_DATASET_SUBSET = "gpqa_diamond"
FILTER_DATASET_SPLIT = "train"
FILTER_RANDOM_SEED = 42
FILTER_NUM_CHOICES = 2

# Deduplication settings
REMOVE_DUPLICATES = True  # Remove duplicate runs (same question/config, keeping latest)


In [16]:
df = pd.read_json(results_file, lines=True)

# Expand config fields into separate columns
config_df = pd.json_normalize(df['config'])
config_df.columns = ['config_' + col for col in config_df.columns]
df = pd.concat([df, config_df], axis=1)

# Apply filters
df = df[(df['config_model_name'] == FILTER_MODEL_NAME) & 
        (df['config_dataset_name'] == FILTER_DATASET_NAME) & 
        (df['config_dataset_subset'] == FILTER_DATASET_SUBSET) & 
        (df['config_dataset_split'] == FILTER_DATASET_SPLIT) & 
        (df['config_random_seed'] == FILTER_RANDOM_SEED) & 
        (df['config_num_choices'] == FILTER_NUM_CHOICES)]

print(f"After filtering: {len(df)} results")

# Remove duplicates (keeping latest based on datetime)
if REMOVE_DUPLICATES:
    duplicate_cols = ['question_idx', 'config_model_name', 'config_dataset_name', 
                      'config_dataset_subset', 'config_dataset_split', 
                      'config_random_seed', 'config_num_choices']
    df = df.sort_values('datetime', ascending=False)
    df = df.drop_duplicates(subset=duplicate_cols, keep='first')
    print(f"After deduplication: {len(df)} results")

# Add parsed fields
df['parsed_answer'] = df['parsed_model_response'].apply(lambda x: x.get('answer'))
df['is_correct'] = df['parsed_answer'] == df['correct_idx']

df.head()

After filtering: 198 results
After deduplication: 198 results


Unnamed: 0,run_id,datetime,config,question_idx,question,options,correct_idx,raw_model_response,parsed_model_response,prompt,...,config_dataset_name,config_dataset_subset,config_dataset_split,config_model_name,config_num_questions,config_random_seed,config_num_choices,config_max_threads,parsed_answer,is_correct
200,hbjhnte,2025-10-28 19:56:10.806673,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",141,Consider this density matrix\n\n$\rho=\frac{1}...,"[r=(1,1,1), r=(0,0,0)]",1,<THINKING>\nThe given density matrix ρ = (1/2)...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,...,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,198,42,2,5000,1.0,True
324,hbjhnte,2025-10-28 19:56:10.806673,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",132,We have a solution which has the volume of 200...,"[2.81x10^-7 M, 6.24x10^-7 M]",1,<THINKING>\nThe solution is a phosphate buffer...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,...,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,198,42,2,5000,1.0,True
326,hbjhnte,2025-10-28 19:56:10.806673,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",22,"In an experiment, a researcher reacted ((2,2-d...","[(4-bromo-2,2-dimethylbutoxy)benzene and (3-br...",1,<THINKING>\nThe reactant is Ph-O-CH2-C(CH3)2-C...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,...,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,198,42,2,5000,1.0,True
327,hbjhnte,2025-10-28 19:56:10.806673,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",110,Complete the following reactions and select th...,"[A = ethyl 3-(3-ethyl-3,5-dimethyl-4-oxocycloh...",1,<BEGIN FINAL ANSWER>\nAnswer: 1\nConfidence: 9...,"{'is_valid': True, 'answer': 1, 'confidence': ...",Answer the following question. You must choose...,...,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,198,42,2,5000,1.0,True
328,hbjhnte,2025-10-28 19:56:10.806673,"{'dataset_name': 'Idavidrein/gpqa', 'dataset_s...",192,"In a specific region of the sky, astronomers h...","[~ r^3, ~ r^4]",0,"<THINKING>\nParallax plx ∝ 1/r. Assume ""the nu...","{'is_valid': True, 'answer': 0, 'confidence': ...",Answer the following question. You must choose...,...,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,198,42,2,5000,0.0,True


In [17]:
valid_df = df[df['parsed_answer'].notna()]
accuracy = valid_df['is_correct'].mean()
correct_count = valid_df['is_correct'].sum()
total_count = len(valid_df)

print(f"Accuracy: {accuracy:.2%}")
print(f"Correct: {correct_count}/{total_count}")
print(f"Null answers: {df['parsed_answer'].isna().sum()}")
# print the number of is_valid is false



Accuracy: 90.59%
Correct: 154/170
Null answers: 28


In [40]:
df[df['parsed_answer'].isna()].iloc[6]['raw_model_response'].split('Answer:')
# df[df['parsed_answer'].isna()].iloc[2]

["<THINKING>\nThe reaction conditions (NaNO₂, HCl, H₂O) indicate nitrosation of ketones to form α-oximino ketones, which upon hydrolysis yield 1,2-diketones. For A, the product 4-isopropylcyclohexane-1,2-dione requires starting from the corresponding ketone, 4-isopropylcyclohexan-1-one (option 0). Option 1's 4-isopropyl-2-methoxycyclohexan-1-ol is not a ketone and unlikely to yield the dione under these conditions. For B, both options match 5-methylhexan-2-one, which can undergo nitrosation at the α-position (carbon 3) to give 5-methylhexane-2,3-dione. Thus, option 0 is correct.\n</THINKING>\n\n<BEGIN FINAL ANSWER>\n",
 ' <0>\nConfidence: <100%>\nReasoning: The reaction converts ketones to 1,2-diketones via nitrosation and hydrolysis. A must be 4-isopropylcyclohexan-1-one to yield the cyclic dione; the alternative in option 1 is not suitable. B is 5-methylhexan-2-one in both, fitting the acyclic dione.\n</END FINAL ANSWER>']

In [14]:
df.columns

Index(['run_id', 'datetime', 'config', 'question_idx', 'question', 'options',
       'correct_idx', 'raw_model_response', 'parsed_model_response', 'prompt',
       'token_usage', 'config_dataset_name', 'config_dataset_subset',
       'config_dataset_split', 'config_model_name', 'config_num_questions',
       'config_random_seed', 'config_num_choices', 'config_max_threads',
       'parsed_answer', 'is_correct'],
      dtype='object')