In [143]:
import pandas as pd

### Analyze direct QA

In [174]:
def load_qa_results(filters=None, path="results/qa_results.jsonl"):
    """Load direct-QA results, keeping the latest attempt per question.

    Reads a JSON-lines file, flattens each record's ``config`` dict into
    ``config_*`` columns, and adds three derived columns:

    * ``options_str``   -- ``str(options)``, used as a hashable de-dup/merge key
    * ``parsed_answer`` -- ``parsed_model_response['answer']`` (None if absent)
    * ``is_correct``    -- ``parsed_answer == correct_idx``

    Args:
        filters: Optional mapping ``{column: value}``; only rows where
            ``df[column] == value`` are kept. Filters on raw / ``config_*``
            columns are applied *before* de-duplication, so a newer run made
            under a different configuration cannot evict the row that the
            filter is asking for. Filters on the derived columns
            ``parsed_answer`` / ``is_correct`` are applied last.
        path: Location of the JSON-lines results file.

    Returns:
        DataFrame with one row per (question, options_str, config_model_name),
        the most recent by ``datetime``, restricted to rows whose answer could
        be parsed.

    Raises:
        KeyError: If ``filters`` names a column that does not exist.
    """
    df = pd.read_json(path, lines=True)

    # Flatten the nested run configuration into `config_*` columns.
    config_df = pd.json_normalize(df['config'])
    config_df.columns = ['config_' + col for col in config_df.columns]
    df = pd.concat([df, config_df], axis=1)

    # Lists are unhashable; the string form serves as the de-dup key.
    df['options_str'] = df['options'].apply(str)

    # A response that failed to parse may be null rather than a dict.
    df['parsed_answer'] = df['parsed_model_response'].apply(
        lambda x: x.get('answer') if isinstance(x, dict) else None)
    df['is_correct'] = df['parsed_answer'] == df['correct_idx']

    filters = dict(filters or {})
    unknown = [col for col in filters if col not in df.columns]
    if unknown:
        raise KeyError(f"Unknown filter column(s): {unknown}")

    derived_cols = ('parsed_answer', 'is_correct')

    # Select the requested configuration first, then pick the latest attempt.
    for col, val in filters.items():
        if col not in derived_cols:
            df = df[df[col] == val]

    df = df.sort_values('datetime', ascending=False).drop_duplicates(
        subset=['question', 'options_str', 'config_model_name'], keep='first')

    # The latest attempt wins even when it is unparseable; such rows are dropped.
    df = df[df['parsed_answer'].notna()]

    for col, val in filters.items():
        if col in derived_cols:
            df = df[df[col] == val]

    # Return an independent frame so callers can add columns without warnings.
    return df.copy()

In [175]:
# Configuration slice used for the direct-QA analysis below.
# Alternative model to compare against: "x-ai/grok-4-fast"
filters = dict(
    config_model_name="openai/gpt-4o-mini",
    config_dataset_name="Idavidrein/gpqa",
    config_dataset_subset="gpqa_diamond",
    config_dataset_split="train",
    config_random_seed=42,
    config_num_choices=2,
)


In [177]:
# Score the direct-QA results selected by `filters`.
df = load_qa_results(filters)

# Only rows with a parsed answer count towards the denominator.
valid_df = df.loc[df['parsed_answer'].notna()]
total_count = len(valid_df)
correct_count = valid_df['is_correct'].sum()
accuracy = valid_df['is_correct'].mean()

print(
    f"Accuracy: {accuracy:.2%}",
    f"Correct: {correct_count}/{total_count}",
    sep="\n",
)

Accuracy: 56.85%
Correct: 112/197


### Analyze debate

In [178]:
def load_debate_results(run_id, results_dir="results/debate"):
    """Load the judge verdicts of one debate run and mark them correct/incorrect.

    Reads ``{results_dir}/{run_id}.jsonl``, flattens each record's ``config``
    dict into ``config_*`` columns, and adds:

    * ``options_str``   -- ``str(options)``, a hashable key for merging
    * ``parsed_answer`` -- ``judge_verdict['parsed']['answer']``
    * ``is_correct``    -- ``parsed_answer == correct_idx``

    Rows whose verdict has no parsed answer (missing/null ``judge_verdict``,
    missing/null ``parsed``, or missing/null ``answer``) are dropped instead
    of being counted as incorrect.

    Args:
        run_id: Identifier of the run; used as the file stem.
        results_dir: Directory containing the per-run JSON-lines files.

    Returns:
        DataFrame with one row per debate that has a parsed judge answer.
    """
    def _judge_answer(verdict):
        # Any missing or null level yields None so `notna()` can filter it out.
        parsed = verdict.get('parsed') if isinstance(verdict, dict) else None
        return parsed.get('answer') if isinstance(parsed, dict) else None

    results_file = f"{results_dir}/{run_id}.jsonl"
    df = pd.read_json(results_file, lines=True)

    # Flatten the nested run configuration into `config_*` columns.
    config_df = pd.json_normalize(df['config'])
    config_df.columns = ['config_' + col for col in config_df.columns]
    df = pd.concat([df, config_df], axis=1)

    df['options_str'] = df['options'].apply(str)

    df['parsed_answer'] = df['judge_verdict'].apply(_judge_answer)
    # Copy so the column assignment below does not write into a slice.
    df = df[df['parsed_answer'].notna()].copy()
    df['is_correct'] = df['parsed_answer'] == df['correct_idx']

    return df

In [180]:
# Score the judge verdicts of a single debate run.
run_id = 'ey2anfr'
df = load_debate_results(run_id)

# Only rows with a parsed judge answer count towards the denominator.
valid_df = df.loc[df['parsed_answer'].notna()]
total_count = len(valid_df)
correct_count = valid_df['is_correct'].sum()
accuracy = valid_df['is_correct'].mean()

print(
    f"Accuracy: {accuracy:.2%}",
    f"Correct: {correct_count}/{total_count}",
    sep="\n",
)

Accuracy: 74.75%
Correct: 148/198


### Compare QA and Debate

In [181]:
run_id = 'ey2anfr'

# Direct-QA results for all models (no filter), reduced to the merge keys
# plus the correctness flag.
qa_df = load_qa_results(filters=None)[
    ['question', 'options_str', 'config_model_name', 'is_correct']]

debate_df = load_debate_results(run_id)

# Attach the direct-QA correctness of the judge model and of the debater model
# to every debate row, then keep only rows where all three flags are present.
merged_df = (
    debate_df
    .merge(
        qa_df,
        how='left',
        left_on=['question', 'options_str', 'config_judge_model'],
        right_on=['question', 'options_str', 'config_model_name'],
        suffixes=('', '_judge_qa'),
    )
    .merge(
        qa_df,
        how='left',
        left_on=['question', 'options_str', 'config_debater_model'],
        right_on=['question', 'options_str', 'config_model_name'],
        suffixes=('', '_debater_qa'),
    )
    .dropna(subset=['is_correct_judge_qa', 'is_correct_debater_qa', 'is_correct'])
)

# Accuracy of each setting on the shared set of questions.
print(f'Judge QA: {merged_df["is_correct_judge_qa"].mean():.2%} ({merged_df["is_correct_judge_qa"].sum():.0f}/{merged_df["is_correct_judge_qa"].count()})')
print(f'Debater QA: {merged_df["is_correct_debater_qa"].mean():.2%} ({merged_df["is_correct_debater_qa"].sum():.0f}/{merged_df["is_correct_debater_qa"].count()})')
print(f'Debate Verdict: {merged_df["is_correct"].mean():.2%} ({merged_df["is_correct"].sum()}/{merged_df["is_correct"].count()})')


Judge QA: 56.41% (110/195)
Debater QA: 91.79% (179/195)
Debate Verdict: 74.87% (146/195)
