### Setup

In [1]:
# magic reload
%load_ext autoreload
%autoreload 2

import pandas as pd
import glob
from utils.analysis2_utils import *


In [2]:


def load_all_records_into_df(type):
    """Load every JSONL record file under ``results/<type>/`` into one DataFrame.

    Each file is read with ``pd.read_json(..., lines=True)``; files that yield
    zero rows are skipped. For every remaining file:

    * the ``config`` column is flattened with ``pd.json_normalize`` and the
      resulting columns are appended with a ``config_`` prefix,
    * an ``options_str`` column holding ``str(options)`` is added,
    * every column name gets ``_<type>`` appended.

    Parameters
    ----------
    type : str
        Sub-folder of ``results`` to read; also used as the column suffix.
        (The name shadows the builtin ``type``; it is kept so existing
        positional and keyword callers keep working.)

    Returns
    -------
    pandas.DataFrame
        All non-empty files concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no non-empty ``*.jsonl`` file is found in ``results/<type>/``.
    """
    # glob returns paths in arbitrary order; sort them so the row order of the
    # result is reproducible between runs and machines.
    record_files = sorted(glob.glob(f'results/{type}/*.jsonl'))

    # Load each record file into its own dataframe
    dfs = []
    for file in record_files:
        df = pd.read_json(file, lines=True)
        if df.shape[0] == 0:
            continue

        # Expand the nested config dicts into flat, 'config_'-prefixed columns
        config_df = pd.json_normalize(df['config'])
        config_df.columns = ['config_' + col for col in config_df.columns]
        df = pd.concat([df, config_df], axis=1)

        # String form of the options, added alongside the original column
        df['options_str'] = df['options'].apply(str)

        # Put the type as suffix on every column
        df.columns = [col + '_' + type for col in df.columns]

        dfs.append(df)

    if not dfs:
        # pd.concat([]) would raise a ValueError with no hint about the cause.
        raise ValueError(f"No non-empty record files found in 'results/{type}/'")

    # ignore_index: every file carries its own 0..n-1 index, so a plain concat
    # would leave duplicate index labels in the combined frame.
    return pd.concat(dfs, ignore_index=True)

# Load the three record kinds; each frame's columns carry its kind as suffix
# (e.g. `record_id_verdicts`, `record_id_debates`, `record_id_qa`).
verdict_df = load_all_records_into_df(type='verdicts')
debate_df = load_all_records_into_df(type='debates')
qa_df = load_all_records_into_df(type='qa')

  df = pd.concat(dfs)


In [12]:
# Show the column names of the loaded QA records (all carry the `_qa` suffix).
qa_df.columns

Index(['run_id_qa', 'datetime_qa', 'config_qa', 'question_idx_qa',
       'question_qa', 'options_qa', 'correct_idx_qa', 'raw_model_response_qa',
       'parsed_model_response_qa', 'prompt_qa', 'token_usage_qa',
       'record_id_qa', 'prompt_template_qa', 'internal_model_reasoning_qa',
       'internal_model_reasoning_details_qa', 'success_qa', 'error_message_qa',
       'config_dataset_name_qa', 'config_dataset_subset_qa',
       'config_dataset_split_qa', 'config_model_name_qa',
       'config_temperature_qa', 'config_num_questions_qa',
       'config_random_seed_qa', 'config_num_choices_qa',
       'config_max_threads_qa', 'config_specific_question_idxs_qa',
       'config_rerun_qa', 'options_str_qa'],
      dtype='object')

In [87]:
# Merge verdicts with their debates and with the standalone QA answers of the
# judge model and of the debater model.


def _answer_from(parsed):
    """Return ``parsed['answer']`` when ``parsed`` is a dict, otherwise ``None``."""
    return parsed.get('answer') if isinstance(parsed, dict) else None


def _verdict_answer_from(judge_verdict):
    """Return ``judge_verdict['parsed']['answer']``; ``None`` if either level is missing or not a dict."""
    if not isinstance(judge_verdict, dict):
        return None
    return _answer_from(judge_verdict.get('parsed'))


# Left join: every verdict row is kept, with the debate columns attached
# wherever `record_id_debates` equals `record_id_verdicts`.
verdict_and_debate_df = verdict_df.merge(debate_df, left_on=['record_id_verdicts'], right_on=['record_id_debates'], how='left')

# Two differently-suffixed copies of the QA frame so it can be joined twice:
# once keyed on the judge model and once keyed on the debater model.
judge_qa_df = qa_df.copy()
judge_qa_df.columns = [col + '_judge' for col in qa_df.columns]

debater_qa_df = qa_df.copy()
debater_qa_df.columns = [col + '_debater' for col in qa_df.columns]

# NOTE(review): if qa_df holds more than one row per (question, options, model)
# these left joins multiply the verdict rows — confirm that is intended.
all_df = verdict_and_debate_df.merge(
    judge_qa_df[['question_qa_judge', 'options_str_qa_judge', 'config_model_name_qa_judge', 'parsed_model_response_qa_judge', 'success_qa_judge']],
    left_on=['question_verdicts', 'options_str_verdicts', 'config_judge_model_verdicts'],
    right_on=['question_qa_judge', 'options_str_qa_judge', 'config_model_name_qa_judge'],
    how='left'
)

all_df = all_df.merge(
    debater_qa_df[['question_qa_debater', 'options_str_qa_debater', 'config_model_name_qa_debater', 'parsed_model_response_qa_debater', 'success_qa_debater']],
    left_on=['question_verdicts', 'options_str_verdicts', 'config_debater_model_debates'],
    right_on=['question_qa_debater', 'options_str_qa_debater', 'config_model_name_qa_debater'],
    how='left',
    suffixes=('', '_debater')
)

# # Filter by success
# all_df = all_df[all_df['success_verdicts'] & (all_df['success_debates'] == True) & (all_df['success_qa_judge'] == True) & (all_df['success_qa_debater'] == True)]

# Extract the answers. Anything that is not a dict (NaN from an unmatched left
# join, a missing/None 'parsed' entry, ...) becomes None instead of raising.
all_df['parsed_answer_qa_judge'] = all_df['parsed_model_response_qa_judge'].apply(_answer_from)
all_df['parsed_answer_qa_debater'] = all_df['parsed_model_response_qa_debater'].apply(_answer_from)
all_df['parsed_answer_verdicts'] = all_df['judge_verdict_verdicts'].apply(_verdict_answer_from)

# Keep only rows where all three answers are present. The explicit copy makes
# the result an independent frame, so the column assignments below do not
# write into a slice of the merged frame (SettingWithCopyWarning).
all_df = all_df[
    all_df['parsed_answer_qa_judge'].notnull()
    & all_df['parsed_answer_qa_debater'].notnull()
    & all_df['parsed_answer_verdicts'].notnull()
].copy()

# Add the correctness columns: each answer compared with the verdict record's correct index
all_df['is_correct_qa_judge'] = all_df['parsed_answer_qa_judge'] == all_df['correct_idx_verdicts']
all_df['is_correct_qa_debater'] = all_df['parsed_answer_qa_debater'] == all_df['correct_idx_verdicts']
all_df['is_correct_verdict'] = all_df['parsed_answer_verdicts'] == all_df['correct_idx_verdicts']

In [None]:
# Dedupe: keep one row per (record, debate config, judge config) combination.

# Columns that together identify one configuration of one record.
dedupe_columns = [
    "record_id_verdicts",
    "config_debate_run_id_verdicts",
    "config_dataset_name_debates", 
    "config_dataset_subset_debates", 
    "config_dataset_split_debates", 
    "config_debater_model_debates", 
    "config_debater_temperature_debates", 
    "config_random_seed_debates", 
    "config_num_choices_debates", 
    "config_num_turns_debates", 
    # "config_private_scratchpad_debates",
    "config_public_argument_word_limit_debates",
    # "config_private_reasoning_word_limit_debates",
    "config_judge_model_verdicts", 
    "config_judge_temperature_verdicts", 
    "config_max_output_tokens_verdicts"
]

# Keep the row with the latest `datetime_verdicts` per key. The default
# quicksort is not stable, so rows with equal timestamps could be reordered
# arbitrarily and `keep='last'` would then pick a different row from run to
# run; a stable sort keeps tied rows in their original order.
unique_df = (
    all_df
    .sort_values('datetime_verdicts', kind='mergesort')
    .drop_duplicates(subset=dedupe_columns, keep='last')
)


# unique_configs = unique_df[dedupe_columns].to_dict('records')
# pd.DataFrame(unique_configs).reset_index().rename(columns={'index': 'unique_config_id'})


In [124]:
# Filter down to just the configuration we want to work with.

# Column -> required value; a row is kept only if it matches every entry.
config_filter = {
    'config_dataset_name_debates': 'Idavidrein/gpqa',
    'config_dataset_subset_debates': 'gpqa_diamond',
    'config_dataset_split_debates': 'train',
    'config_random_seed_debates': 42,
    'config_num_turns_debates': 1,
    'config_private_scratchpad_debates': False,
    'config_public_argument_word_limit_debates': 200,
    'config_judge_temperature_verdicts': 0.0,
}

# Compare each filtered column with its required value (the Series is aligned
# on the column labels), then require a match in every column of the row.
mask = (
    unique_df[list(config_filter)]
    .eq(pd.Series(config_filter))
    .all(axis=1)
)

df = unique_df.loc[mask]


In [125]:
# Rows per number of answer choices, among deduplicated rows judged by gpt-4o-mini.
(
    unique_df
    .loc[unique_df['config_judge_model_verdicts'].eq('openai/gpt-4o-mini'), 'config_num_choices_debates']
    .value_counts()
)

config_num_choices_debates
2.0    2484
4.0    1601
Name: count, dtype: int64

In [126]:
# Rows per verdict run, among the config-filtered rows (`df`) judged by
# gpt-4o-mini with 2 answer choices.
# Both conditions are built on `df` itself. Mixing a condition on `unique_df`
# with one on `df` (a subset with a different index) relies on implicit index
# alignment and triggers pandas' "Boolean Series key will be reindexed"
# warning; it selects the same rows as this explicit form.
df.loc[
    (df['config_judge_model_verdicts'] == 'openai/gpt-4o-mini')
    & (df['config_num_choices_debates'] == 2),
    'verdict_run_id_verdicts',
].value_counts()

  unique_df[(unique_df['config_judge_model_verdicts'] == 'openai/gpt-4o-mini') & (df['config_num_choices_debates'] == 2)]['verdict_run_id_verdicts'].value_counts()


verdict_run_id_verdicts
k26y5y6    192
uh2gayc    190
qwf1c51      1
Name: count, dtype: int64

### Analyze accuracy

In [94]:
# Aggregate the two-choice debates, grouped by judge model.
acc_df = aggregate_by_fields(
    df.loc[df['config_num_choices_debates'].eq(2)],
    ['config_judge_model_verdicts'],
)

acc_df

# from utils.analysis_utils import *
# plot_results_by_name(acc_df)

Unnamed: 0,name,debater_qa_acc,judge_qa_acc,verdict_acc,debater_qa_n_correct,judge_qa_n_correct,verdict_n_correct,n_total,verdict_minus_judge_qa,pgr,debater_minus_judge_qa
0,google/gemma-3-12b-it,0.914286,0.588571,0.588571,160,103,103,175,0.0,0.0,0.325714
1,google/gemma-3-27b-it,0.937173,0.638743,0.65445,179,122,125,191,0.015707,0.052632,0.298429
2,meta-llama/llama-3-8b-instruct,0.922652,0.524862,0.491713,167,95,89,181,-0.033149,-0.083333,0.39779
3,meta-llama/llama-3.1-405b-instruct,0.927083,0.703125,0.635417,178,135,122,192,-0.067708,-0.302326,0.223958
4,meta-llama/llama-3.1-70b-instruct,0.913978,0.645161,0.634409,170,120,118,186,-0.010753,-0.04,0.268817
5,meta-llama/llama-3.1-8b-instruct,0.932886,0.530201,0.483221,139,79,72,149,-0.04698,-0.116667,0.402685
6,meta-llama/llama-3.3-70b-instruct,0.939891,0.699454,0.715847,172,128,131,183,0.016393,0.068182,0.240437
7,meta-llama/llama-4-maverick,0.912698,0.81746,0.81746,115,103,103,126,0.0,0.0,0.095238
8,meta-llama/llama-4-scout,0.937143,0.697143,0.72,164,122,126,175,0.022857,0.095238,0.24
9,openai/gpt-3.5-turbo,0.925926,0.492063,0.444444,175,93,84,189,-0.047619,-0.109756,0.433862
