In [1]:
import pandas as pd
import glob

def load_all_records_into_df(type):
    """Load every ``results/{type}/*.jsonl`` file into one suffixed DataFrame.

    Each file is read as JSON Lines. For every file that contains rows:

    * the ``config`` column is flattened with ``pd.json_normalize`` into
      extra ``config_<key>`` columns (the original ``config`` column is kept);
    * an ``options_str`` column holding ``str(options)`` is added;
    * ``_{type}`` is appended to every column name, so frames loaded for
      different record types do not share column names.

    Parameters
    ----------
    type : str
        Sub-folder of ``results/`` to read, e.g. ``'verdicts'``. The name
        shadows the ``type`` builtin; it is kept so existing callers that
        pass it by keyword keep working.

    Returns
    -------
    pandas.DataFrame
        Rows of all non-empty files, in sorted file-name order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.jsonl`` file with at least one row.
    """
    # glob yields paths in filesystem order; sort so row order is reproducible.
    record_files = sorted(glob.glob(f'results/{type}/*.jsonl'))

    # Load each file into its own dataframe
    dfs = []
    for file in record_files:
        df = pd.read_json(file, lines=True)
        if df.shape[0] == 0:
            # Nothing to expand in a file without rows.
            continue

        # Expand the per-row config dicts into flat, 'config_'-prefixed columns
        config_df = pd.json_normalize(df['config'])
        config_df.columns = [f'config_{col}' for col in config_df.columns]
        df = pd.concat([df, config_df], axis=1)

        # String form of the options, usable where a plain scalar is needed
        df['options_str'] = df['options'].apply(str)

        # Put the type as suffix on every column
        df.columns = [f'{col}_{type}' for col in df.columns]

        dfs.append(df)

    if not dfs:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f"No non-empty .jsonl files found under 'results/{type}/'"
        )

    # ignore_index avoids repeating the labels 0..k once per source file.
    return pd.concat(dfs, ignore_index=True)

# One frame per record family, loaded in the order verdicts -> debates -> qa.
verdict_df, debate_df, qa_df = (
    load_all_records_into_df(record_type)
    for record_type in ('verdicts', 'debates', 'qa')
)

  df = pd.concat(dfs)


In [2]:

# Attach the debate record to each verdict. A left join keeps verdict rows
# even when no debate row shares their record id.
verdict_and_debate_df = verdict_df.merge(
    debate_df,
    left_on=['record_id_verdicts'],
    right_on=['record_id_debates'],
    how='left',
)

# Attach QA rows that share the question, the stringified options, and whose
# model name equals the verdict's judge model.
merged_df = verdict_and_debate_df.merge(
    qa_df,
    left_on=['question_verdicts', 'options_str_verdicts', 'config_judge_model_verdicts'],
    right_on=['question_qa', 'options_str_qa', 'config_model_name_qa'],
    how='left',
)

# The left joins leave NaN in the right-hand success flags for unmatched rows;
# .eq(True) counts those as not successful. All three flags are tested the
# same way.
success_mask = (
    merged_df['success_verdicts'].eq(True)
    & merged_df['success_debates'].eq(True)
    & merged_df['success_qa'].eq(True)
)

# .copy() makes all_df an independent frame, so adding columns below does not
# write into a slice of merged_df (which triggers SettingWithCopyWarning).
all_df = merged_df[success_mask].copy()

# Pull the answers out of the nested response / verdict objects.
all_df['parsed_answer_qa'] = all_df['parsed_model_response_qa'].apply(
    lambda response: response['answer']
)
all_df['parsed_answer_verdicts'] = all_df['judge_verdict_verdicts'].apply(
    lambda verdict: verdict['parsed']['answer']
)

# Keep only rows where both answers are present.
all_df = all_df[all_df['parsed_answer_qa'].notna() & all_df['parsed_answer_verdicts'].notna()]




In [4]:
# Display the column labels of the merged frame.
all_df.columns

Index(['success_verdicts', 'error_message_verdicts', 'verdict_run_id_verdicts',
       'record_id_verdicts', 'debate_run_id_verdicts', 'datetime_verdicts',
       'config_verdicts', 'question_verdicts', 'options_verdicts',
       'correct_idx_verdicts', 'prompt_template_verdicts',
       'judge_verdict_verdicts', 'config_debate_run_id_verdicts',
       'config_judge_model_verdicts', 'config_judge_temperature_verdicts',
       'config_max_output_tokens_verdicts',
       'config_judge_reasoning_effort_verdicts',
       'config_judge_reasoning_max_tokens_verdicts', 'options_str_verdicts',
       'config_max_threads_verdicts', 'config_skip_qa_verdicts',
       'config_specific_record_ids_verdicts', 'config_subset_n_verdicts',
       'config_rerun_verdicts', 'run_id_debates', 'record_id_debates',
       'datetime_debates', 'config_debates', 'prompt_template_debates',
       'question_idx_debates', 'question_debates', 'options_debates',
       'correct_idx_debates', 'success_debates', 'error

In [None]:
# Columns that together identify one (record, debate config, judge config)
# combination; rows that agree on all of them are treated as duplicates.
dedupe_columns = [
    "record_id_verdicts",
    "config_debate_run_id_verdicts",
    "config_dataset_name_debates",
    "config_dataset_subset_debates",
    "config_dataset_split_debates",
    "config_debater_model_debates",
    "config_debater_temperature_debates",
    "config_random_seed_debates",
    "config_num_choices_debates",
    "config_num_turns_debates",
    "config_private_scratchpad_debates",
    "config_public_argument_word_limit_debates",
    "config_private_reasoning_word_limit_debates",
    "config_judge_model_verdicts",
    "config_judge_temperature_verdicts",
    "config_max_output_tokens_verdicts",
]

# Keep the row with the greatest datetime_verdicts in each duplicate group.
# kind='stable' makes the survivor deterministic when timestamps tie: the
# default quicksort may order equal keys arbitrarily, so keep='last' could
# otherwise pick a different row from run to run.
unique_df = (
    all_df
    .sort_values('datetime_verdicts', kind='stable')
    .drop_duplicates(subset=dedupe_columns, keep='last')
)


# unique_configs = unique_df[dedupe_columns].to_dict('records')
# pd.DataFrame(unique_configs).reset_index().rename(columns={'index': 'unique_config_id'})


Unnamed: 0,unique_config_id,record_id_verdicts,config_debate_run_id_verdicts,config_dataset_name_debates,config_dataset_subset_debates,config_dataset_split_debates,config_debater_model_debates,config_debater_temperature_debates,config_random_seed_debates,config_num_choices_debates,config_num_turns_debates,config_private_scratchpad_debates,config_public_argument_word_limit_debates,config_private_reasoning_word_limit_debates,config_judge_model_verdicts,config_judge_temperature_verdicts,config_max_output_tokens_verdicts
0,0,6ekpf86,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,openai/gpt-3.5-turbo,0.0,5000
1,1,ll8xvbp,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,openai/gpt-3.5-turbo,0.0,5000
2,2,rvog26u,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,openai/gpt-3.5-turbo,0.0,5000
3,3,3rswu8o,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,openai/gpt-3.5-turbo,0.0,5000
4,4,j6o2ftx,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,openai/gpt-3.5-turbo,0.0,5000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5233,5233,b1ox7uq,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,qwen/qwen3-32b,0.0,5000
5234,5234,lg5jfnc,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,qwen/qwen3-32b,0.0,5000
5235,5235,t5ygk7p,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,qwen/qwen3-32b,0.0,5000
5236,5236,oqqeazp,17zguxe,Idavidrein/gpqa,gpqa_diamond,train,x-ai/grok-4-fast,0.0,42.0,2.0,1.0,False,200.0,1000.0,qwen/qwen3-32b,0.0,5000


In [23]:


# Exact values a row must have in each listed config column to be kept.
config_filter = {
    'config_dataset_name_debates': 'Idavidrein/gpqa',
    'config_dataset_subset_debates': 'gpqa_diamond',
    'config_dataset_split_debates': 'train',
    'config_random_seed_debates': 42,
    'config_num_turns_debates': 1,
    'config_private_scratchpad_debates': False,
    'config_public_argument_word_limit_debates': 200,
    'config_judge_temperature_verdicts': 0.0,

    # vary
    'config_num_choices_debates': 4,
}

# Compare every filtered column against its wanted value (the Series is
# aligned on column labels), then keep rows where all comparisons hold.
wanted_values = pd.Series(config_filter)
filter_columns = list(config_filter)
column_matches = unique_df[filter_columns].eq(wanted_values, axis='columns')
mask = column_matches.all(axis=1)
filtered_df = unique_df.loc[mask]

filtered_df.shape






(2631, 89)