# Debate Results Analysis

Analyze accuracy across the following conditions (average confidence levels and debate turn counts are also summarized further down):
- Debater Direct QA
- Judge Direct QA
- Judge After Interactive Debate
- Judge After Non-Interactive Debate


In [2]:
import pandas as pd
import glob


In [None]:
# Load the most recent master results CSV (now in subdirectories)
def timestamp_key(path):
    """Return the trailing (date, time) fields of a results path as a sort key.

    The path is split from the right on its last two underscores, so a name
    such as ``..._20251016_122310.csv`` yields ``('20251016', '122310.csv')``.
    Comparing only the final field (the time of day) would rank a file from an
    earlier day with a later clock time as "newer"; including the date field
    fixes that.

    NOTE(review): lexicographic ordering assumes the date and time fields are
    fixed-width, zero-padded digits -- confirm against the code that writes
    these files.
    """
    return tuple(path.rsplit('_', 2)[-2:])


csv_files = glob.glob('test_debate_results/*/master_results_*.csv')
latest_csv = max(csv_files, key=timestamp_key) if csv_files else None

if latest_csv:
    print(f"Loading: {latest_csv}")
    df = pd.read_csv(latest_csv)
    print(f"Total debates: {len(df)}")
else:
    # `df` is not assigned on this path, so cells that read `df` cannot run
    # until a results file is available.
    print("No master results CSV found")


Loading: test_debate_results/master_results_seed1100_1200_n2_turns5_20251016_122310.csv
Total debates: 2


In [6]:
# Calculate accuracies
n_total = len(df)

# Display label -> results column that is summed to count correct answers
# (presumably a 0/1 or boolean flag per debate -- confirm against the CSV).
correct_columns = {
    'Debater Direct QA': 'debater_direct_correct',
    'Judge Direct QA': 'judge_direct_correct',
    'Judge After Interactive Debate': 'interactive_judge_after_debate_correct',
    'Judge After Non-Interactive Debate': 'non_interactive_judge_after_debate_correct',
}

# Column sum divided by the number of rows, expressed as a percentage.
accuracies = {
    label: (df[column].sum() / n_total * 100)
    for label, column in correct_columns.items()
}

divider = "=" * 50
print(f"\nAccuracies across {n_total} debates:\n")
print(divider)
for label, pct in accuracies.items():
    print(f"{label:40s}: {pct:5.1f}%")
print(divider)



Accuracies across 2 debates:

Debater Direct QA                       : 100.0%
Judge Direct QA                         : 100.0%
Judge After Interactive Debate          : 100.0%
Judge After Non-Interactive Debate      : 100.0%


In [7]:
# Show as DataFrame
# One row per (condition, accuracy) pair from the `accuracies` dict, in
# insertion order.
results_df = pd.DataFrame(
    list(accuracies.items()),
    columns=['Condition', 'Accuracy (%)'],
)
results_df


Unnamed: 0,Condition,Accuracy (%)
0,Debater Direct QA,100.0
1,Judge Direct QA,100.0
2,Judge After Interactive Debate,100.0
3,Judge After Non-Interactive Debate,100.0


In [8]:
# Average confidence levels
# (display label, results column) pairs, printed in this order.
confidence_columns = [
    ('Debater Direct QA', 'debater_confidence'),
    ('Judge Direct QA', 'judge_confidence'),
    ('Judge After Interactive Debate', 'interactive_judge_after_debate_confidence'),
    ('Judge After Non-Interactive Debate', 'non_interactive_judge_after_debate_confidence'),
]

rule = "=" * 50
print("\nAverage Confidence Levels:\n")
print(rule)
for label, column in confidence_columns:
    mean_confidence = df[column].mean()
    print(f"{label:40s}: {mean_confidence:.1f}%")
print(rule)



Average Confidence Levels:

Debater Direct QA                       : 100.0%
Judge Direct QA                         : 85.0%
Judge After Interactive Debate          : 87.5%
Judge After Non-Interactive Debate      : 87.5%


In [9]:
# Compare debate turns
# (display label, results column) pairs for the two debate formats.
turn_columns = (
    ('Interactive Debates', 'interactive_debate_turns'),
    ('Non-Interactive Debates', 'non_interactive_debate_turns'),
)

separator = "=" * 50
print("\nAverage Debate Turns:\n")
print(separator)
for label, column in turn_columns:
    mean_turns = df[column].mean()
    print(f"{label:40s}: {mean_turns:.1f} turns")
print(separator)



Average Debate Turns:

Interactive Debates                     : 4.0 turns
Non-Interactive Debates                 : 4.5 turns
