# Debate Results Analysis

Analyze accuracy across different conditions:
- Debater Direct QA
- Judge Direct QA
- Judge After Interactive Debate
- Judge After Non-Interactive Debate


In [16]:
import pandas as pd
import glob


In [31]:
# Load the most recent master results CSV (now in subdirectories)
# latest_csv = 'test_debate_results/master_results_random_n100_20251016_132254/master_results_random_n100_20251016_132254.csv'
latest_csv = 'test_debate_results/master_results_random_n100_20251017_091329/master_results_random_n100_20251017_091329.csv'

if latest_csv:
    print(f"Loading: {latest_csv}")
    df = pd.read_csv(latest_csv)
    print(f"Total debates: {len(df)}")
else:
    print("No master results CSV found")


Loading: test_debate_results/master_results_random_n100_20251017_091329/master_results_random_n100_20251017_091329.csv
Total debates: 97


In [32]:
# Data Summary: Category Counts
print("\n" + "="*60)
print("DATA SUMMARY")
print("="*60)
print(f"Total debates: {len(df)}")
print()
print("Direct QA Modes:")
print(f"  Debater Direct QA entries      : {df['debater_direct_correct'].notna().sum()}")
print(f"  Judge Direct QA entries        : {df['judge_direct_correct'].notna().sum()}")
print()
print("Debate Modes:")
print(f"  Interactive Debate entries     : {df['interactive_judge_after_debate_correct'].notna().sum()}")
print(f"  Non-Interactive Debate entries : {df['non_interactive_judge_after_debate_correct'].notna().sum()}")
print("="*60)



DATA SUMMARY
Total debates: 97

Direct QA Modes:
  Debater Direct QA entries      : 97
  Judge Direct QA entries        : 97

Debate Modes:
  Interactive Debate entries     : 85
  Non-Interactive Debate entries : 75


In [35]:
# Calculate accuracies with proper handling of missing values
n_total = len(df)

# Count valid (non-empty) entries for each category
counts = {
    'Debater Direct QA': df['debater_direct_correct'].notna().sum(),
    'Judge Direct QA': df['judge_direct_correct'].notna().sum(),
    'Judge After Interactive Debate': df['interactive_judge_after_debate_correct'].notna().sum(),
    'Judge After Non-Interactive Debate': df['non_interactive_judge_after_debate_correct'].notna().sum()
}

# Calculate accuracies only from valid entries
accuracies = {
    'Debater Direct QA': (df['debater_direct_correct'].sum() / counts['Debater Direct QA'] * 100),
    'Judge Direct QA': (df['judge_direct_correct'].sum() / counts['Judge Direct QA'] * 100),
    'Judge After Interactive Debate': (df['interactive_judge_after_debate_correct'].sum() / counts['Judge After Interactive Debate'] * 100),
    'Judge After Non-Interactive Debate': (df['non_interactive_judge_after_debate_correct'].sum() / counts['Judge After Non-Interactive Debate'] * 100)
}

print(f"\nAccuracies across {n_total} debates:\n")
print("="*60)
for condition in accuracies.keys():
    print(f"{condition:40s}: {accuracies[condition]:5.1f}% (n={counts[condition]})")
print("="*60)



Accuracies across 97 debates:

Debater Direct QA                       :  55.7% (n=97)
Judge Direct QA                         :  57.7% (n=97)
Judge After Interactive Debate          :  60.0% (n=85)
Judge After Non-Interactive Debate      :  56.0% (n=75)


In [36]:
# Show as DataFrame
results_df = pd.DataFrame([
    {'Condition': k, 'Accuracy (%)': v} for k, v in accuracies.items()
])
results_df


Unnamed: 0,Condition,Accuracy (%)
0,Debater Direct QA,55.670103
1,Judge Direct QA,57.731959
2,Judge After Interactive Debate,60.0
3,Judge After Non-Interactive Debate,56.0


In [37]:
# Average confidence levels with counts
print("\nAverage Confidence Levels:\n")
print("="*60)

confidence_cols = {
    'Debater Direct QA': 'debater_confidence',
    'Judge Direct QA': 'judge_confidence',
    'Judge After Interactive Debate': 'interactive_judge_after_debate_confidence',
    'Judge After Non-Interactive Debate': 'non_interactive_judge_after_debate_confidence'
}

for label, col in confidence_cols.items():
    valid_count = df[col].notna().sum()
    avg_conf = df[col].mean()  # .mean() automatically ignores NaN
    print(f"{label:40s}: {avg_conf:.1f}% (n={valid_count})")
    
print("="*60)



Average Confidence Levels:

Debater Direct QA                       : 85.6% (n=97)
Judge Direct QA                         : 85.6% (n=97)
Judge After Interactive Debate          : 80.1% (n=85)
Judge After Non-Interactive Debate      : 81.1% (n=75)


In [38]:
# Compare confidence on correct vs incorrect answers
print("\nConfidence Levels: Correct vs Incorrect Answers:\n")
print("="*80)

categories = [
    ('Debater Direct QA', 'debater_direct_correct', 'debater_confidence'),
    ('Judge Direct QA', 'judge_direct_correct', 'judge_confidence'),
    ('Judge After Interactive Debate', 'interactive_judge_after_debate_correct', 'interactive_judge_after_debate_confidence'),
    ('Judge After Non-Interactive Debate', 'non_interactive_judge_after_debate_correct', 'non_interactive_judge_after_debate_confidence')
]

for label, correct_col, conf_col in categories:
    # Filter for correct and incorrect answers with valid confidence values
    correct_mask = (df[correct_col] == True) & (df[conf_col].notna())
    incorrect_mask = (df[correct_col] == False) & (df[conf_col].notna())
    
    correct_count = correct_mask.sum()
    incorrect_count = incorrect_mask.sum()
    
    correct_conf = df.loc[correct_mask, conf_col].mean() if correct_count > 0 else 0
    incorrect_conf = df.loc[incorrect_mask, conf_col].mean() if incorrect_count > 0 else 0
    
    print(f"\n{label}:")
    print(f"  Correct   (n={correct_count:2d}): {correct_conf:5.1f}% confidence")
    print(f"  Incorrect (n={incorrect_count:2d}): {incorrect_conf:5.1f}% confidence")
    
    if correct_count > 0 and incorrect_count > 0:
        diff = correct_conf - incorrect_conf
        print(f"  Difference: {diff:+5.1f}% (correct - incorrect)")

print("\n" + "="*80)



Confidence Levels: Correct vs Incorrect Answers:


Debater Direct QA:
  Correct   (n=54):  87.0% confidence
  Incorrect (n=43):  83.8% confidence
  Difference:  +3.2% (correct - incorrect)

Judge Direct QA:
  Correct   (n=56):  85.7% confidence
  Incorrect (n=41):  85.4% confidence
  Difference:  +0.3% (correct - incorrect)

Judge After Interactive Debate:
  Correct   (n=51):  79.9% confidence
  Incorrect (n=34):  80.4% confidence
  Difference:  -0.5% (correct - incorrect)

Judge After Non-Interactive Debate:
  Correct   (n=42):  81.4% confidence
  Incorrect (n=33):  80.6% confidence
  Difference:  +0.8% (correct - incorrect)



In [39]:
# Analyze which questions each category answered
print("\nQuestion Coverage Analysis:\n")
print("="*80)

# Create boolean masks for which questions were answered by each category
has_debater = df['debater_direct_correct'].notna()
has_judge = df['judge_direct_correct'].notna()
has_interactive = df['interactive_judge_after_debate_correct'].notna()
has_non_interactive = df['non_interactive_judge_after_debate_correct'].notna()

# Questions answered by all categories
all_answered = has_debater & has_judge & has_interactive & has_non_interactive
print(f"Questions answered by ALL 4 categories: {all_answered.sum()}")

# Questions with at least one missing
any_missing = ~all_answered
print(f"Questions with at least one category missing: {any_missing.sum()}")

# Show which categories have missing data
print(f"\nMissing by category:")
print(f"  Debater Direct QA missing       : {(~has_debater).sum()}")
print(f"  Judge Direct QA missing         : {(~has_judge).sum()}")
print(f"  Interactive Debate missing      : {(~has_interactive).sum()}")
print(f"  Non-Interactive Debate missing  : {(~has_non_interactive).sum()}")

print("="*80)



Question Coverage Analysis:

Questions answered by ALL 4 categories: 65
Questions with at least one category missing: 32

Missing by category:
  Debater Direct QA missing       : 0
  Judge Direct QA missing         : 0
  Interactive Debate missing      : 12
  Non-Interactive Debate missing  : 22


In [40]:
# Compare accuracy on questions that ALL categories answered vs ALL questions
print("\nAccuracy Comparison: Complete Questions Only vs All Questions\n")
print("="*80)

# For questions answered by all 4 categories
complete_df = df[all_answered]
complete_accuracies = {
    'Debater Direct QA': (complete_df['debater_direct_correct'].sum() / len(complete_df) * 100),
    'Judge Direct QA': (complete_df['judge_direct_correct'].sum() / len(complete_df) * 100),
    'Interactive Debate': (complete_df['interactive_judge_after_debate_correct'].sum() / len(complete_df) * 100),
    'Non-Interactive Debate': (complete_df['non_interactive_judge_after_debate_correct'].sum() / len(complete_df) * 100)
}

print(f"On {len(complete_df)} questions answered by ALL categories:\n")
for category, acc in complete_accuracies.items():
    print(f"  {category:35s}: {acc:5.1f}%")

print(f"\nOn ALL questions (with valid answers per category):\n")
for category, acc in accuracies.items():
    print(f"  {category:35s}: {acc:5.1f}%")

print(f"\nDifference (Complete Only - All Questions):\n")
for category in accuracies.keys():
    diff = complete_accuracies[category] - accuracies[category]
    direction = "harder" if diff < 0 else "easier"
    print(f"  {category:35s}: {diff:+5.1f}% (complete questions are {direction})")

print("="*80)



Accuracy Comparison: Complete Questions Only vs All Questions

On 65 questions answered by ALL categories:

  Debater Direct QA                  :  58.5%
  Judge Direct QA                    :  60.0%
  Interactive Debate                 :  56.9%
  Non-Interactive Debate             :  58.5%

On ALL questions (with valid answers per category):

  Debater Direct QA                  :  55.7%
  Judge Direct QA                    :  57.7%
  Judge After Interactive Debate     :  60.0%
  Judge After Non-Interactive Debate :  56.0%

Difference (Complete Only - All Questions):

  Debater Direct QA                  :  +2.8% (complete questions are easier)
  Judge Direct QA                    :  +2.3% (complete questions are easier)


KeyError: 'Judge After Interactive Debate'

In [None]:
# Show accuracy for each category on rows where ONLY that category has valid data
print("\nAccuracy on Questions Unique to Each Category:\n")
print("="*80)

categories_data = [
    ('Debater Direct QA', 'debater_direct_correct', has_debater),
    ('Judge Direct QA', 'judge_direct_correct', has_judge),
    ('Interactive Debate', 'interactive_judge_after_debate_correct', has_interactive),
    ('Non-Interactive Debate', 'non_interactive_judge_after_debate_correct', has_non_interactive)
]

for label, col, has_data in categories_data:
    # Questions where THIS category has data but at least one OTHER doesn't
    unique_questions = has_data & (~all_answered)
    if unique_questions.sum() > 0:
        unique_df = df[unique_questions]
        unique_acc = unique_df[col].sum() / len(unique_df) * 100
        print(f"{label:35s}: {unique_acc:5.1f}% accuracy on {unique_questions.sum()} questions with incomplete data")
    else:
        print(f"{label:35s}: No questions unique to this category")

print("="*80)


In [None]:
# Show correct/incorrect counts for each category (only on rows with valid answers)
print("\nCorrect vs Incorrect Breakdown (Valid Answers Only):\n")
print("="*80)

categories_breakdown = [
    ('Debater Direct QA', 'debater_direct_correct'),
    ('Judge Direct QA', 'judge_direct_correct'),
    ('Interactive Debate', 'interactive_judge_after_debate_correct'),
    ('Non-Interactive Debate', 'non_interactive_judge_after_debate_correct')
]

for label, col in categories_breakdown:
    valid_data = df[col].notna()
    correct_count = (df[valid_data][col] == True).sum()
    incorrect_count = (df[valid_data][col] == False).sum()
    total = correct_count + incorrect_count
    
    print(f"\n{label}:")
    print(f"  Correct  : {correct_count:3d} / {total:3d} ({correct_count/total*100:5.1f}%)")
    print(f"  Incorrect: {incorrect_count:3d} / {total:3d} ({incorrect_count/total*100:5.1f}%)")

print("\n" + "="*80)


In [22]:
# Compare debate turns with counts
print("\nAverage Debate Turns:\n")
print("="*60)

interactive_count = df['interactive_debate_turns'].notna().sum()
non_interactive_count = df['non_interactive_debate_turns'].notna().sum()

print(f"{'Interactive Debates':40s}: {df['interactive_debate_turns'].mean():.1f} turns (n={interactive_count})")
print(f"{'Non-Interactive Debates':40s}: {df['non_interactive_debate_turns'].mean():.1f} turns (n={non_interactive_count})")
print("="*60)



Average Debate Turns:

Interactive Debates                     : 4.5 turns (n=100)
Non-Interactive Debates                 : 4.0 turns (n=100)
