# Error Classification Comparison (v1, v2, v3)

This notebook compares the root-cause error classifications produced by three prompt versions (v1, v2, v3) for the false-negative cases of the GPT-5 FOLIO baseline run.

In [None]:
import pandas as pd
import json
from pathlib import Path

# Read the classification CSV for each prompt version; the three files
# differ only in their version suffix.
v1, v2, v3 = (
    pd.read_csv(f'../results/error_analysis/gpt-5_folio_baseline_20251223_011844_{tag}.csv')
    for tag in ('v1', 'v2', 'v3')
)

# Quick sanity check on how many rows each file contributed
print(f"v1: {len(v1)} cases, v2: {len(v2)} cases, v3: {len(v3)} cases")

## 1. Category Distribution Comparison

In [None]:
# Print the root-cause category counts for each version, one banner-delimited
# stanza per version with a blank line between stanzas.
for position, (label, frame) in enumerate((("V1", v1), ("V2", v2), ("V3", v3))):
    if position:
        # Separator goes between stanzas only, never after the last one
        print()
    print("=" * 60)
    print(f"{label} Categories:")
    print(frame['root_cause_category'].value_counts())

## 2. Side-by-Side Comparison

In [None]:
# Merge all three: one row per v1 case, with each version's category in its
# own column (v1, v2, v3).
comparison = v1[['case_idx', 'example_id', 'pattern', 'root_cause_category']].rename(
    columns={'root_cause_category': 'v1'}
)

# Align v2/v3 on case_idx rather than on row position. Plain column assignment
# matches rows by index label, which silently pairs the wrong cases whenever
# the CSVs differ in row order or row count.
for version, frame in (('v2', v2), ('v3', v3)):
    # Series.map needs a uniquely indexed lookup; if a case_idx repeats,
    # the first occurrence wins.
    by_case = (
        frame.drop_duplicates('case_idx')
        .set_index('case_idx')['root_cause_category']
    )
    # Cases absent from this version come out as NaN
    comparison[version] = comparison['case_idx'].map(by_case)

# Show all cases
pd.set_option('display.max_colwidth', 30)
comparison

## 3. V3 Cases by Category (Lean responses attached; code shown in Section 4)

In [None]:
# Load responses for detailed view
responses_dir = Path('../results/simplelean/gpt-5_folio_baseline_20251223_011844/responses')

def get_response(case_idx):
    """Return the saved response text for a case.

    Reads ``case_<case_idx>.txt`` from ``responses_dir``.

    Args:
        case_idx: Case identifier used to build the file name.

    Returns:
        The file contents as a string, or the literal ``'Not found'`` when
        no such file exists.
    """
    path = responses_dir / f'case_{case_idx}.txt'
    try:
        # Decode explicitly as UTF-8: the responses are shown as Lean code
        # later in the notebook, and the platform default encoding is not
        # guaranteed to handle non-ASCII symbols.
        return path.read_text(encoding='utf-8')
    except (FileNotFoundError, NotADirectoryError):
        # Same fallback the exists() check used to give for a missing path
        return 'Not found'

# Attach each case's saved response text to v3 as a new 'lean_response' column
v3['lean_response'] = v3['case_idx'].map(get_response)

In [None]:
# Walk the V3 categories in order of first appearance and list every case
# filed under each one.
banner = "=" * 80
for category in v3['root_cause_category'].unique():
    in_category = v3.loc[v3['root_cause_category'] == category]
    print(banner)
    print(f"CATEGORY: {category} ({len(in_category)} cases)")
    print(banner)

    for _, record in in_category.iterrows():
        print(f"\n--- Case {record['case_idx']} | {record['pattern']} ---")
        # Only the first 100 characters of the conclusion are shown
        print(f"Conclusion: {record['conclusion'][:100]}...")
        print(f"Error: {record['error_description']}")
        # Falls back to 'N/A' when the row has no 'problematic_axiom' field
        print(f"Problematic: {record.get('problematic_axiom', 'N/A')}")
        print()

## 4. Detailed View: Gaming Categories (AXIOMATIZE_*)

In [None]:
# The three V3 labels treated as "gaming" in this section
gaming_cats = ['AXIOMATIZE_CONCLUSION', 'AXIOMATIZE_CONTRADICTION', 'AXIOMATIZE_FABRICATION']
gaming_cases = v3.loc[v3['root_cause_category'].isin(gaming_cats)]

print(f"Gaming cases: {len(gaming_cases)}")
print()

# One banner-framed report per gaming case, ending with the stored response
rule = "=" * 80
for _, record in gaming_cases.iterrows():
    print(rule)
    print(f"Case {record['case_idx']} | {record['root_cause_category']} | {record['pattern']}")
    print(rule)
    print(f"\nConclusion: {record['conclusion']}")
    print(f"\nError: {record['error_description']}")
    print("\n--- Lean Code ---")
    # Cap the printed response at 2000 characters to keep the output readable
    print(record['lean_response'][:2000])
    print()

## 5. Disagreements Between Versions

In [None]:
# Find cases where v1, v2, v3 disagree
def normalize_category(cat):
    """Collapse version-specific category labels into a shared family name.

    Labels that belong to no known family are returned unchanged.
    """
    # Each family lists the labels that should be treated as equivalent
    families = {
        'REASONING': (
            'REASONING_FAILURE',
            'REASONING_GAP',
            'PROOF_INCOMPLETE',
        ),
        'AXIOM_CONCLUSION': (
            'AXIOMATIZES_CONCLUSION',
            'AXIOMATIZE_CONCLUSION',
        ),
        'AXIOM_CONTRADICTION': (
            'AXIOMATIZES_CONTRADICTION',
            'AXIOMATIZE_CONTRADICTION',
        ),
        'AXIOM_FABRICATION': (
            'AXIOMATIZES_UNMENTIONED',
            'FABRICATES_ENTITY_FACT',
            'AXIOMATIZE_FABRICATION',
        ),
        'FORMALIZATION': (
            'INCORRECT_FORMALIZATION',
            'FORMALIZE_INCORRECTLY',
            'FORMALIZE_INCOMPLETE',
        ),
    }
    # Invert to label -> family for a direct lookup
    family_of = {
        label: family
        for family, labels in families.items()
        for label in labels
    }
    if cat in family_of:
        return family_of[cat]
    return cat

# Add a normalized-label column per version so the versions are comparable
for version in ('v1', 'v2', 'v3'):
    comparison[f'{version}_norm'] = comparison[version].apply(normalize_category)

# A case counts as agreed only when all three normalized labels coincide
v1_matches_v2 = comparison['v1_norm'].eq(comparison['v2_norm'])
v2_matches_v3 = comparison['v2_norm'].eq(comparison['v3_norm'])
comparison['all_agree'] = v1_matches_v2 & v2_matches_v3

agreed_count = comparison['all_agree'].sum()
print(f"Agreement: {agreed_count}/{len(comparison)} cases")
print()
print("Disagreements:")
# Show the original (un-normalized) labels for the cases that disagree
comparison.loc[~comparison['all_agree'], ['case_idx', 'pattern', 'v1', 'v2', 'v3']]

## 6. Summary Stats

In [None]:
# Per-version counts for the three broad issue families.
print("Category Mapping Summary:")
print()
print("Gaming (AXIOMATIZE_*):")
# Count gaming on normalized labels. A raw startswith('AXIOM') test omits
# FABRICATES_ENTITY_FACT, which normalize_category places in the
# AXIOM_FABRICATION family, so it would disagree with the agreement analysis.
for label, frame in (('v1', v1), ('v2', v2), ('v3', v3)):
    normalized = frame['root_cause_category'].map(normalize_category)
    print(f"  {label}: {normalized.str.startswith('AXIOM').sum()}")
print()
print("Reasoning/Proof Issues:")
print(f"  v1 (REASONING_FAILURE): {(v1['root_cause_category'] == 'REASONING_FAILURE').sum()}")
print(f"  v2 (REASONING_GAP): {(v2['root_cause_category'] == 'REASONING_GAP').sum()}")
print(f"  v3 (PROOF_INCOMPLETE): {(v3['root_cause_category'] == 'PROOF_INCOMPLETE').sum()}")
print()
print("Formalization Issues:")
print(f"  v1 (INCORRECT_FORMALIZATION): {(v1['root_cause_category'] == 'INCORRECT_FORMALIZATION').sum()}")
print(f"  v2 (INCORRECT_FORMALIZATION): {(v2['root_cause_category'] == 'INCORRECT_FORMALIZATION').sum()}")
# v3 splits formalization into several FORMALIZE_* labels, hence the prefix test
print(f"  v3 (FORMALIZE_*): {v3['root_cause_category'].str.startswith('FORMALIZE').sum()}")