# SimpleLean Baseline Analysis

Analyzing GPT-5 baseline results on the FOLIO dataset (203 cases).

In [1]:
import json
import re
from collections import Counter
from pathlib import Path

In [None]:
# Load baseline results (JSON Lines: one JSON object per line)
results_path = Path('../results/simplelean/gpt-5_folio_baseline_20251223_000248/results.jsonl')
with results_path.open(encoding='utf-8') as fh:
    # Skip blank lines so a stray empty line does not crash json.loads
    results = [json.loads(line) for line in fh if line.strip()]
print(f'Loaded {len(results)} results')

# Quick summary
correct = sum(1 for r in results if r.get('correct'))
# `lean_verification` may be missing or null, hence the `or {}` fallback
lean_pass = sum(1 for r in results if (r.get('lean_verification') or {}).get('success'))
# Guard the percentage against an empty results file (avoids ZeroDivisionError)
denom = len(results) or 1
print(f'Accuracy: {correct}/{len(results)} ({100*correct/denom:.1f}%)')
print(f'Lean Pass: {lean_pass}/{len(results)} ({100*lean_pass/denom:.1f}%)')

In [None]:
# Error analysis by ground truth
from collections import defaultdict

def normalize(ans):
    """Map the label 'Unknown' to 'Uncertain'; return any other value unchanged."""
    if ans == 'Unknown':
        return 'Uncertain'
    return ans

# Group by ground truth
by_gt = defaultdict(list)
for r in results:
    by_gt[r.get('ground_truth')].append(r)

print('Accuracy by Ground Truth:')
print('=' * 50)
for gt in ['True', 'False', 'Uncertain']:
    # .get() reads the bucket without inserting an empty one into the defaultdict
    cases = by_gt.get(gt, [])
    # Cell-local name: the global `correct` holds the overall count that the
    # summary cell prints, so it must not be overwritten here.
    gt_correct = sum(1 for r in cases if normalize(r.get('prediction')) == gt)
    # A label with no cases would otherwise raise ZeroDivisionError
    pct = 100 * gt_correct / len(cases) if cases else 0.0
    print(f'{gt:12} {gt_correct}/{len(cases)} ({pct:.1f}%)')

In [None]:
# Error patterns (GT → Prediction)
# Pair every record's ground truth with its normalized prediction, then
# tally only the pairs that disagree.
label_pairs = (
    (rec.get('ground_truth'), normalize(rec.get('prediction')))
    for rec in results
)
error_patterns = Counter(
    f'{truth} → {guess}' for truth, guess in label_pairs if truth != guess
)

print('Error Patterns:')
print('=' * 50)
for label, n_cases in error_patterns.most_common():
    print(f'{label:25} {n_cases} cases')

In [None]:
# Gaming detection: Lean passed but wrong definitive answer
gaming_cases = []
for r in results:
    gt = r.get('ground_truth')
    pred = normalize(r.get('prediction'))
    # Per-record flag. Deliberately NOT named `lean_pass`: that global holds the
    # aggregate pass count printed by the summary cell and must not be overwritten.
    lean_ok = (r.get('lean_verification') or {}).get('success')

    # Gaming: proved opposite (True→False or False→True)
    proved_opposite = (gt == 'True' and pred == 'False') or (gt == 'False' and pred == 'True')
    if lean_ok and proved_opposite:
        gaming_cases.append(r)

print(f'Gaming cases (Lean passed but proved opposite): {len(gaming_cases)}')
for r in gaming_cases:
    print(f"  Case {r.get('case_idx')}: {r.get('ground_truth')} → {normalize(r.get('prediction'))}")

In [None]:
# False negatives: Lean passed but wrong answer (conservative errors)
false_neg_uncertain = []  # Said Uncertain when should have proved something
for r in results:
    gt = r.get('ground_truth')
    pred = normalize(r.get('prediction'))
    # Per-record flag. Deliberately NOT named `lean_pass`: that global holds the
    # aggregate pass count printed by the summary cell and must not be overwritten.
    lean_ok = (r.get('lean_verification') or {}).get('success')

    if lean_ok and pred == 'Uncertain' and gt in ('True', 'False'):
        false_neg_uncertain.append(r)

print(f'Conservative errors (Uncertain when GT was True/False): {len(false_neg_uncertain)}')
# Separate name so the `by_gt` grouping built by the accuracy cell is not rebound
uncertain_by_gt = Counter(r.get('ground_truth') for r in false_neg_uncertain)
for gt, count in uncertain_by_gt.most_common():
    print(f'  {gt} → Uncertain: {count} cases')

In [None]:
# Iteration distribution
# Bucket the records once. A missing or null `num_iterations` counts as 0 in
# BOTH reports below, so the two tables always agree and no bucket is empty.
cases_by_iter = {}
for r in results:
    cases_by_iter.setdefault(r.get('num_iterations') or 0, []).append(r)

iters = Counter({i: len(cases) for i, cases in cases_by_iter.items()})
print('Iteration distribution:')
for i in sorted(iters):
    pct = 100 * iters[i] / len(results)
    print(f'  {i} iter: {iters[i]} cases ({pct:.1f}%)')

# Success rate by iteration count
print('\nSuccess rate by iteration:')
for i in sorted(cases_by_iter):
    cases = cases_by_iter[i]
    # Cell-local name: the global `correct` holds the overall count that the
    # summary cell prints, so it must not be overwritten here.
    iter_correct = sum(1 for r in cases if r.get('correct'))
    print(f'  {i} iter: {iter_correct}/{len(cases)} ({100*iter_correct/len(cases):.1f}%)')

In [None]:
# Cases that failed Lean verification
lean_failed = []
for rec in results:
    verification = rec.get('lean_verification') or {}
    if not verification.get('success'):
        lean_failed.append(rec)
print(f'Lean verification failed: {len(lean_failed)} cases')

# Show error types
# (substring to look for, bucket name); checked in order, first hit wins
error_markers = (
    ('unknown identifier', 'unknown_identifier'),
    ('type mismatch', 'type_mismatch'),
    ('unexpected', 'syntax_error'),
)
error_types = Counter()
for rec in lean_failed:
    messages = (rec.get('lean_verification') or {}).get('errors', [])
    if not messages:
        # No error messages were recorded for this failure
        error_types['no_code'] += 1
        continue
    # Only the first 100 lower-cased characters of the first error are inspected
    head = messages[0].lower()[:100]
    bucket = next((name for needle, name in error_markers if needle in head), 'other')
    error_types[bucket] += 1

print('\nError types:')
for bucket, n_cases in error_types.most_common():
    print(f'  {bucket}: {n_cases}')

In [None]:
# Sample failed cases: every record whose `correct` field is falsy or missing
failed = [rec for rec in results if not rec.get('correct')]
print(f'Total failed cases: {len(failed)}')

# Show a few examples (first three only)
for rec in failed[:3]:
    truth = rec.get('ground_truth')
    guess = normalize(rec.get('prediction'))
    lean_status = (rec.get('lean_verification') or {}).get('success')
    print(f"\nCase {rec.get('case_idx')}: {truth} → {guess}")
    print(f"  Lean Pass: {lean_status}")
    print(f"  Iterations: {rec.get('num_iterations')}")

In [None]:
# Summary statistics
# Recompute the totals from `results` instead of reusing the globals `correct`
# and `lean_pass`: earlier analysis cells rebind those names (to per-group
# counts and per-record flags), so their values are no longer the overall totals.
total_cases = len(results)
total_correct = sum(1 for r in results if r.get('correct'))
total_lean_pass = sum(1 for r in results if (r.get('lean_verification') or {}).get('success'))
# Guard the percentages against an empty result set (avoids ZeroDivisionError)
denom = total_cases or 1

print('=' * 60)
print('BASELINE RESULTS SUMMARY')
print('=' * 60)
print(f'Total cases: {total_cases}')
print(f'Accuracy: {total_correct}/{total_cases} ({100*total_correct/denom:.1f}%)')
print(f'Lean Pass: {total_lean_pass}/{total_cases} ({100*total_lean_pass/denom:.1f}%)')
print()
print('Errors breakdown:')
print(f'  Gaming (proved opposite): {len(gaming_cases)}')
print(f'  Conservative (Uncertain when T/F): {len(false_neg_uncertain)}')
print(f'  Lean failed: {len(lean_failed)}')

# Collect and print the failed case indices for retry experiments
# (nothing is written to disk here; the list is only printed)
failed_indices = list(map(lambda rec: rec.get('case_idx'), failed))
n_failed = len(failed_indices)
print(f'Failed case indices ({n_failed} cases):')
print(failed_indices)