# SimpleLean Failure Analysis

Analyzing cases that failed after 3 iterations (max retries).

In [None]:
import json
import re
from collections import Counter
from pathlib import Path

In [None]:
# Load results
# The file is read as JSON Lines: one JSON object per non-blank line.
# UTF-8 is requested explicitly because the stored Lean code is searched for
# non-ASCII symbols such as 'λ'; the platform default encoding may not be UTF-8.
results_path = Path('../results/simplelean/gpt-5_folio_implicit_20251222_215452/results.jsonl')
with results_path.open(encoding='utf-8') as fh:
    # Skip blank lines so a stray empty line does not abort the whole load.
    results = [json.loads(line) for line in fh if line.strip()]
print(f'Loaded {len(results)} results')

In [None]:
# Iteration distribution
# Tally how many results report each 'num_iterations' value; a result that
# lacks the key is counted under 0.
iters = Counter()
for record in results:
    iters[record.get('num_iterations', 0)] += 1

print('Iteration distribution:')
for n_iter, n_cases in sorted(iters.items()):
    print(f'  {n_iter} iter: {n_cases} cases')

In [None]:
# Filter 3-iteration cases (max retries used)
def _used_all_retries(record):
    """Return True when the record reports exactly 3 iterations."""
    return record.get('num_iterations') == 3


# NOTE(review): selection is by iteration count only; whether a case can
# succeed on its third iteration is not visible here -- confirm that every
# selected case really is a failure.
failed_cases = [record for record in results if _used_all_retries(record)]
print(f'Cases with 3 iterations: {len(failed_cases)}')

In [None]:
# Lean 3 syntax patterns to detect
# Written as a regex -> label mapping and flattened to the ordered list of
# (regex, label) pairs that the detection loop iterates over. Dicts keep
# insertion order, so the pair order matches the order written here.
# NOTE(review): in the last regex, `\b` already rejects `variables`, so the
# trailing `(?!s)` cannot change any match. `variable` and `#check` also look
# like valid Lean 4 -- presumably why the label says "check context"; confirm
# that these should count as Lean 3 indicators.
lean3_patterns = list({
    r'\bconstant\b': 'constant (use opaque/axiom)',
    r'λ\s*\w+\s*,': 'λ x, (use fun x =>)',
    r'\bbegin\b': 'begin...end (use by)',
    r'#check\b': '#check',
    r'\blemma\b': 'lemma (use theorem)',
    r'by\s*\{': 'by { (use by with newline)',
    r'\bvariable\b(?!s)': 'variable (check context)',
}.items())

In [None]:
# Detect Lean 3 syntax in failed cases
# Each failed case lands in exactly one bucket:
#   no_code_cases     - 'lean_code' is missing, None or empty
#   lean3_cases       - at least one lean3_patterns regex matches the code;
#                       stored as {'case': record, 'patterns': [labels]}
#   lean4_error_cases - code is present and no pattern matches
lean3_cases = []
lean4_error_cases = []
no_code_cases = []

for r in failed_cases:
    lean_code = r.get('lean_code') or ''

    if not lean_code:
        no_code_cases.append(r)
        continue

    # Labels of every pattern that occurs anywhere in the code, in list order.
    found_patterns = [
        name
        for pattern, name in lean3_patterns
        if re.search(pattern, lean_code)
    ]

    if found_patterns:
        lean3_cases.append({'case': r, 'patterns': found_patterns})
    else:
        lean4_error_cases.append(r)

print(f'Lean 3 syntax detected: {len(lean3_cases)}')
print(f'Lean 4 but errors: {len(lean4_error_cases)}')
print(f'No Lean code: {len(no_code_cases)}')

In [None]:
# Show Lean 3 cases
# Print at most the first five flagged cases: the matched pattern labels, the
# prediction next to the ground truth, and the first 200 characters of code.
print('=== Cases with Lean 3 syntax ===')
for entry in lean3_cases[:5]:
    case = entry['case']
    matched = ', '.join(entry['patterns'])
    preview = (case.get('lean_code') or '')[:200]
    print(f"\nCase {case.get('case_idx')}: {matched}")
    print(f"  Prediction: {case.get('prediction')}, GT: {case.get('ground_truth')}")
    print(f"  Code preview: {preview}...")

In [None]:
# Show Lean 4 error cases
# Print at most the first five cases with no Lean 3 pattern, plus the first
# 150 characters of the first recorded verification error when there is one.
print('=== Cases with Lean 4 syntax but errors ===')
for case in lean4_error_cases[:5]:
    verification = case.get('lean_verification') or {}
    messages = verification.get('errors', [])
    print(f"\nCase {case.get('case_idx')}:")
    print(f"  Prediction: {case.get('prediction')}, GT: {case.get('ground_truth')}")
    if not messages:
        continue
    print(f"  Error: {messages[0][:150]}...")

In [None]:
# Error categorization
# Ordered (substring, bucket) rules applied to the lower-cased first error
# message. The first matching rule wins, so the order here is significant.
_ERROR_RULES = (
    ('unknown identifier', 'unknown_identifier'),
    ('type mismatch', 'type_mismatch'),
    ('failed to synthesize', 'synthesis_failed'),
    ('unexpected', 'syntax_error'),
    ('sorry', 'uses_sorry'),
)


def _categorize_error(message):
    """Return the bucket name for one error message, or 'other' if no rule matches."""
    lowered = message.lower()
    for needle, bucket in _ERROR_RULES:
        if needle in lowered:
            return bucket
    return 'other'


error_types = Counter()

for r in failed_cases:
    lv = r.get('lean_verification') or {}
    # `or []` also covers an 'errors' key that is present but None.
    errors = lv.get('errors') or []

    if errors:
        # Only the first reported error decides the bucket.
        error_types[_categorize_error(errors[0])] += 1
    elif not r.get('lean_code'):
        error_types['no_lean_code'] += 1
    elif lv.get('success'):
        # Code exists and verification reports success: not a missing-code case.
        error_types['verified_without_errors'] += 1
    else:
        # Code exists, verification did not succeed, but no message was recorded.
        error_types['no_error_message'] += 1

print('Error type distribution:')
for err, count in error_types.most_common():
    print(f'  {err}: {count}')

In [None]:
# Overall stats
def _percent(count, total):
    """Return count as a percentage of total, or 0.0 when total is 0."""
    return 100 * count / total if total else 0.0


total = len(results)
# A result counts as correct when its 'correct' field is truthy.
correct = sum(1 for r in results if r.get('correct'))
# A result counts as a Lean pass when lean_verification['success'] is truthy;
# a missing or None 'lean_verification' counts as not passed.
lean_pass = sum(1 for r in results if (r.get('lean_verification') or {}).get('success'))

print(f'Total: {total}')
print(f'Correct: {correct}/{total} ({_percent(correct, total):.1f}%)')
print(f'Lean Pass: {lean_pass}/{total} ({_percent(lean_pass, total):.1f}%)')