# SimpleLean Failure Analysis

Analyzing cases that failed after 3 iterations (max retries).

In [1]:
import json
import re
from collections import Counter
from pathlib import Path

In [2]:
# Load results
results_path = Path('../results/simplelean/gpt-5_folio_implicit_20251222_215452/results.jsonl')
results = [json.loads(l) for l in results_path.read_text().strip().split('\n')]
print(f'Loaded {len(results)} results')

Loaded 91 results


In [3]:
# Iteration distribution
iters = Counter(r.get('num_iterations', 0) for r in results)
print('Iteration distribution:')
for i in sorted(iters.keys()):
    print(f'  {i} iter: {iters[i]} cases')

Iteration distribution:
  1 iter: 8 cases
  2 iter: 28 cases
  3 iter: 55 cases


In [4]:
# Filter 3-iteration cases (max retries used)
failed_cases = [r for r in results if r.get('num_iterations') == 3]
print(f'Cases with 3 iterations: {len(failed_cases)}')

Cases with 3 iterations: 55


In [5]:
# Lean 3 syntax patterns to detect
lean3_patterns = [
    (r'\bconstant\b', 'constant (use opaque/axiom)'),
    (r'λ\s*\w+\s*,', 'λ x, (use fun x =>)'),
    (r'\bbegin\b', 'begin...end (use by)'),
    (r'#check\b', '#check'),
    (r'\blemma\b', 'lemma (use theorem)'),
    (r'by\s*\{', 'by { (use by with newline)'),
    (r'\bvariable\b(?!s)', 'variable (check context)'),
]

In [6]:
# Detect Lean 3 syntax in failed cases
lean3_cases = []
lean4_error_cases = []
no_code_cases = []

for r in failed_cases:
    lean_code = r.get('lean_code') or ''
    case_idx = r.get('case_idx')
    
    if not lean_code:
        no_code_cases.append(r)
        continue
    
    found_patterns = []
    for pattern, name in lean3_patterns:
        if re.search(pattern, lean_code):
            found_patterns.append(name)
    
    if found_patterns:
        lean3_cases.append({'case': r, 'patterns': found_patterns})
    else:
        lean4_error_cases.append(r)

print(f'Lean 3 syntax detected: {len(lean3_cases)}')
print(f'Lean 4 but errors: {len(lean4_error_cases)}')
print(f'No Lean code: {len(no_code_cases)}')

Lean 3 syntax detected: 37
Lean 4 but errors: 17
No Lean code: 1


In [7]:
# Show Lean 3 cases
print('=== Cases with Lean 3 syntax ===')
for item in lean3_cases[:5]:
    r = item['case']
    print(f"\nCase {r.get('case_idx')}: {', '.join(item['patterns'])}")
    print(f"  Prediction: {r.get('prediction')}, GT: {r.get('ground_truth')}")
    print(f"  Code preview: {(r.get('lean_code') or '')[:200]}...")

=== Cases with Lean 3 syntax ===

Case 35: constant (use opaque/axiom)
  Prediction: False, GT: False
  Code preview: constant Entity : Type

constant IsStable : Entity → Prop
constant Includes : Entity → Entity → Prop
constant Feud : Entity → Entity → Prop

constant DiamondMine : Entity
constant Imperium : Entity
co...

Case 106: variable (check context)
  Prediction: False, GT: False
  Code preview: universe u

section

-- Types
variable (Person Entity : Type u)

-- Distinguished entities/persons
variable (ETS : Entity)
variable (Tom : Person)

-- Predicates
variable (Applicant TakingGRE LivesSin...

Case 108: constant (use opaque/axiom)
  Prediction: False, GT: False
  Code preview: constant Person : Type
constant Spill : Person → Prop
constant Tidy : Person → Prop
constant Clean : Person → Prop
constant ValueOrder : Person → Prop
constant FamilyPrior : Person → Prop
constant Clu...

Case 38: constant (use opaque/axiom)
  Prediction: False, GT: False
  Code preview: constant Person 

In [8]:
# Show Lean 4 error cases
print('=== Cases with Lean 4 syntax but errors ===')
for r in lean4_error_cases[:5]:
    lv = r.get('lean_verification') or {}
    errors = lv.get('errors', [])
    print(f"\nCase {r.get('case_idx')}:")
    print(f"  Prediction: {r.get('prediction')}, GT: {r.get('ground_truth')}")
    if errors:
        print(f"  Error: {errors[0][:150]}...")

=== Cases with Lean 4 syntax but errors ===

Case 36:
  Prediction: Unknown, GT: True
  Error: declaration `beethoven_is_conductor` contains universe level metavariables at the expression
  orchestras_led_by_conductors.{u_1, ?u.115} Beethoven Vi...

Case 185:
  Prediction: False, GT: False

Case 41:
  Prediction: True, GT: False

Case 187:
  Prediction: Unknown, GT: Uncertain

Case 189:
  Prediction: False, GT: False


In [9]:
# Error categorization
error_types = Counter()

for r in failed_cases:
    lv = r.get('lean_verification') or {}
    errors = lv.get('errors', [])
    
    if not errors:
        error_types['no_lean_code'] += 1
        continue
    
    first_error = errors[0].lower()
    
    if 'unknown identifier' in first_error:
        error_types['unknown_identifier'] += 1
    elif 'type mismatch' in first_error:
        error_types['type_mismatch'] += 1
    elif 'failed to synthesize' in first_error:
        error_types['synthesis_failed'] += 1
    elif 'unexpected' in first_error:
        error_types['syntax_error'] += 1
    elif 'sorry' in first_error:
        error_types['uses_sorry'] += 1
    else:
        error_types['other'] += 1

print('Error type distribution:')
for err, count in error_types.most_common():
    print(f'  {err}: {count}')

Error type distribution:
  syntax_error: 32
  no_lean_code: 16
  other: 6
  unknown_identifier: 1


In [10]:
# Overall stats
total = len(results)
correct = sum(1 for r in results if r.get('correct'))
lean_pass = sum(1 for r in results if (r.get('lean_verification') or {}).get('success'))

print(f'Total: {total}')
print(f'Correct: {correct}/{total} ({100*correct/total:.1f}%)')
print(f'Lean Pass: {lean_pass}/{total} ({100*lean_pass/total:.1f}%)')

Total: 91
Correct: 50/91 (54.9%)
Lean Pass: 51/91 (56.0%)


## Prompt Variants Comparison

Testing improved prompts with Lean 4 examples on the 55 cases that failed 3 iterations.

### Prompt Differences:
- **lean4_specified**: Lean 4 syntax rules + True example only
- **lean4_balanced**: Lean 4 syntax + True AND Uncertain examples
- **lean4_minimal**: Same as balanced but WITHOUT "CRITICAL RULES" section

### Verified Lean 4 Example (added to prompts):

```lean
-- Example 1 (True): "The cat is blue. If someone is blue then they are nice."
axiom obj : Type
axiom Cat : obj
axiom Blue : obj → Prop
axiom Nice : obj → Prop
axiom T1 : Blue Cat
axiom R1 : ∀ x : obj, Blue x → Nice x
theorem cat_nice : Nice Cat := R1 Cat T1

-- Example 2 (Uncertain): "The cat is blue. If someone is nice then they are red."
axiom Red : obj → Prop
axiom R2 : ∀ x : obj, Nice x → Red x
-- Cannot prove Red Cat or ¬ Red Cat from given axioms
```

Key syntax differences fixed:
- `axiom` instead of `constant` (Lean 3)
- `fun x =>` instead of `λ x,` (Lean 3)
- `by` with tactics instead of `begin...end` (Lean 3)

In [None]:
# Load and compare all prompt variants on 55 failed cases

def load_and_analyze(path):
    """Load results and compute metrics with Unknown→Uncertain normalization."""
    results = [json.loads(l) for l in Path(path).read_text().strip().split('\n')]
    
    def normalize(ans):
        return 'Uncertain' if ans == 'Unknown' else ans
    
    total = len(results)
    correct = sum(1 for r in results 
                  if normalize(r.get('prediction')) == r.get('ground_truth'))
    lean_pass = sum(1 for r in results 
                    if (r.get('lean_verification') or {}).get('success'))
    # False negatives: lean passed but wrong answer
    false_neg = sum(1 for r in results 
                    if (r.get('lean_verification') or {}).get('success')
                    and normalize(r.get('prediction')) != r.get('ground_truth'))
    
    return {
        'total': total,
        'correct': correct,
        'lean_pass': lean_pass,
        'false_neg': false_neg,
        'accuracy': correct / total,
        'lean_rate': lean_pass / total,
        'false_neg_rate': false_neg / lean_pass if lean_pass > 0 else 0
    }

# Prompt variants
prompts = {
    'lean4_specified': '../results/simplelean/gpt-5_folio_lean4_specified_20251222_230613/results.jsonl',
    'lean4_balanced': '../results/simplelean/gpt-5_folio_lean4_balanced_20251222_231818/results.jsonl',
    'lean4_minimal': '../results/simplelean/gpt-5_folio_lean4_minimal_20251222_233118/results.jsonl',
}

print('=' * 70)
print('PROMPT COMPARISON ON 55 FAILED CASES')
print('=' * 70)
print(f'{"Prompt":<20} {"Accuracy":<15} {"Lean Pass":<15} {"False Neg Rate"}')
print('-' * 70)

for name, path in prompts.items():
    m = load_and_analyze(path)
    print(f'{name:<20} {m["correct"]}/{m["total"]} ({m["accuracy"]*100:.1f}%)'
          f'    {m["lean_pass"]}/{m["total"]} ({m["lean_rate"]*100:.1f}%)'
          f'    {m["false_neg"]}/{m["lean_pass"]} ({m["false_neg_rate"]*100:.1f}%)')

print('=' * 70)