# Lean vs CoT Analysis

This notebook compares Lean and chain-of-thought (CoT) predictions on legacy result files to identify where each approach fails:
1. **FOLIO Dataset** (Oct 6, 2025)
2. **Multi-LogiEval Dataset** (Oct 16, 2025)

In [None]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Notebook-wide plotting defaults: seaborn grid theme and a 12x6 default figure.
sns.set_style(style='whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Printed last so the message confirms the whole setup cell ran.
print("✓ Imports loaded")

---
## PART 1: FOLIO Dataset Analysis
---

In [None]:
# Load FOLIO legacy results and flatten nested structure
def flatten_folio_results(stories_list):
    """Expand nested FOLIO stories into one flat row per question.

    Args:
        stories_list: Iterable of story dicts. Each may carry ``story_id``,
            ``premises`` and a ``results`` list of per-question dicts.

    Returns:
        A list of dicts, one per entry of every story's ``results`` list,
        combining the story-level fields (``story_id``, ``premises``) with
        the question-level fields. Missing ``premises``/``conclusion``
        default to ``''``; every other missing field is ``None``.
    """
    rows = []
    for story in stories_list:
        # Story-level fields are shared by every question of the story.
        story_fields = {
            'story_id': story.get('story_id'),
            'premises': story.get('premises', ''),
        }
        rows.extend(
            {
                **story_fields,
                'example_id': question.get('example_id'),
                'question_num': question.get('question_num'),
                'conclusion': question.get('conclusion', ''),
                'ground_truth': question.get('ground_truth'),
                'prediction': question.get('prediction'),
                'correct': question.get('correct'),
            }
            for question in story.get('results', [])
        )
    return rows

# Load the three FOLIO result files, keeping both the raw nested structure
# (*_raw) and the flat per-question view produced by flatten_folio_results.
# The encoding is passed explicitly because JSON text is UTF-8, whereas
# open() would otherwise decode with the platform's locale encoding.
# Flattening happens after each file is closed; it does not need the handle.
with open('results/legacy/cot_folio_responses_20251006_074511.json', encoding='utf-8') as f:
    folio_cot_raw = json.load(f)
folio_cot = flatten_folio_results(folio_cot_raw)

with open('results/legacy/lean_folio_responses_20251006_074856.json', encoding='utf-8') as f:
    folio_lean_raw = json.load(f)
folio_lean = flatten_folio_results(folio_lean_raw)

with open('results/legacy/leaninteract_folio_results_20251006_095029.json', encoding='utf-8') as f:
    folio_lean_int_raw = json.load(f)
folio_lean_int = flatten_folio_results(folio_lean_int_raw)

# Report how many flattened questions each run contributed.
print(f"Loaded FOLIO results (flattened):")
print(f"  CoT: {len(folio_cot)} questions")
print(f"  Lean: {len(folio_lean)} questions")
print(f"  Lean Interactive: {len(folio_lean_int)} questions")

In [None]:
# Align FOLIO results by example_id
def align_folio(cot_list, lean_list):
    """Pair CoT and Lean FOLIO records that share an ``example_id``.

    Args:
        cot_list: Flattened CoT result dicts.
        lean_list: Flattened Lean result dicts.

    Returns:
        A list of ``{'example_id', 'cot', 'lean'}`` dicts, one per ID that
        occurs in both inputs, sorted by ID. Records whose ``example_id`` is
        missing, ``None`` or ``''`` are ignored. When an ID occurs more than
        once within one input, the last record with that ID is used.
    """
    def index_by_id(records):
        # Only a missing/None/empty ID is skipped. A plain truthiness test
        # would also discard a legitimate ID of 0.
        return {
            r['example_id']: r
            for r in records
            if r.get('example_id') not in (None, '')
        }

    cot_dict = index_by_id(cot_list)
    lean_dict = index_by_id(lean_list)

    # Only questions answered by both systems can be compared.
    common_ids = cot_dict.keys() & lean_dict.keys()

    return [
        {'example_id': qid, 'cot': cot_dict[qid], 'lean': lean_dict[qid]}
        for qid in sorted(common_ids)
    ]

# Pair up the CoT and Lean runs (Lean Interactive is not part of this pairing).
folio_aligned = align_folio(cot_list=folio_cot, lean_list=folio_lean)
print(f"\n✓ Aligned {len(folio_aligned)} FOLIO questions")

In [None]:
# Calculate accuracy with proper normalization
def normalize_answer(value):
    """Map an answer in any supported spelling to a canonical label.

    Args:
        value: Raw answer (string, bool or other value). ``None`` means the
            answer is missing.

    Returns:
        ``'true'``, ``'false'`` or ``'uncertain'`` for the recognised
        spellings (case-insensitive, surrounding whitespace ignored); the
        lowercased, stripped text for any other non-blank value; ``None``
        when the value is ``None`` or blank.
    """
    # Only None counts as missing. A truthiness test here would also discard
    # boolean False (and 0), silently excluding those answers from scoring.
    if value is None:
        return None

    val = str(value).lower().strip()
    if not val:
        return None

    # Handle FOLIO format: True/False/Uncertain
    if val in {'true', 't', 'yes', 'y'}:
        return 'true'
    if val in {'false', 'f', 'no', 'n'}:
        return 'false'
    if val in {'uncertain', 'unknown', 'u'}:
        return 'uncertain'
    return val

def get_accuracy(results):
    """Return the percentage of scorable records predicted correctly.

    The ground truth is read from ``ground_truth``, falling back to ``label``
    and then ``answer`` when the earlier key is absent; the prediction is
    read from ``prediction``, then ``predicted_label``, then
    ``model_answer``. A record is scorable only when both values normalize
    to something truthy; other records are left out of the denominator.

    Args:
        results: Sequence of result dicts.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when ``results`` is
        empty or contains no scorable record.
    """
    if not results:
        return 0.0

    scored = 0
    hits = 0
    for record in results:
        raw_truth = record.get('ground_truth', record.get('label', record.get('answer', '')))
        raw_guess = record.get('prediction', record.get('predicted_label', record.get('model_answer', '')))
        expected = normalize_answer(raw_truth)
        predicted = normalize_answer(raw_guess)

        # Skip records that lack either side of the comparison.
        if not (expected and predicted):
            continue

        scored += 1
        if expected == predicted:
            hits += 1

    if scored > 0:
        return hits / scored * 100
    return 0.0

# Score each FOLIO run independently (not restricted to the aligned subset).
folio_cot_acc, folio_lean_acc, folio_lean_int_acc = map(
    get_accuracy, (folio_cot, folio_lean, folio_lean_int)
)

# Accuracy table framed by 70-character rules.
for line in (
    "="*70,
    "FOLIO ACCURACY",
    "="*70,
    f"CoT:               {folio_cot_acc:>6.2f}%",
    f"Lean:              {folio_lean_acc:>6.2f}%",
    f"Lean Interactive:  {folio_lean_int_acc:>6.2f}%",
    "="*70,
):
    print(line)

In [None]:
# Categorize FOLIO performance patterns
def is_correct(result):
    """Tell whether one result's prediction matches its ground truth.

    Uses the same field fallbacks as ``get_accuracy``: ``ground_truth`` →
    ``label`` → ``answer`` and ``prediction`` → ``predicted_label`` →
    ``model_answer``.

    Returns:
        ``True`` or ``False`` when both normalized values are present, or
        ``None`` when either is missing so the record cannot be judged.
    """
    raw_truth = result.get('ground_truth', result.get('label', result.get('answer', '')))
    raw_guess = result.get('prediction', result.get('predicted_label', result.get('model_answer', '')))
    gt = normalize_answer(raw_truth)
    pred = normalize_answer(raw_guess)

    if not (gt and pred):
        return None
    return gt == pred

folio_both_correct, folio_both_wrong = [], []
folio_cot_only = []  # CoT correct, Lean wrong
folio_lean_only = []  # Lean correct, CoT wrong

# (CoT correct?, Lean correct?) -> bucket that receives the aligned item.
folio_buckets = {
    (True, True): folio_both_correct,
    (False, False): folio_both_wrong,
    (True, False): folio_cot_only,
    (False, True): folio_lean_only,
}

for item in folio_aligned:
    cot_ok = is_correct(item['cot'])
    lean_ok = is_correct(item['lean'])

    # Items that cannot be judged on either side are left out of every bucket.
    if cot_ok is None or lean_ok is None:
        continue

    folio_buckets[(cot_ok, lean_ok)].append(item)

print("FOLIO PERFORMANCE PATTERNS")
print("="*70)
print(f"Both correct:          {len(folio_both_correct):>4}")
print(f"Both wrong:            {len(folio_both_wrong):>4}")
print(f"CoT ✓, Lean ✗:         {len(folio_cot_only):>4} ← CoT succeeds where Lean fails")
print(f"Lean ✓, CoT ✗:         {len(folio_lean_only):>4} ← Lean succeeds where CoT fails")
print("="*70)

In [None]:
# Show FOLIO examples where CoT succeeds but Lean fails
print("\n" + "="*70)
print(f"FOLIO: {len(folio_cot_only)} QUESTIONS WHERE COT SUCCEEDS BUT LEAN FAILS")
print("="*70)

# Preview at most the first five items; premises and conclusion are sliced
# with [:150] and always followed by "...".
for i, item in enumerate(folio_cot_only[:5], start=1):
    cot_r, lean_r = item['cot'], item['lean']
    print(
        f"\nExample {i}:",
        f"  Example ID: {cot_r.get('example_id')}",
        f"  Story ID: {cot_r.get('story_id')}",
        f"  Premises: {cot_r.get('premises', '')[:150]}...",
        f"  Conclusion: {cot_r.get('conclusion', '')[:150]}...",
        f"  Ground Truth: {cot_r.get('ground_truth')}",
        f"  CoT:  {cot_r.get('prediction')} ✓",
        f"  Lean: {lean_r.get('prediction')} ✗",
        sep="\n",
    )

# Items beyond the preview are only counted.
if folio_cot_only[5:]:
    print(f"\n... and {len(folio_cot_only) - 5} more")

In [None]:
# Show FOLIO examples where Lean succeeds but CoT fails
print("\n" + "="*70)
print(f"FOLIO: {len(folio_lean_only)} QUESTIONS WHERE LEAN SUCCEEDS BUT COT FAILS")
print("="*70)

# Preview at most the first five items; premises and conclusion are sliced
# with [:150] and always followed by "...". Question text is read from the
# CoT record.
for i, item in enumerate(folio_lean_only[:5], start=1):
    cot_r, lean_r = item['cot'], item['lean']
    print(
        f"\nExample {i}:",
        f"  Example ID: {cot_r.get('example_id')}",
        f"  Story ID: {cot_r.get('story_id')}",
        f"  Premises: {cot_r.get('premises', '')[:150]}...",
        f"  Conclusion: {cot_r.get('conclusion', '')[:150]}...",
        f"  Ground Truth: {cot_r.get('ground_truth')}",
        f"  CoT:  {cot_r.get('prediction')} ✗",
        f"  Lean: {lean_r.get('prediction')} ✓",
        sep="\n",
    )

# Items beyond the preview are only counted.
if folio_lean_only[5:]:
    print(f"\n... and {len(folio_lean_only) - 5} more")

In [None]:
# Visualize FOLIO results: accuracy bars on the left, outcome split on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: one bar per run on a fixed 0-100 % axis.
ax1.bar(
    ['CoT', 'Lean', 'Lean Interactive'],
    [folio_cot_acc, folio_lean_acc, folio_lean_int_acc],
    color=['#3498db', '#e74c3c', '#2ecc71'],
)
ax1.set_title('FOLIO: Accuracy Comparison', fontweight='bold')
ax1.set_ylabel('Accuracy (%)')
ax1.set_ylim(0, 100)

# Right panel: share of aligned questions in each agreement bucket.
labels = ['Both\nCorrect', 'Both\nWrong', 'CoT Only', 'Lean Only']
sizes = [
    len(bucket)
    for bucket in (folio_both_correct, folio_both_wrong, folio_cot_only, folio_lean_only)
]
colors = ['#2ecc71', '#e74c3c', '#3498db', '#f39c12']

ax2.pie(sizes, labels=labels, colors=colors, autopct='%1.0f%%', startangle=90)
ax2.set_title('FOLIO: Performance Patterns', fontweight='bold')

plt.tight_layout()
plt.show()

---
## PART 2: Multi-LogiEval Dataset Analysis
---

In [None]:
# Load Multi-LogiEval legacy results.
# The encoding is passed explicitly because JSON text is UTF-8, whereas
# open() would otherwise decode with the platform's locale encoding.
with open('results/legacy/multilogieval_zero_shot_cot_20251016_115317/all_responses.json', encoding='utf-8') as f:
    mlogi_cot = json.load(f)

with open('results/legacy/multilogieval_lean_lean_test_20251016_111656/all_results.json', encoding='utf-8') as f:
    mlogi_lean = json.load(f)

# Report how many records each run contributed.
print(f"Loaded Multi-LogiEval results:")
print(f"  CoT: {len(mlogi_cot)} questions")
print(f"  Lean: {len(mlogi_lean)} questions")

In [None]:
# Align Multi-LogiEval results by logic type, depth and the first 50 characters of context
def align_mlogi(cot_list, lean_list):
    """Pair CoT and Lean Multi-LogiEval records that describe the same item.

    Records carry no shared ID here, so a key is built from ``logic_type``,
    ``depth`` and the first 50 characters of ``context``. Records that agree
    on all three map to the same key, and the later record in a list
    replaces the earlier one.

    Args:
        cot_list: CoT result dicts.
        lean_list: Lean result dicts.

    Returns:
        A list of ``{'question_id', 'cot', 'lean'}`` dicts, one per key found
        in both inputs, sorted by key.
    """
    def make_key(record):
        # .get()'s default applies only when the key is absent; a context
        # stored as None must be mapped to '' before it can be sliced.
        context = record.get('context', '')
        if context is None:
            context = ''
        return f"{record.get('logic_type', '')}_{record.get('depth', '')}_{context[:50]}"

    cot_dict = {make_key(r): r for r in cot_list}
    lean_dict = {make_key(r): r for r in lean_list}

    # Sorted so the output order is reproducible: iteration order of a set of
    # strings can differ from one interpreter run to the next.
    common_ids = sorted(cot_dict.keys() & lean_dict.keys())

    return [
        {'question_id': qid, 'cot': cot_dict[qid], 'lean': lean_dict[qid]}
        for qid in common_ids
    ]

# Pair up the CoT and Lean Multi-LogiEval runs.
mlogi_aligned = align_mlogi(cot_list=mlogi_cot, lean_list=mlogi_lean)
print(f"\n✓ Aligned {len(mlogi_aligned)} Multi-LogiEval questions")

In [None]:
# Calculate Multi-LogiEval accuracy for each run independently
# (not restricted to the aligned subset).
mlogi_cot_acc, mlogi_lean_acc = map(get_accuracy, (mlogi_cot, mlogi_lean))

# Accuracy table framed by 70-character rules.
for line in (
    "="*70,
    "MULTI-LOGIEVAL ACCURACY",
    "="*70,
    f"CoT:   {mlogi_cot_acc:>6.2f}%",
    f"Lean:  {mlogi_lean_acc:>6.2f}%",
    "="*70,
):
    print(line)

In [None]:
# Categorize Multi-LogiEval performance patterns
mlogi_both_correct, mlogi_both_wrong = [], []
mlogi_cot_only = []  # CoT correct, Lean wrong
mlogi_lean_only = []  # Lean correct, CoT wrong

# (CoT correct?, Lean correct?) -> bucket that receives the aligned item.
mlogi_buckets = {
    (True, True): mlogi_both_correct,
    (False, False): mlogi_both_wrong,
    (True, False): mlogi_cot_only,
    (False, True): mlogi_lean_only,
}

for item in mlogi_aligned:
    cot_ok = is_correct(item['cot'])
    lean_ok = is_correct(item['lean'])

    # Items that cannot be judged on either side are left out of every bucket.
    if cot_ok is None or lean_ok is None:
        continue

    mlogi_buckets[(cot_ok, lean_ok)].append(item)

print("MULTI-LOGIEVAL PERFORMANCE PATTERNS")
print("="*70)
print(f"Both correct:          {len(mlogi_both_correct):>4}")
print(f"Both wrong:            {len(mlogi_both_wrong):>4}")
print(f"CoT ✓, Lean ✗:         {len(mlogi_cot_only):>4} ← CoT succeeds where Lean fails")
print(f"Lean ✓, CoT ✗:         {len(mlogi_lean_only):>4} ← Lean succeeds where CoT fails")
print("="*70)

In [None]:
# Show Multi-LogiEval examples where CoT succeeds but Lean fails
print("\n" + "="*70)
print(f"MULTI-LOGIEVAL: {len(mlogi_cot_only)} QUESTIONS WHERE COT SUCCEEDS BUT LEAN FAILS")
print("="*70)

# Preview at most the first five items; metadata is read from the CoT record.
for i, item in enumerate(mlogi_cot_only[:5], start=1):
    print(f"\nExample {i}:")
    cot_r, lean_r = item['cot'], item['lean']

    print(
        f"  Logic Type: {cot_r.get('logic_type', 'N/A')}",
        f"  Depth: {cot_r.get('depth', 'N/A')}",
        sep="\n",
    )
    # Context and question are shown only when the CoT record has them.
    if 'context' in cot_r:
        print(f"  Context: {cot_r['context'][:150]}...")
    if 'question' in cot_r:
        print(f"  Question: {cot_r['question'][:150]}...")

    # Field names vary between result files, hence the fallbacks.
    truth = cot_r.get('ground_truth', cot_r.get('label', 'N/A'))
    cot_pred = cot_r.get('prediction', cot_r.get('predicted_label', 'N/A'))
    lean_pred = lean_r.get('prediction', lean_r.get('predicted_label', 'N/A'))

    print(
        f"  Ground Truth: {truth}",
        f"  CoT:  {cot_pred} ✓",
        f"  Lean: {lean_pred} ✗",
        sep="\n",
    )

# Items beyond the preview are only counted.
if mlogi_cot_only[5:]:
    print(f"\n... and {len(mlogi_cot_only) - 5} more")

In [None]:
# Show Multi-LogiEval examples where Lean succeeds but CoT fails
print("\n" + "="*70)
print(f"MULTI-LOGIEVAL: {len(mlogi_lean_only)} QUESTIONS WHERE LEAN SUCCEEDS BUT COT FAILS")
print("="*70)

# Preview at most the first five items; metadata is read from the CoT record.
for i, item in enumerate(mlogi_lean_only[:5], start=1):
    print(f"\nExample {i}:")
    cot_r, lean_r = item['cot'], item['lean']

    print(
        f"  Logic Type: {cot_r.get('logic_type', 'N/A')}",
        f"  Depth: {cot_r.get('depth', 'N/A')}",
        sep="\n",
    )
    # Context and question are shown only when the CoT record has them.
    if 'context' in cot_r:
        print(f"  Context: {cot_r['context'][:150]}...")
    if 'question' in cot_r:
        print(f"  Question: {cot_r['question'][:150]}...")

    # Field names vary between result files, hence the fallbacks.
    truth = cot_r.get('ground_truth', cot_r.get('label', 'N/A'))
    cot_pred = cot_r.get('prediction', cot_r.get('predicted_label', 'N/A'))
    lean_pred = lean_r.get('prediction', lean_r.get('predicted_label', 'N/A'))

    print(
        f"  Ground Truth: {truth}",
        f"  CoT:  {cot_pred} ✗",
        f"  Lean: {lean_pred} ✓",
        sep="\n",
    )

# Items beyond the preview are only counted.
if mlogi_lean_only[5:]:
    print(f"\n... and {len(mlogi_lean_only) - 5} more")

In [None]:
# Visualize Multi-LogiEval results: accuracy bars on the left, outcome split on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: one bar per run on a fixed 0-100 % axis.
ax1.bar(
    ['CoT', 'Lean'],
    [mlogi_cot_acc, mlogi_lean_acc],
    color=['#3498db', '#e74c3c'],
)
ax1.set_title('Multi-LogiEval: Accuracy Comparison', fontweight='bold')
ax1.set_ylabel('Accuracy (%)')
ax1.set_ylim(0, 100)

# Right panel: share of aligned questions in each agreement bucket.
labels = ['Both\nCorrect', 'Both\nWrong', 'CoT Only', 'Lean Only']
sizes = [
    len(bucket)
    for bucket in (mlogi_both_correct, mlogi_both_wrong, mlogi_cot_only, mlogi_lean_only)
]
colors = ['#2ecc71', '#e74c3c', '#3498db', '#f39c12']

ax2.pie(sizes, labels=labels, colors=colors, autopct='%1.0f%%', startangle=90)
ax2.set_title('Multi-LogiEval: Performance Patterns', fontweight='bold')

plt.tight_layout()
plt.show()

---
## Export Results
---

In [None]:
# Export findings
os.makedirs('results/analysis', exist_ok=True)

# Output path -> disagreement bucket, FOLIO first and Multi-LogiEval second.
# NOTE(review): every file stores only the CoT-side record of each pair
# (item['cot']), so the Lean prediction is not part of the exported data.
bucket_exports = [
    ('results/analysis/folio_cot_succeeds_lean_fails.json', folio_cot_only),
    ('results/analysis/folio_lean_succeeds_cot_fails.json', folio_lean_only),
    ('results/analysis/mlogi_cot_succeeds_lean_fails.json', mlogi_cot_only),
    ('results/analysis/mlogi_lean_succeeds_cot_fails.json', mlogi_lean_only),
]
for export_path, bucket in bucket_exports:
    with open(export_path, 'w') as f:
        json.dump([pair['cot'] for pair in bucket], f, indent=2)

# Summary: headline accuracy figures and bucket sizes per dataset.
folio_summary = {
    'cot_acc': folio_cot_acc,
    'lean_acc': folio_lean_acc,
    'lean_int_acc': folio_lean_int_acc,
    'both_correct': len(folio_both_correct),
    'both_wrong': len(folio_both_wrong),
    'cot_only': len(folio_cot_only),
    'lean_only': len(folio_lean_only),
}
multilogi_summary = {
    'cot_acc': mlogi_cot_acc,
    'lean_acc': mlogi_lean_acc,
    'both_correct': len(mlogi_both_correct),
    'both_wrong': len(mlogi_both_wrong),
    'cot_only': len(mlogi_cot_only),
    'lean_only': len(mlogi_lean_only),
}
summary = {'folio': folio_summary, 'multilogi': multilogi_summary}

with open('results/analysis/summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

# List what was written and how many questions each file holds.
for line in (
    "✓ Exported all analysis results to results/analysis/",
    f"  - folio_cot_succeeds_lean_fails.json ({len(folio_cot_only)} questions)",
    f"  - folio_lean_succeeds_cot_fails.json ({len(folio_lean_only)} questions)",
    f"  - mlogi_cot_succeeds_lean_fails.json ({len(mlogi_cot_only)} questions)",
    f"  - mlogi_lean_succeeds_cot_fails.json ({len(mlogi_lean_only)} questions)",
    f"  - summary.json",
):
    print(line)

---
## Summary
---

In [None]:
# Final recap of accuracy and disagreement counts for both datasets.
# An empty string stands for a blank line in the output.
for line in (
    "="*80,
    "FINAL SUMMARY",
    "="*80,
    "",
    "FOLIO Dataset:",
    f"  - CoT accuracy: {folio_cot_acc:.1f}%",
    f"  - Lean accuracy: {folio_lean_acc:.1f}%",
    f"  - Questions where CoT succeeds but Lean fails: {len(folio_cot_only)}",
    f"  - Questions where Lean succeeds but CoT fails: {len(folio_lean_only)}",
    "",
    "Multi-LogiEval Dataset:",
    f"  - CoT accuracy: {mlogi_cot_acc:.1f}%",
    f"  - Lean accuracy: {mlogi_lean_acc:.1f}%",
    f"  - Questions where CoT succeeds but Lean fails: {len(mlogi_cot_only)}",
    f"  - Questions where Lean succeeds but CoT fails: {len(mlogi_lean_only)}",
    "",
    "="*80,
):
    print(line)