# DeepSeek-R1 False Positives Analysis

False Positive = Lean verification passed but answer is wrong

Excluding stories 368 & 435 (cases 75, 76, 77, 156, 157, 158, 159) due to contradictory premises.

In [None]:
import json
import pandas as pd

exclude_cases = {75, 76, 77, 156, 157, 158, 159}

def load_results(path):
    with open(path) as f:
        return [json.loads(l) for l in f]

baseline = load_results('../results/simplelean/deepseek-r1_folio_baseline/results.jsonl')
bidir_true = load_results('../results/simplelean/deepseek-r1_folio_bidir_true/results.jsonl')
bidir_false = load_results('../results/simplelean/deepseek-r1_folio_bidir_false/results.jsonl')

In [None]:
def analyze(results, name, exclude=exclude_cases):
    filtered = [r for r in results if r['case_idx'] not in exclude]
    
    total = len(filtered)
    correct = sum(1 for r in filtered if r['correct'])
    lean_pass = sum(1 for r in filtered if r.get('lean_verification') and r['lean_verification'].get('success', False))
    
    # False positives: Lean pass AND wrong answer
    fp = []
    for r in filtered:
        lean_ok = r.get('lean_verification') and r['lean_verification'].get('success', False)
        if lean_ok and not r['correct']:
            fp.append({
                'case': r['case_idx'],
                'pred': r['prediction'],
                'gt': r['ground_truth']
            })
    
    return {
        'name': name,
        'total': total,
        'accuracy': f"{correct}/{total} ({100*correct/total:.1f}%)",
        'lean_pass': f"{lean_pass}/{total} ({100*lean_pass/total:.1f}%)",
        'false_positives': len(fp),
        'fp_rate': f"{100*len(fp)/total:.1f}%",
        'fp_details': fp
    }

In [None]:
results = [
    analyze(baseline, 'Baseline'),
    analyze(bidir_true, 'bidir_true'),
    analyze(bidir_false, 'bidir_false')
]

summary_df = pd.DataFrame([{
    'Condition': r['name'],
    'Accuracy': r['accuracy'],
    'Lean Pass': r['lean_pass'],
    'False Positives': r['false_positives'],
    'FP Rate': r['fp_rate']
} for r in results])

summary_df

## False Positives by Error Type

- **Gaming**: Proved wrong answer (True when gt=False/Uncertain, or False when gt=True/Uncertain)
- **Conservative**: Said Failure/Uncertain when could have proved

In [None]:
def categorize_fps(fp_details, condition):
    gaming = []
    conservative = []
    
    for fp in fp_details:
        pred, gt = fp['pred'], fp['gt']
        
        if condition == 'Baseline':
            if pred == 'True' and gt != 'True':
                gaming.append(fp)
            else:
                conservative.append(fp)
        elif condition == 'bidir_true':
            if pred == 'True':
                gaming.append(fp)
            else:  # Failure when gt=True
                conservative.append(fp)
        elif condition == 'bidir_false':
            if pred == 'False':
                gaming.append(fp)
            else:  # Failure when gt=False
                conservative.append(fp)
    
    return gaming, conservative

for r in results:
    gaming, conservative = categorize_fps(r['fp_details'], r['name'])
    print(f"\n{r['name']}:")
    print(f"  Gaming: {len(gaming)}")
    print(f"  Conservative: {len(conservative)}")

## Detailed False Positives

In [None]:
print("=" * 60)
print("BASELINE FALSE POSITIVES")
print("=" * 60)
for fp in sorted(results[0]['fp_details'], key=lambda x: x['case']):
    error_type = "GAMING" if fp['pred'] == 'True' and fp['gt'] != 'True' else "CONSERVATIVE"
    print(f"Case {fp['case']:3d}: {fp['pred']:>10} -> {fp['gt']:<10} [{error_type}]")

In [None]:
print("=" * 60)
print("BIDIR_TRUE FALSE POSITIVES")
print("=" * 60)
for fp in sorted(results[1]['fp_details'], key=lambda x: x['case']):
    error_type = "GAMING" if fp['pred'] == 'True' else "CONSERVATIVE"
    print(f"Case {fp['case']:3d}: {fp['pred']:>10} -> {fp['gt']:<10} [{error_type}]")

In [None]:
print("=" * 60)
print("BIDIR_FALSE FALSE POSITIVES")
print("=" * 60)
for fp in sorted(results[2]['fp_details'], key=lambda x: x['case']):
    error_type = "GAMING" if fp['pred'] == 'False' else "CONSERVATIVE"
    print(f"Case {fp['case']:3d}: {fp['pred']:>10} -> {fp['gt']:<10} [{error_type}]")

## Gaming Cases Comparison

Cases where model "proved" wrong answer across conditions

In [None]:
# Find gaming cases for each condition
baseline_gaming = {fp['case'] for fp in results[0]['fp_details'] if fp['pred'] == 'True' and fp['gt'] != 'True'}
bidir_true_gaming = {fp['case'] for fp in results[1]['fp_details'] if fp['pred'] == 'True'}
bidir_false_gaming = {fp['case'] for fp in results[2]['fp_details'] if fp['pred'] == 'False'}

print("Gaming cases:")
print(f"  Baseline:    {sorted(baseline_gaming)}")
print(f"  bidir_true:  {sorted(bidir_true_gaming)}")
print(f"  bidir_false: {sorted(bidir_false_gaming)}")
print()
print(f"Overlap (baseline & bidir_true): {sorted(baseline_gaming & bidir_true_gaming)}")

In [None]:
# Load FOLIO data for ground truth
with open('../data/folio/original/folio-validation.json') as f:
    folio_data = json.load(f)

# Gaming cases from bidir_true
gaming_cases = [41, 70, 83, 89, 202]

# Load as dict for easy lookup
baseline_dict = {r['case_idx']: r for r in baseline}
bidir_true_dict = {r['case_idx']: r for r in bidir_true}
bidir_false_dict = {r['case_idx']: r for r in bidir_false}

print("=" * 70)
print("GAMING CASES COMPARISON (bidir_true gaming cases)")
print("=" * 70)
print(f"{'Case':<6} {'GT':<12} {'Baseline':<12} {'bidir_true':<12} {'bidir_false':<12} {'Story'}")
print("-" * 70)

for idx in gaming_cases:
    gt = folio_data[idx]['label']
    story = folio_data[idx].get('story_id', '?')
    
    bl = baseline_dict[idx]['prediction']
    bt = bidir_true_dict[idx]['prediction']
    bf = bidir_false_dict[idx]['prediction']
    
    bl_mark = "ok" if baseline_dict[idx]['correct'] else "X"
    bt_mark = "ok" if bidir_true_dict[idx]['correct'] else "X"
    bf_mark = "ok" if bidir_false_dict[idx]['correct'] else "X"
    
    print(f"{idx:<6} {gt:<12} {bl+' '+bl_mark:<12} {bt+' '+bt_mark:<12} {bf+' '+bf_mark:<12} {story}")

## Gaming Cases Analysis

Analyzing each gaming case to determine if it's true gaming or dataset issue.

In [None]:
import re

def show_case(idx):
    print("=" * 70)
    print(f"CASE {idx}: PROBLEM")
    print("=" * 70)
    print(f"Premises:\n{folio_data[idx]['premises']}\n")
    print(f"Conclusion: {folio_data[idx]['conclusion']}")
    print(f"Ground Truth: {folio_data[idx]['label']}")

def show_lean(idx, condition='bidir_true'):
    path = f'../results/simplelean/deepseek-r1_folio_{condition}/responses/case_{idx}.txt'
    try:
        with open(path) as f:
            content = f.read()
        lean_match = re.search(r'<lean>(.*?)</lean>', content, re.DOTALL)
        if not lean_match:
            lean_match = re.search(r'```lean4?\n(.*?)```', content, re.DOTALL)
        print("\n" + "=" * 70)
        print(f"CASE {idx}: {condition} LEAN CODE")
        print("=" * 70)
        print(lean_match.group(1).strip() if lean_match else "No Lean code found")
    except FileNotFoundError:
        print(f"File not found: {path}")

### Case 41 (GT: False, Model: True)

In [None]:
show_case(41)
show_lean(41)

### Case 70 (GT: Uncertain, Model: True)

In [None]:
show_case(70)
show_lean(70)

### Case 83 (GT: False, Model: True)

In [None]:
show_case(83)
show_lean(83)

### Case 89 (GT: Uncertain, Model: True)

In [None]:
show_case(89)
show_lean(89)

### Case 202 (GT: Uncertain, Model: True)

In [None]:
show_case(202)
show_lean(202)

## Summary: Gaming Cases Analysis

**All gaming cases across conditions:**

| Case | GT | GPT-5 BL | GPT-5 BT | DS-R1 BL | DS-R1 BT | Issue | Verdict |
|------|-----|----------|----------|----------|----------|-------|---------|
| 41 | False | GAME | ok | GAME | GAME | Valid proof via contradiction | **Model correct** |
| 70 | Uncertain | GAME | GAME | ok | GAME | Invented axiom (Stock subset) | **GAMING** |
| 83 | False | GAME | GAME | GAME | GAME | Vacuous truth | **Debatable** |
| 89 | Uncertain | GAME | GAME | GAME | GAME | Valid chain reasoning | **Model correct** |
| 202 | Uncertain | GAME | GAME | GAME | GAME | Valid inference | **Model correct** |

**Observations:**
- GPT-5 bidir_true fixed Case 41 (5→4 gaming)
- DeepSeek-R1 baseline got Case 70 correct, but bidir_true introduced it (4→5)
- **bidir_false: 0 gaming for both models**

### Conclusion
- **1 true gaming case**: Case 70 (invented axiom)
- **1 debatable case**: Case 83 (vacuous truth - classical logic says True)
- **3 dataset issues**: Cases 41, 89, 202 (model's reasoning appears valid)