# Variable Reflection: Comprehensive Analysis

**Research Question**: Does prompted reflection improve or impair moral reasoning in LLMs?

This notebook analyzes the relationship between reflection levels, extended thinking, and moral judgment accuracy across three benchmarks.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats

# Global plot appearance: whitegrid theme plus shared figure and font sizing
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})

# Colors keyed by thinking mode and by benchmark
COLORS = dict(
    no_thinking='#2ecc71',    # Green
    with_thinking='#e74c3c',  # Red
    ethics='#3498db',         # Blue
    moralchoice='#9b59b6',    # Purple
    morables='#f39c12',       # Orange
)

# Colors keyed by ETHICS subscale
SUBSCALE_COLORS = dict(
    commonsense='#1abc9c',
    deontology='#3498db',
    virtue='#9b59b6',
)

# Colors keyed by MoralChoice ambiguity level
AMBIGUITY_COLORS = dict(
    low='#27ae60',
    high='#c0392b',
)

# Make sure the folder the figure cells save into exists
Path('../outputs/figures').mkdir(exist_ok=True, parents=True)

In [None]:
# Load the per-benchmark checkpoint CSVs.
# A benchmark whose checkpoint file is missing is represented by None, and the
# cells below skip it.
results_dir = Path("../results/raw")

ethics_df, mc_df, morables_df = (
    pd.read_csv(csv_path) if csv_path.exists() else None
    for csv_path in (
        results_dir / f"{stem}_checkpoint.csv"
        for stem in ("ethics", "moralchoice", "morables")
    )
)

# Row counts for the report; a missing benchmark counts as 0
n_ethics, n_mc, n_morables = (
    0 if frame is None else len(frame)
    for frame in (ethics_df, mc_df, morables_df)
)

print("Data Loaded:")
print(f"  ETHICS:      {n_ethics:,} observations")
print(f"  MoralChoice: {n_mc:,} observations")
print(f"  MORABLES:    {n_morables:,} observations")

---
## Part 1: The Core Finding — Reflection Decreases Performance

This section demonstrates the primary result: higher reflection levels lead to lower accuracy.

In [None]:
# Figure 1: Accuracy by Reflection Level (Main Result)
#
# One panel per benchmark: mean accuracy at each reflection level with 95% CI
# error bars (1.96 x standard error), a least-squares trend line, and the
# accuracy change from level 0 to the highest level.

# Tick labels keyed by reflection level, so every label follows its own level
# even when a benchmark lacks one of them. Levels without an entry fall back to
# a plain "L<n>" label.
level_tick_labels = {
    0: 'L0\n(Direct)',
    2: 'L2\n(CoT)',
    4: 'L4\n(Devil\'s\nAdvocate)',
    5: 'L5\n(Two-Pass)',
}

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (df, name, color) in zip(axes, [(ethics_df, "ETHICS", COLORS['ethics']), 
                                         (morables_df, "MORABLES", COLORS['morables'])]):
    if df is None or 'correct' not in df.columns:
        ax.text(0.5, 0.5, f"No data for {name}", ha='center', va='center', transform=ax.transAxes)
        continue
    
    # Keep scored rows only. Cast to float so mean/std also work when the CSV
    # column was read as object dtype (booleans mixed with missing values).
    valid = df[df['correct'].notna()].copy()
    valid['correct'] = valid['correct'].astype(float)
    
    if valid.empty:
        ax.text(0.5, 0.5, f"No data for {name}", ha='center', va='center', transform=ax.transAxes)
        continue
    
    # Calculate accuracy by level
    acc_by_level = valid.groupby('level')['correct'].agg(['mean', 'std', 'count']).reset_index()
    acc_by_level['se'] = acc_by_level['std'] / np.sqrt(acc_by_level['count'])
    acc_by_level['mean_pct'] = acc_by_level['mean'] * 100
    acc_by_level['se_pct'] = acc_by_level['se'] * 100
    
    # Plot with error bars
    ax.errorbar(acc_by_level['level'], acc_by_level['mean_pct'], 
                yerr=acc_by_level['se_pct'] * 1.96,  # 95% CI
                marker='o', markersize=12, linewidth=3, capsize=5,
                color=color, markerfacecolor='white', markeredgewidth=2)
    
    title = f'{name}: Accuracy Decreases with Reflection'
    
    # A trend line and a correlation need at least two levels
    if len(acc_by_level) >= 2:
        # Add trend line
        z = np.polyfit(acc_by_level['level'], acc_by_level['mean_pct'], 1)
        p = np.poly1d(z)
        ax.plot(acc_by_level['level'], p(acc_by_level['level']), '--', 
                color=color, alpha=0.5, linewidth=2, label=f'Trend: {z[0]:+.1f}% per level')
        
        # Correlation across the per-level means (one point per level)
        r, p_val = stats.pearsonr(acc_by_level['level'], acc_by_level['mean_pct'])
        title += f'\n(r = {r:.2f}, p = {p_val:.3f})'
        ax.legend(loc='upper right')
    
    ax.set_xlabel('Reflection Level', fontsize=12)
    ax.set_ylabel('Accuracy (%)', fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.set_xticks(acc_by_level['level'])
    ax.set_xticklabels([level_tick_labels.get(level, f'L{level:g}')
                        for level in acc_by_level['level']])
    ax.set_ylim(50, 100)
    
    # Add annotation for the drop from level 0 to the highest level.
    # Skipped when the benchmark has no level-0 rows to use as the baseline.
    baseline = acc_by_level.loc[acc_by_level['level'] == 0, 'mean_pct']
    if not baseline.empty:
        l0_acc = baseline.iloc[0]
        l5_acc = acc_by_level.loc[acc_by_level['level'].idxmax(), 'mean_pct']
        ax.annotate(f'Δ = {l5_acc - l0_acc:+.1f}%', 
                    xy=(0.95, 0.05), xycoords='axes fraction',
                    fontsize=14, fontweight='bold', color='red',
                    ha='right', va='bottom',
                    bbox=dict(boxstyle='round', facecolor='white', edgecolor='red', alpha=0.8))

plt.suptitle('Primary Finding: More Reflection → Lower Accuracy', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/fig1_reflection_decreases_accuracy.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 2: Thinking × Level Interaction
#
# One panel per benchmark with two lines (extended thinking off / on) showing
# accuracy at each reflection level, plus a note on the level-0 difference.

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (df, name) in zip(axes, [(ethics_df, "ETHICS"), (morables_df, "MORABLES")]):
    if df is None or 'correct' not in df.columns:
        # Same placeholder as Figure 1 so a skipped panel is not silently blank
        ax.text(0.5, 0.5, f"No data for {name}", ha='center', va='center', transform=ax.transAxes)
        continue
    
    valid = df[df['correct'].notna()]
    
    for thinking, color, label in [(False, COLORS['no_thinking'], 'No Extended Thinking'),
                                    (True, COLORS['with_thinking'], 'With Extended Thinking')]:
        subset = valid[valid['thinking'] == thinking]
        acc = subset.groupby('level')['correct'].mean() * 100
        
        ax.plot(acc.index, acc.values, marker='o', markersize=10, 
                linewidth=2.5, color=color, label=label)
    
    ax.set_xlabel('Reflection Level')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title(f'{name}: Extended Thinking Effect by Level')
    ax.set_xticks(valid['level'].unique())
    ax.legend(loc='lower left')
    ax.set_ylim(50, 100)
    
    # Highlight L0 difference
    l0_no = valid[(valid['level'] == 0) & (valid['thinking'] == False)]['correct'].mean() * 100
    l0_yes = valid[(valid['level'] == 0) & (valid['thinking'] == True)]['correct'].mean() * 100
    l0_gap = l0_no - l0_yes
    
    # The gap is NaN when either mode has no level-0 rows; nothing to annotate then
    if pd.notna(l0_gap):
        # Word the note after the sign of the gap instead of always saying "hurts"
        if l0_gap > 0:
            note = f'L0: Thinking hurts\nby {l0_gap:.1f}%'
        elif l0_gap < 0:
            note = f'L0: Thinking helps\nby {-l0_gap:.1f}%'
        else:
            note = 'L0: Thinking has\nno effect'
        
        ax.annotate(note,
                    xy=(0, l0_no), xytext=(0.5, l0_no + 5),
                    arrowprops=dict(arrowstyle='->', color='gray'),
                    fontsize=10, ha='center')

plt.suptitle('Extended Thinking Interaction with Reflection Level', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/fig2_thinking_interaction.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 3: Combined Heatmap - All Conditions
#
# One heatmap per benchmark with reflection level on the rows and thinking mode
# on the columns. Benchmarks with a 'correct' column show accuracy; the others
# show mean confidence when a 'confidence' column is available.

# Column headers for the two thinking modes. Renaming by key (rather than by
# position) keeps the headers right when only one mode is present in the data.
thinking_names = {False: 'No Thinking', True: 'With Thinking'}

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, (df, name) in zip(axes, [(ethics_df, "ETHICS"), (mc_df, "MoralChoice"), (morables_df, "MORABLES")]):
    if df is None:
        ax.text(0.5, 0.5, f"No data", ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue
    
    if 'correct' in df.columns:
        valid = df[df['correct'].notna()]
        metric = valid.groupby(['level', 'thinking'])['correct'].mean() * 100
        metric_name = 'Accuracy (%)'
    elif 'confidence' in df.columns:
        # No ground truth (MoralChoice): use mean confidence over rows where an
        # answer was extracted
        valid = df[df['extracted_answer'].notna()]
        metric = valid.groupby(['level', 'thinking'])['confidence'].mean()
        metric_name = 'Confidence'
    else:
        # Nothing to plot; label the panel instead of leaving it blank
        ax.text(0.5, 0.5, f"No data", ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue
    
    pivot = metric.unstack().rename(columns=thinking_names)
    
    if pivot.empty:
        ax.text(0.5, 0.5, f"No data", ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue
    
    # nanmean keeps the color center defined when a level/mode cell is missing
    sns.heatmap(pivot, annot=True, fmt='.1f', cmap='RdYlGn', 
                center=np.nanmean(pivot.values.astype(float)), ax=ax,
                cbar_kws={'label': metric_name})
    ax.set_title(f'{name}')
    ax.set_xlabel('Thinking Mode')
    ax.set_ylabel('Reflection Level')

plt.suptitle('Performance Across All Conditions', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/fig3_heatmap_all_conditions.png', dpi=300, bbox_inches='tight')
plt.show()

---
## Part 2: Subcategory Analysis

This section analyzes how the reflection effect varies across moral frameworks (ETHICS subscales) and ambiguity levels (MoralChoice).

In [None]:
# Figure 4: ETHICS Accuracy by Subscale
#
# One panel per ETHICS subscale with accuracy by reflection level, split by
# thinking mode, and a dashed reference line at 50%.

if ethics_df is not None and 'subscale' in ethics_df.columns:
    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    
    valid = ethics_df[ethics_df['correct'].notna()]
    
    for ax, subscale in zip(axes, ['commonsense', 'deontology', 'virtue']):
        subset = valid[valid['subscale'] == subscale]
        
        for thinking, color, label in [(False, COLORS['no_thinking'], 'No Thinking'),
                                        (True, COLORS['with_thinking'], 'With Thinking')]:
            sub_think = subset[subset['thinking'] == thinking]
            acc = sub_think.groupby('level')['correct'].mean() * 100
            ax.plot(acc.index, acc.values, marker='o', markersize=8, 
                    linewidth=2, color=color, label=label)
        
        # Draw the chance line before building the legend; an artist added
        # after ax.legend() is not picked up, so 'Chance' would be missing.
        ax.axhline(y=50, color='gray', linestyle='--', alpha=0.3, label='Chance')
        
        # Overall accuracy for this subscale
        overall = subset['correct'].mean() * 100
        
        ax.set_xlabel('Reflection Level')
        ax.set_ylabel('Accuracy (%)')
        ax.set_title(f'{subscale.capitalize()}\n(Overall: {overall:.1f}%)')
        ax.set_xticks(valid['level'].unique())
        ax.legend(loc='lower left', fontsize=9)
        ax.set_ylim(40, 100)
    
    plt.suptitle('ETHICS: Reflection Effect by Moral Framework', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig('../outputs/figures/fig4_ethics_by_subscale.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("ETHICS data with subscale not available")

In [None]:
# Figure 5: ETHICS Subscale Comparison (Bar Chart)
#
# Grouped bars: one group per subscale, one bar per reflection level, each bar
# labelled with its accuracy.

if ethics_df is not None and 'subscale' in ethics_df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    
    valid = ethics_df[ethics_df['correct'].notna()]
    
    # Aggregate by subscale and level
    agg = valid.groupby(['subscale', 'level'])['correct'].mean().reset_index()
    agg['correct'] = agg['correct'] * 100
    
    # Create grouped bar chart
    subscales = ['commonsense', 'deontology', 'virtue']
    levels = sorted(agg['level'].unique())
    
    # Subscale x level table; a combination with no data becomes NaN instead
    # of raising on a positional lookup.
    acc_table = agg.pivot(index='subscale', columns='level', values='correct')
    acc_table = acc_table.reindex(index=subscales, columns=levels)
    
    x = np.arange(len(subscales))
    # Each group spans 0.8 of its slot regardless of how many levels exist
    # (gives the previous width of 0.2 when there are four levels).
    width = 0.8 / max(len(levels), 1)
    
    for i, level in enumerate(levels):
        values = acc_table[level].to_numpy(dtype=float)
        # Missing cells are drawn as zero-height bars and left unlabelled
        bars = ax.bar(x + i*width, np.nan_to_num(values), width, label=f'Level {level}')
        
        # Add value labels
        for bar, val in zip(bars, values):
            if np.isnan(val):
                continue
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                    f'{val:.0f}', ha='center', va='bottom', fontsize=8)
    
    ax.set_xlabel('Moral Framework')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title('ETHICS: Accuracy by Subscale and Reflection Level')
    # Center each tick under its group of bars
    ax.set_xticks(x + width * (len(levels) - 1) / 2)
    ax.set_xticklabels([s.capitalize() for s in subscales])
    ax.legend(title='Reflection Level')
    ax.set_ylim(0, 100)
    
    plt.tight_layout()
    plt.savefig('../outputs/figures/fig5_ethics_subscale_bars.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Figure 6: MoralChoice by Ambiguity
#
# Left: share of responses choosing option 'A' at each reflection level, one
# line per ambiguity level. Right: mean confidence, same layout.

if mc_df is not None and 'ambiguity' in mc_df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    valid = mc_df[mc_df['extracted_answer'].notna()]
    
    # Left: Choice distribution by ambiguity and level
    ax = axes[0]
    
    for amb, color in AMBIGUITY_COLORS.items():
        subset = valid[valid['ambiguity'] == amb]
        # Mean of a boolean mask per level; avoids groupby.apply over the whole
        # frame, which is deprecated in recent pandas.
        pct_a = subset['extracted_answer'].eq('A').groupby(subset['level']).mean() * 100
        ax.plot(pct_a.index, pct_a.values, marker='o', markersize=10,
                linewidth=2.5, color=color, label=f'{amb.capitalize()} Ambiguity')
    
    ax.set_xlabel('Reflection Level')
    ax.set_ylabel('% Choosing Option A')
    ax.set_title('MoralChoice: Option Preference by Ambiguity')
    ax.legend()
    # Full 0-100 range: a share below 50% must stay visible, and the 50%
    # reference line should sit inside the plot rather than on its lower edge.
    ax.set_ylim(0, 100)
    ax.axhline(y=50, color='gray', linestyle='--', alpha=0.5)
    
    # Right: Confidence by ambiguity
    ax = axes[1]
    
    if 'confidence' in valid.columns:
        conf_valid = valid[valid['confidence'].notna()]
        
        for amb, color in AMBIGUITY_COLORS.items():
            subset = conf_valid[conf_valid['ambiguity'] == amb]
            conf = subset.groupby('level')['confidence'].mean()
            ax.plot(conf.index, conf.values, marker='s', markersize=10,
                    linewidth=2.5, color=color, label=f'{amb.capitalize()} Ambiguity')
        
        ax.set_xlabel('Reflection Level')
        ax.set_ylabel('Mean Confidence')
        ax.set_title('MoralChoice: Confidence by Ambiguity')
        ax.legend()
        ax.set_ylim(50, 100)
    
    plt.suptitle('MoralChoice: Ambiguity Effects', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig('../outputs/figures/fig6_moralchoice_ambiguity.png', dpi=300, bbox_inches='tight')
    plt.show()

---
## Part 3: Confidence Analysis

In [None]:
# Figure 7: Confidence Calibration Curves
#
# For each benchmark, responses are grouped into five confidence bins and the
# observed accuracy of each bin is drawn next to the bin midpoint, which is the
# accuracy a perfectly calibrated model would reach.

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (df, name, color) in zip(axes, [(ethics_df, "ETHICS", COLORS['ethics']), 
                                         (morables_df, "MORABLES", COLORS['morables'])]):
    if df is None or 'confidence' not in df.columns or 'correct' not in df.columns:
        ax.text(0.5, 0.5, f"No data for {name}", ha='center', va='center', transform=ax.transAxes)
        continue
    
    valid = df[df['confidence'].notna() & df['correct'].notna()].copy()
    
    # Bin confidence. include_lowest=True closes the first bin on the left so
    # a confidence of exactly 0 lands in '0-20' instead of being dropped.
    bins = [0, 20, 40, 60, 80, 100]
    labels = ['0-20', '21-40', '41-60', '61-80', '81-100']
    valid['conf_bin'] = pd.cut(valid['confidence'], bins=bins, labels=labels, include_lowest=True)
    
    calib = valid.groupby('conf_bin', observed=True).agg(
        accuracy=('correct', 'mean'),
        count=('correct', 'count')
    ).reset_index()
    calib['accuracy_pct'] = calib['accuracy'] * 100
    
    # Expected (perfect calibration); cast so bar heights are plain floats
    # rather than a categorical column
    midpoints = {'0-20': 10, '21-40': 30, '41-60': 50, '61-80': 70, '81-100': 90}
    calib['expected'] = calib['conf_bin'].map(midpoints).astype(float)
    
    x = range(len(calib))
    width = 0.35
    
    ax.bar([i - width/2 for i in x], calib['accuracy_pct'], width, 
           label='Actual Accuracy', color=color, alpha=0.7)
    ax.bar([i + width/2 for i in x], calib['expected'], width,
           label='Perfect Calibration', color='gray', alpha=0.5)
    
    ax.set_xticks(x)
    ax.set_xticklabels(calib['conf_bin'])
    ax.set_xlabel('Confidence Range')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title(f'{name}: Confidence Calibration')
    ax.legend()
    ax.set_ylim(0, 105)
    
    # Add count annotations above each accuracy bar
    for i, (acc_pct, n_obs) in enumerate(zip(calib['accuracy_pct'], calib['count'])):
        ax.text(i - width/2, acc_pct + 2, f'n={n_obs}', 
                ha='center', fontsize=8, color='gray')

plt.suptitle('Confidence Calibration: Model is Overconfident', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/fig7_confidence_calibration.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 8: Confidence by Subscale (ETHICS)
#
# Left: mean confidence next to accuracy (%) for each subscale, annotated with
# the signed gap (confidence minus accuracy). Right: mean confidence for
# incorrect versus correct answers.
# NOTE(review): plotting both on one axis assumes confidence is on a 0-100
# scale - confirm against the data.

if ethics_df is not None and 'subscale' in ethics_df.columns and 'confidence' in ethics_df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    valid = ethics_df[ethics_df['confidence'].notna() & ethics_df['correct'].notna()]
    
    # Left: Mean confidence by subscale
    ax = axes[0]
    conf_by_sub = valid.groupby('subscale')['confidence'].mean()
    acc_by_sub = valid.groupby('subscale')['correct'].mean() * 100
    
    x = np.arange(len(conf_by_sub))
    width = 0.35
    
    bars1 = ax.bar(x - width/2, conf_by_sub.values, width, label='Confidence', color='steelblue')
    bars2 = ax.bar(x + width/2, acc_by_sub.values, width, label='Accuracy', color='coral')
    
    ax.set_xlabel('Subscale')
    ax.set_ylabel('Score')
    ax.set_title('Confidence vs Accuracy by Subscale')
    ax.set_xticks(x)
    ax.set_xticklabels([s.capitalize() for s in conf_by_sub.index])
    ax.legend()
    
    # Add gap annotations. The '+' format flag prints the real sign, so a
    # negative gap reads '-3' rather than '+-3'.
    for i, (sub, conf) in enumerate(conf_by_sub.items()):
        acc = acc_by_sub[sub]
        gap = conf - acc
        ax.annotate(f'{gap:+.0f}', xy=(i, max(conf, acc) + 2), 
                    ha='center', fontsize=10, color='red' if gap > 5 else 'gray')
    
    # Right: Confidence when correct vs incorrect
    ax = axes[1]
    
    # Rename and color by outcome value rather than by position, so the labels
    # stay right when only one outcome is present.
    outcome_names = {False: 'Incorrect', True: 'Correct'}
    outcome_colors = {'Incorrect': '#e74c3c', 'Correct': '#2ecc71'}
    
    conf_correct = valid.groupby(['subscale', 'correct'])['confidence'].mean().unstack()
    conf_correct = conf_correct.rename(columns=outcome_names)
    
    conf_correct.plot(kind='bar', ax=ax,
                      color=[outcome_colors.get(col, 'gray') for col in conf_correct.columns])
    ax.set_xlabel('Subscale')
    ax.set_ylabel('Mean Confidence')
    ax.set_title('Confidence When Correct vs Incorrect')
    ax.set_xticklabels([s.capitalize() for s in conf_correct.index], rotation=0)
    ax.legend(title='Answer')
    
    plt.suptitle('ETHICS: Confidence Analysis by Subscale', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig('../outputs/figures/fig8_confidence_by_subscale.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Figure 9: Confidence by Reflection Level
#
# One violin per reflection level showing the distribution of reported
# confidence, with mean and median markers; one panel per benchmark.

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

benchmarks = [(ethics_df, "ETHICS"), (morables_df, "MORABLES")]

for ax, (df, name) in zip(axes, benchmarks):
    has_confidence = df is not None and 'confidence' in df.columns
    if not has_confidence:
        continue
    
    # Rows that actually report a confidence value
    valid = df.dropna(subset=['confidence'])
    
    # One array of confidence values per level, in ascending level order
    levels = sorted(valid['level'].unique())
    data = [valid.loc[valid['level'] == level, 'confidence'].values
            for level in levels]
    
    parts = ax.violinplot(data, positions=levels, showmeans=True, showmedians=True)
    
    # Uniform fill for every violin body
    for body in parts['bodies']:
        body.set_facecolor('steelblue')
        body.set_alpha(0.6)
    
    ax.set_title(f'{name}: Confidence Distribution by Level')
    ax.set_xlabel('Reflection Level')
    ax.set_ylabel('Confidence')
    ax.set_xticks(levels)
    ax.set_ylim(0, 105)

plt.suptitle('Confidence Distributions Across Reflection Levels', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/fig9_confidence_by_level.png', dpi=300, bbox_inches='tight')
plt.show()

---
## Part 4: Summary Statistics

In [None]:
# Table 1: Summary by Benchmark and Condition
#
# One row per benchmark x reflection level x thinking mode, with the row count
# and, where the columns exist, extraction rate, accuracy, and confidence
# mean/std. The table is displayed and written to CSV.

summary_data = []

benchmarks = [(ethics_df, "ETHICS"), (mc_df, "MoralChoice"), (morables_df, "MORABLES")]

for df, name in benchmarks:
    if df is None:
        continue
    
    for level in sorted(df['level'].unique()):
        at_level = df[df['level'] == level]
        
        for thinking in (False, True):
            subset = at_level[at_level['thinking'] == thinking]
            
            row = {
                'Benchmark': name,
                'Level': level,
                'Thinking': 'On' if thinking else 'Off',
                'N': len(subset),
            }
            
            # Share of rows where an answer could be extracted
            if 'extracted_answer' in subset.columns:
                extracted_rate = subset['extracted_answer'].notna().mean()
                row['Extraction %'] = round(extracted_rate * 100, 1)
            
            # Accuracy over scored rows only
            if 'correct' in subset.columns:
                scored = subset[subset['correct'].notna()]
                if len(scored) > 0:
                    row['Accuracy %'] = round(scored['correct'].mean() * 100, 1)
            
            # Confidence mean and spread over rows that report one
            if 'confidence' in subset.columns:
                reported = subset['confidence'].dropna()
                if len(reported) > 0:
                    row['Mean Conf'] = round(reported.mean(), 1)
                    row['Conf Std'] = round(reported.std(), 1)
            
            summary_data.append(row)

summary_df = pd.DataFrame(summary_data)
print("Summary Statistics:")
display(summary_df)

# Persist the table alongside the figures
summary_df.to_csv('../outputs/summary_by_condition.csv', index=False)

In [None]:
# Table 2: ETHICS by Subscale
#
# Accuracy (and mean confidence when available) for every subscale x level
# combination, shown as a subscale-by-level accuracy grid and saved in long
# form to CSV.

if ethics_df is not None and 'subscale' in ethics_df.columns:
    subscale_summary = []
    
    valid = ethics_df[ethics_df['correct'].notna()]
    has_confidence = 'confidence' in valid.columns
    level_order = sorted(valid['level'].unique())
    
    for subscale in valid['subscale'].unique():
        in_subscale = valid['subscale'] == subscale
        
        for level in level_order:
            subset = valid[in_subscale & (valid['level'] == level)]
            
            row = {
                'Subscale': subscale.capitalize(),
                'Level': level,
                'N': len(subset),
                'Accuracy %': round(subset['correct'].mean() * 100, 1),
            }
            
            # Mean confidence only when at least one row reports it
            if has_confidence:
                reported = subset['confidence'].dropna()
                if len(reported) > 0:
                    row['Mean Conf'] = round(reported.mean(), 1)
            
            subscale_summary.append(row)
    
    subscale_df = pd.DataFrame(subscale_summary)
    
    print("\nETHICS by Subscale:")
    pivot = subscale_df.pivot(index='Subscale', columns='Level', values='Accuracy %')
    display(pivot)
    
    subscale_df.to_csv('../outputs/ethics_by_subscale.csv', index=False)

In [None]:
# Table 3: Key Findings Summary
#
# Prints, per benchmark: accuracy at level 0 versus the highest level, the best
# and worst level x thinking conditions, the thinking effect at level 0, and
# the gap between mean confidence and accuracy.

print("="*70)
print("KEY FINDINGS SUMMARY")
print("="*70)

# Kept as an empty placeholder; nothing in this cell adds to it.
findings = []

for df, name in [(ethics_df, "ETHICS"), (morables_df, "MORABLES")]:
    if df is None or 'correct' not in df.columns:
        continue
    
    valid = df[df['correct'].notna()]
    
    # Accuracy at L0 vs the highest level present
    l0 = valid[valid['level'] == 0]['correct'].mean() * 100
    l5 = valid[valid['level'] == valid['level'].max()]['correct'].mean() * 100
    
    # Best and worst conditions
    acc_by_cond = valid.groupby(['level', 'thinking'])['correct'].mean() * 100
    best = acc_by_cond.idxmax()
    worst = acc_by_cond.idxmin()
    
    # Thinking effect at L0
    l0_no_think = valid[(valid['level'] == 0) & (valid['thinking'] == False)]['correct'].mean() * 100
    l0_with_think = valid[(valid['level'] == 0) & (valid['thinking'] == True)]['correct'].mean() * 100
    
    print(f"\n{name}:")
    print(f"  Reflection effect: L0 ({l0:.1f}%) → Max Level ({l5:.1f}%) = {l5-l0:+.1f}%")
    print(f"  Best condition: Level {best[0]}, Thinking {'ON' if best[1] else 'OFF'} ({acc_by_cond[best]:.1f}%)")
    print(f"  Worst condition: Level {worst[0]}, Thinking {'ON' if worst[1] else 'OFF'} ({acc_by_cond[worst]:.1f}%)")
    print(f"  L0 Thinking effect: {l0_with_think - l0_no_think:+.1f}%")
    
    if 'confidence' in valid.columns:
        # Compare confidence and accuracy over the same rows: responses without
        # a confidence value would otherwise count toward accuracy only.
        scored = valid[valid['confidence'].notna()]
        if len(scored) > 0:
            mean_conf = scored['confidence'].mean()
            mean_acc = scored['correct'].mean() * 100
            print(f"  Calibration gap: Confidence ({mean_conf:.1f}) - Accuracy ({mean_acc:.1f}) = {mean_conf - mean_acc:+.1f}")

---
## Part 5: Statistical Tests

In [None]:
# Statistical tests for key findings
#
# Per benchmark:
#   1. Pearson correlation between reflection level and per-level accuracy
#   2. Chi-square tests of correctness: level 0 vs all other levels, and
#      level 0 vs the highest level
#   3. Two-proportion z-test for the thinking effect at level 0

print("="*70)
print("STATISTICAL TESTS")
print("="*70)

for df, name in [(ethics_df, "ETHICS"), (morables_df, "MORABLES")]:
    if df is None or 'correct' not in df.columns:
        continue
    
    valid = df[df['correct'].notna()]
    
    print(f"\n{name}:")
    
    # 1. Correlation between level and accuracy.
    # Computed on per-level means, so the sample size is the number of levels;
    # at least two levels are required.
    acc_by_level = valid.groupby('level')['correct'].mean()
    if len(acc_by_level) >= 2:
        r, p = stats.pearsonr(acc_by_level.index, acc_by_level.values)
        print(f"  Level-Accuracy correlation: r={r:.3f}, p={p:.4f}")
    
    # 2a. L0 vs all other levels (chi-square).
    # Needs both groups and both outcomes, i.e. at least a 2x2 table.
    contingency = pd.crosstab(valid['level'].isin([0]), valid['correct'])
    if min(contingency.shape) > 1:
        chi2, p_chi, dof, expected = stats.chi2_contingency(contingency)
        print(f"  L0 vs Others (chi-square): χ²={chi2:.2f}, p={p_chi:.4f}")
    
    # 2b. L0 vs the highest level (chi-square): the comparison that matches
    # the L0 -> max-level accuracy change reported in the summaries.
    max_level = valid['level'].max()
    endpoints = valid[valid['level'].isin([0, max_level])]
    endpoint_table = pd.crosstab(endpoints['level'], endpoints['correct'])
    if min(endpoint_table.shape) > 1:
        chi2, p_chi, dof, expected = stats.chi2_contingency(endpoint_table)
        print(f"  L0 vs L{max_level} (chi-square): χ²={chi2:.2f}, p={p_chi:.4f}")
    
    # 3. Thinking effect at L0
    l0_no_think = valid[(valid['level'] == 0) & (valid['thinking'] == False)]['correct']
    l0_with_think = valid[(valid['level'] == 0) & (valid['thinking'] == True)]['correct']
    
    if len(l0_no_think) > 0 and len(l0_with_think) > 0:
        # Proportion z-test with a pooled standard error
        p1 = l0_no_think.mean()
        p2 = l0_with_think.mean()
        n1, n2 = len(l0_no_think), len(l0_with_think)
        p_pooled = (p1*n1 + p2*n2) / (n1 + n2)
        se = np.sqrt(p_pooled * (1-p_pooled) * (1/n1 + 1/n2))
        # se is 0 when every response has the same outcome; report z = 0 then
        z = (p1 - p2) / se if se > 0 else 0
        p_val = 2 * (1 - stats.norm.cdf(abs(z)))  # two-sided
        print(f"  L0 Thinking effect (z-test): z={z:.2f}, p={p_val:.4f}")

In [None]:
# Inventory of expected figures: report which files exist on disk
rule = "=" * 70
print(f"\n{rule}")
print("FIGURES GENERATED")
print(rule)

# (file name, description) for every figure the notebook saves
figures = [
    ("fig1_reflection_decreases_accuracy.png", "Primary finding: accuracy decreases with reflection level"),
    ("fig2_thinking_interaction.png", "Extended thinking interaction with reflection level"),
    ("fig3_heatmap_all_conditions.png", "Heatmap of performance across all conditions"),
    ("fig4_ethics_by_subscale.png", "ETHICS accuracy by moral framework"),
    ("fig5_ethics_subscale_bars.png", "ETHICS subscale comparison bar chart"),
    ("fig6_moralchoice_ambiguity.png", "MoralChoice analysis by ambiguity level"),
    ("fig7_confidence_calibration.png", "Confidence calibration curves"),
    ("fig8_confidence_by_subscale.png", "ETHICS confidence analysis by subscale"),
    ("fig9_confidence_by_level.png", "Confidence distributions by reflection level"),
]

for fname, desc in figures:
    # Check mark when the file is present, cross otherwise
    found = (Path('../outputs/figures') / fname).exists()
    mark = "✓" if found else "✗"
    print(f"  {mark} {fname}: {desc}")