# Consistency Analysis: Variance Across Runs

This notebook analyzes the consistency of model responses across multiple runs (N=3) for each condition.

**Key Questions:**
1. How consistent are answers across runs? (Same item, same condition, different runs)
2. Does reflection level affect response consistency?
3. How variable is confidence across runs?
4. Are certain subscales/categories more consistent than others?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats

# Plot appearance: whitegrid base style plus notebook-wide size defaults
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})

# Palettes: one colour per reflection level, one per thinking flag
LEVEL_COLORS = {
    0: '#2ecc71',
    2: '#3498db',
    4: '#9b59b6',
    5: '#e74c3c',
}
THINKING_COLORS = {
    False: '#27ae60',
    True: '#c0392b',
}

# Make sure the directory the figure cells save into exists
Path('../outputs/figures').mkdir(parents=True, exist_ok=True)

In [None]:
# Load the per-benchmark checkpoint CSVs; a benchmark whose file is absent is None
results_dir = Path("../results/raw")


def _read_checkpoint(filename):
    """Return results_dir/filename as a DataFrame, or None when the file does not exist."""
    csv_path = results_dir / filename
    if csv_path.exists():
        return pd.read_csv(csv_path)
    return None


ethics_df = _read_checkpoint("ethics_checkpoint.csv")
mc_df = _read_checkpoint("moralchoice_checkpoint.csv")
morables_df = _read_checkpoint("morables_checkpoint.csv")

# One line per loaded benchmark: row count, distinct items, distinct runs
print("Data Loaded:")
loaded_frames = {'ETHICS': ethics_df, 'MoralChoice': mc_df, 'MORABLES': morables_df}
for name, df in loaded_frames.items():
    if df is None:
        continue
    n_items = df['item_id'].nunique()
    n_runs = df['run'].nunique()
    print(f"  {name}: {len(df):,} obs, {n_items} items, {n_runs} runs")

---
## Part 1: Answer Consistency Across Runs

For each item × level × thinking condition, how often does the model give the **same answer** across all 3 runs?

In [None]:
def calculate_consistency(df, min_runs=2):
    """Flag, per item × level × thinking condition, whether all runs gave the same answer.

    Rows whose ``extracted_answer`` is missing are ignored. The remaining rows
    are grouped by ``item_id``, ``level`` and ``thinking``.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Long-format results containing the columns ``item_id``, ``level``,
        ``thinking``, ``run`` and ``extracted_answer``.
    min_runs : int, default 2
        Minimum number of distinct runs with a non-missing answer that a
        condition needs in order to be scored. A condition with a single
        answer trivially has one unique answer, so scoring it would inflate
        the consistency rate. Pass ``min_runs=1`` to keep every condition.

    Returns
    -------
    pandas.DataFrame or None
        One row per scored condition with columns ``item_id``, ``level``,
        ``thinking``, ``n_unique_answers``, ``n_runs`` (distinct runs with an
        answer) and the boolean ``is_consistent``. ``None`` when ``df`` is
        ``None`` or has no ``extracted_answer`` column.
    """
    if df is None or 'extracted_answer' not in df.columns:
        return None

    valid = df[df['extracted_answer'].notna()]

    # Count distinct answers and distinct runs per condition. Using nunique for
    # runs means duplicated rows of the same run do not look like extra runs.
    consistency = valid.groupby(['item_id', 'level', 'thinking']).agg(
        n_unique_answers=('extracted_answer', 'nunique'),
        n_runs=('run', 'nunique'),
    ).reset_index()

    # Agreement is only meaningful when there are at least `min_runs` answers to compare
    consistency = consistency[consistency['n_runs'] >= min_runs].reset_index(drop=True)

    # Consistent = exactly one unique answer across the compared runs
    consistency['is_consistent'] = consistency['n_unique_answers'] == 1

    return consistency

# Build one consistency table per benchmark, in the order ETHICS, MoralChoice, MORABLES
ethics_cons, mc_cons, morables_cons = (
    calculate_consistency(frame) for frame in (ethics_df, mc_df, morables_df)
)

In [None]:
# Figure 1: Answer Consistency by Reflection Level

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
benchmark_panels = [(ethics_cons, 'ETHICS'), (mc_cons, 'MoralChoice'), (morables_cons, 'MORABLES')]

for ax, (cons, name) in zip(axes, benchmark_panels):
    # Benchmarks without a consistency table get a placeholder panel
    if cons is None:
        ax.text(0.5, 0.5, f'No data for {name}', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue

    # Percentage of item × condition groups flagged consistent, per level
    pct_by_level = cons.groupby('level')['is_consistent'].mean().mul(100)
    level_ticks = pct_by_level.index
    bar_colors = [LEVEL_COLORS.get(level, 'gray') for level in level_ticks]

    rects = ax.bar(level_ticks, pct_by_level.values,
                   color=bar_colors, edgecolor='black', linewidth=1.2)

    # Print each percentage just above its bar
    for rect, pct in zip(rects, pct_by_level.values):
        ax.text(rect.get_x() + rect.get_width()/2, rect.get_height() + 1,
                f'{pct:.1f}%', ha='center', va='bottom', fontsize=11, fontweight='bold')

    ax.set(xlabel='Reflection Level', ylabel='Consistency Rate (%)', title=f'{name}')
    ax.set_ylim(0, 105)
    ax.set_xticks(level_ticks)
    # Faint reference line at perfect consistency
    ax.axhline(y=100, color='gray', linestyle='--', alpha=0.3)

plt.suptitle('Answer Consistency Across Runs by Reflection Level\n(% of items with identical answers in all 3 runs)', 
             fontsize=14, fontweight='bold', y=1.05)
plt.tight_layout()
plt.savefig('../outputs/figures/consistency_fig1_by_level.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 2: Consistency by Level and Thinking Mode
#
# One panel per benchmark. Within a panel there is one bar group per reflection
# level and one bar per thinking mode; bar height is the percentage of
# item × condition groups flagged consistent.

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, (cons, name) in zip(axes, [(ethics_cons, 'ETHICS'), (mc_cons, 'MoralChoice'), (morables_cons, 'MORABLES')]):
    if cons is None:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue
    
    levels = sorted(cons['level'].unique())
    x = np.arange(len(levels))
    width = 0.35
    
    for i, (thinking, color, label) in enumerate([(False, THINKING_COLORS[False], 'No Thinking'),
                                                   (True, THINKING_COLORS[True], 'With Thinking')]):
        subset = cons[cons['thinking'] == thinking]
        # A level with no rows for this thinking mode yields a NaN rate
        rates = [subset[subset['level'] == l]['is_consistent'].mean() * 100 for l in levels]
        
        # Offset the two modes half a bar width either side of the group centre
        bars = ax.bar(x + (i - 0.5) * width, rates, width, label=label, color=color, alpha=0.8)
        
        # Value labels; skip NaN rates so no "nan" text is placed at a non-finite position
        for bar, val in zip(bars, rates):
            if pd.isna(val):
                continue
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                    f'{val:.0f}', ha='center', va='bottom', fontsize=9)
    
    ax.set_xlabel('Reflection Level')
    ax.set_ylabel('Consistency Rate (%)')
    ax.set_title(f'{name}')
    ax.set_xticks(x)
    ax.set_xticklabels(levels)
    ax.set_ylim(0, 110)
    ax.legend(loc='lower right')

plt.suptitle('Answer Consistency: Thinking Mode × Reflection Level', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/consistency_fig2_thinking_interaction.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 3: Heatmap of Consistency Across All Conditions
#
# One heatmap per benchmark: rows are reflection levels, columns are thinking
# modes, cells are the percentage of item × condition groups flagged consistent.

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, (cons, name) in zip(axes, [(ethics_cons, 'ETHICS'), (mc_cons, 'MoralChoice'), (morables_cons, 'MORABLES')]):
    if cons is None:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue
    
    # Pivot for heatmap: level rows × thinking columns, values in percent
    pivot = cons.groupby(['level', 'thinking'])['is_consistent'].mean().unstack() * 100
    # Relabel by key, not by position: a positional assignment raises when only
    # one thinking mode is present in the data.
    pivot = pivot.rename(columns={False: 'No Thinking', True: 'With Thinking'})
    
    sns.heatmap(pivot, annot=True, fmt='.1f', cmap='RdYlGn', 
                center=80, vmin=50, vmax=100, ax=ax,
                cbar_kws={'label': 'Consistency %'})
    ax.set_title(f'{name}')
    ax.set_xlabel('Thinking Mode')
    ax.set_ylabel('Reflection Level')

plt.suptitle('Answer Consistency Heatmap (% identical across 3 runs)', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/consistency_fig3_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

---
## Part 2: Accuracy Variance Across Runs

How much does accuracy vary across the 3 runs for each condition?

In [None]:
# Figure 4: Accuracy Variance by Run
#
# One panel per benchmark that has a `correct` column. Each line is a
# reflection level; the x axis is the run and the y axis is accuracy in percent.

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (df, name) in zip(axes, [(ethics_df, 'ETHICS'), (morables_df, 'MORABLES')]):
    if df is None or 'correct' not in df.columns:
        ax.text(0.5, 0.5, f'No data for {name}', ha='center', va='center', transform=ax.transAxes)
        # Title the placeholder panel so it is clear which benchmark is missing
        ax.set_title(name)
        continue
    
    valid = df[df['correct'].notna()]
    
    # Calculate accuracy by level, thinking, and run
    acc_by_run = valid.groupby(['level', 'thinking', 'run'])['correct'].mean() * 100
    acc_by_run = acc_by_run.reset_index()
    
    # Plot each level
    levels = sorted(valid['level'].unique())
    lowest_plotted = np.inf  # smallest accuracy drawn, used for the y-axis floor
    
    for level in levels:
        level_data = acc_by_run[acc_by_run['level'] == level]
        
        # Aggregate across thinking for simplicity (unweighted mean of the
        # per-thinking-mode accuracies for each run)
        by_run = level_data.groupby('run')['correct'].mean()
        
        ax.plot(by_run.index, by_run.values, marker='o', markersize=8,
                linewidth=2, color=LEVEL_COLORS.get(level, 'gray'),
                label=f'Level {level}')
        
        if len(by_run) > 0:
            lowest_plotted = min(lowest_plotted, by_run.min())
    
    ax.set_xlabel('Run Number')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title(f'{name}: Accuracy by Run')
    ax.set_xticks(valid['run'].unique())
    # Only request a legend when at least one line was drawn
    if levels:
        ax.legend(loc='lower right')
    # Keep the 40-100 window unless a line would fall below it; then extend
    # the floor so no data is silently cut off.
    y_floor = 40
    if np.isfinite(lowest_plotted) and lowest_plotted < 40:
        y_floor = max(0, np.floor(lowest_plotted) - 5)
    ax.set_ylim(y_floor, 100)

plt.suptitle('Accuracy Stability Across Runs', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/consistency_fig4_accuracy_by_run.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 5: Accuracy Standard Deviation by Condition
#
# One panel per benchmark that has a `correct` column. Each bar is the
# percentage of item × level × thinking groups, per reflection level, whose
# `correct` value was not the same in every run.

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (df, name) in zip(axes, [(ethics_df, 'ETHICS'), (morables_df, 'MORABLES')]):
    if df is None or 'correct' not in df.columns:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        # Title the placeholder panel so it is clear which benchmark is missing
        ax.set_title(name)
        continue
    
    valid = df[df['correct'].notna()]
    
    # Calculate per-item accuracy variance across runs
    item_variance = valid.groupby(['item_id', 'level', 'thinking']).agg(
        mean_correct=('correct', 'mean'),
        std_correct=('correct', 'std'),
        n_runs=('run', 'count')
    ).reset_index()
    
    # A group with a single row has an undefined (NaN) std and cannot show
    # disagreement. Drop it so it does not count as "no variance" in the denominator.
    item_variance = item_variance[item_variance['n_runs'] >= 2].copy()
    
    # Mean std by level (items with variance > 0 means inconsistent)
    # For binary outcomes, std > 0 means not all runs agreed
    item_variance['has_variance'] = item_variance['std_correct'] > 0
    
    variance_rate = item_variance.groupby('level')['has_variance'].mean() * 100
    
    bars = ax.bar(variance_rate.index, variance_rate.values,
                  color=[LEVEL_COLORS.get(l, 'gray') for l in variance_rate.index],
                  edgecolor='black', linewidth=1.2)
    
    for bar, val in zip(bars, variance_rate.values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                f'{val:.1f}%', ha='center', va='bottom', fontsize=11, fontweight='bold')
    
    ax.set_xlabel('Reflection Level')
    ax.set_ylabel('% Items with Answer Variance')
    ax.set_title(f'{name}: Items with Inconsistent Answers Across Runs')
    ax.set_xticks(variance_rate.index)
    # Upper limit: 30% headroom over the tallest bar, but never less than 3
    # points above it. This avoids a degenerate ylim(0, 0) when no item varies
    # and keeps the value labels (drawn 1 point above each bar) inside the
    # axes for small rates. For rates of 10% or more this matches max * 1.3.
    if len(variance_rate) > 0:
        peak_rate = variance_rate.max()
        y_top = max(peak_rate * 1.3, peak_rate + 3)
    else:
        y_top = 50
    ax.set_ylim(0, y_top)

plt.suptitle('Answer Variance Rate by Reflection Level\n(% of items where accuracy differed across runs)', 
             fontsize=14, fontweight='bold', y=1.05)
plt.tight_layout()
plt.savefig('../outputs/figures/consistency_fig5_variance_rate.png', dpi=300, bbox_inches='tight')
plt.show()

---
## Part 3: Confidence Variance Across Runs

In [None]:
# Figure 6: Confidence Variance by Reflection Level

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
benchmark_frames = [(ethics_df, 'ETHICS'), (mc_df, 'MoralChoice'), (morables_df, 'MORABLES')]

for ax, (df, name) in zip(axes, benchmark_frames):
    # Placeholder panel when the benchmark is missing or has no confidence column
    if df is None or 'confidence' not in df.columns:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue

    valid = df[df['confidence'].notna()]

    # Per item × level × thinking: mean and std of confidence, plus row count
    conf_var = (
        valid.groupby(['item_id', 'level', 'thinking'])
        .agg(
            mean_conf=('confidence', 'mean'),
            std_conf=('confidence', 'std'),
            n_runs=('run', 'count'),
        )
        .reset_index()
    )

    # Std is NaN for a group with a single row; treat that as zero spread
    conf_var['std_conf'] = conf_var['std_conf'].fillna(0)

    # One box per reflection level, positioned at the level's numeric value
    levels = sorted(conf_var['level'].unique())
    std_by_level = [conf_var.loc[conf_var['level'] == level, 'std_conf'].values
                    for level in levels]

    bp = ax.boxplot(std_by_level, positions=levels, widths=0.6, patch_artist=True)

    # Colour each box with its level's palette entry
    for box, level in zip(bp['boxes'], levels):
        box.set_facecolor(LEVEL_COLORS.get(level, 'gray'))
        box.set_alpha(0.7)

    ax.set(xlabel='Reflection Level', ylabel='Confidence Std Dev (across runs)', title=f'{name}')
    ax.set_xticks(levels)

plt.suptitle('Confidence Variability Across Runs by Reflection Level', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/consistency_fig6_confidence_variance.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 7: Mean Confidence Std by Condition (Bar Chart)
#
# One panel per benchmark with a `confidence` column. Bars show the mean,
# over items, of the per-item standard deviation of confidence across runs,
# grouped by reflection level and split by thinking mode.

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, (df, name) in zip(axes, [(ethics_df, 'ETHICS'), (mc_df, 'MoralChoice'), (morables_df, 'MORABLES')]):
    if df is None or 'confidence' not in df.columns:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue
    
    valid = df[df['confidence'].notna()]
    
    # Calculate per-item confidence std
    conf_var = valid.groupby(['item_id', 'level', 'thinking'])['confidence'].std().reset_index()
    conf_var.columns = ['item_id', 'level', 'thinking', 'std_conf']
    # Std is NaN for a group with a single row; treat that as zero spread
    conf_var['std_conf'] = conf_var['std_conf'].fillna(0)
    
    # Mean std by level and thinking
    mean_std = conf_var.groupby(['level', 'thinking'])['std_conf'].mean().unstack()
    # Pick colours and labels by the thinking flag, not by position: positional
    # relabelling raises when only one thinking mode is present, and positional
    # colours would then be assigned to the wrong mode.
    bar_colors = [THINKING_COLORS[bool(flag)] for flag in mean_std.columns]
    mean_std = mean_std.rename(columns={False: 'No Thinking', True: 'With Thinking'})
    
    mean_std.plot(kind='bar', ax=ax, color=bar_colors, 
                  edgecolor='black', linewidth=1)
    
    ax.set_xlabel('Reflection Level')
    ax.set_ylabel('Mean Confidence Std Dev')
    ax.set_title(f'{name}')
    ax.legend(title='Thinking')
    # pandas rotates bar tick labels by default; keep them horizontal
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

plt.suptitle('Mean Confidence Standard Deviation Across Runs', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/consistency_fig7_mean_conf_std.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 8: Confidence by Run (Line Plot)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
benchmark_frames = [(ethics_df, 'ETHICS'), (mc_df, 'MoralChoice'), (morables_df, 'MORABLES')]

for ax, (df, name) in zip(axes, benchmark_frames):
    # Placeholder panel when the benchmark is missing or has no confidence column
    if df is None or 'confidence' not in df.columns:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(name)
        continue

    valid = df[df['confidence'].notna()]

    # One error-bar line per reflection level: the mean confidence of all rows
    # in a run, with the std of those rows as the error bar
    for level in sorted(valid['level'].unique()):
        rows_at_level = valid.loc[valid['level'] == level]
        run_stats = rows_at_level.groupby('run')['confidence'].agg(['mean', 'std'])

        ax.errorbar(run_stats.index, run_stats['mean'], yerr=run_stats['std'],
                    marker='o', markersize=8, linewidth=2, capsize=4,
                    color=LEVEL_COLORS.get(level, 'gray'), label=f'Level {level}')

    ax.set(xlabel='Run Number', ylabel='Mean Confidence (± std)', title=f'{name}')
    ax.set_xticks(valid['run'].unique())
    ax.legend(loc='lower right')
    ax.set_ylim(50, 100)

plt.suptitle('Confidence Stability Across Runs', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/figures/consistency_fig8_confidence_by_run.png', dpi=300, bbox_inches='tight')
plt.show()

---
## Part 4: Consistency by Subscale/Category

In [None]:
# Figure 9: ETHICS Consistency by Subscale
#
# Grouped bars: one group per subscale, one bar per reflection level; bar
# height is the percentage of item × level × thinking groups in that subscale
# whose non-missing answers were all identical.

if ethics_df is not None and 'subscale' in ethics_df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    
    # Recalculate consistency with subscale
    valid = ethics_df[ethics_df['extracted_answer'].notna()]
    
    cons = valid.groupby(['item_id', 'level', 'thinking', 'subscale']).agg(
        n_unique=('extracted_answer', 'nunique')
    ).reset_index()
    cons['is_consistent'] = cons['n_unique'] == 1
    
    # Consistency by subscale and level
    subscales = ['commonsense', 'deontology', 'virtue']
    levels = sorted(cons['level'].unique())
    x = np.arange(len(subscales))
    # 0.2 fits up to four levels per group; shrink the bars for more levels so
    # adjacent subscale groups do not run into each other.
    width = min(0.2, 0.8 / max(len(levels), 1))
    
    for i, level in enumerate(levels):
        # A subscale with no rows at this level yields a NaN rate
        rates = [cons[(cons['subscale'] == s) & (cons['level'] == level)]['is_consistent'].mean() * 100 
                 for s in subscales]
        bars = ax.bar(x + i*width, rates, width, label=f'Level {level}', 
                      color=LEVEL_COLORS.get(level, 'gray'), edgecolor='black')
        
        # Value labels; skip NaN rates so no "nan" text is placed at a non-finite position
        for bar, val in zip(bars, rates):
            if pd.isna(val):
                continue
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                    f'{val:.0f}', ha='center', va='bottom', fontsize=8)
    
    ax.set_xlabel('Subscale')
    ax.set_ylabel('Consistency Rate (%)')
    ax.set_title('ETHICS: Answer Consistency by Subscale and Reflection Level')
    # Centre each tick under its bar group for any number of levels
    # (equals x + width * 1.5 when there are four levels)
    ax.set_xticks(x + width * (len(levels) - 1) / 2)
    ax.set_xticklabels([s.capitalize() for s in subscales])
    ax.legend(title='Reflection Level')
    ax.set_ylim(0, 110)
    
    plt.tight_layout()
    plt.savefig('../outputs/figures/consistency_fig9_ethics_subscale.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("ETHICS subscale data not available")

In [None]:
# Figure 10: MoralChoice Consistency by Ambiguity
#
# Grouped bars: one group per ambiguity value ('low', 'high'), one bar per
# reflection level; bar height is the percentage of item × level × thinking
# groups whose non-missing answers were all identical.

if mc_df is not None and 'ambiguity' in mc_df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    
    valid = mc_df[mc_df['extracted_answer'].notna()]
    
    cons = valid.groupby(['item_id', 'level', 'thinking', 'ambiguity']).agg(
        n_unique=('extracted_answer', 'nunique')
    ).reset_index()
    cons['is_consistent'] = cons['n_unique'] == 1
    
    # Consistency by ambiguity and level
    ambiguities = ['low', 'high']
    levels = sorted(cons['level'].unique())
    x = np.arange(len(ambiguities))
    # 0.2 fits up to four levels per group; shrink the bars for more levels so
    # the two ambiguity groups do not run into each other.
    width = min(0.2, 0.8 / max(len(levels), 1))
    
    for i, level in enumerate(levels):
        # An ambiguity value with no rows at this level yields a NaN rate
        rates = [cons[(cons['ambiguity'] == a) & (cons['level'] == level)]['is_consistent'].mean() * 100 
                 for a in ambiguities]
        bars = ax.bar(x + i*width, rates, width, label=f'Level {level}',
                      color=LEVEL_COLORS.get(level, 'gray'), edgecolor='black')
        
        # Value labels; skip NaN rates so no "nan" text is placed at a non-finite position
        for bar, val in zip(bars, rates):
            if pd.isna(val):
                continue
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                    f'{val:.0f}', ha='center', va='bottom', fontsize=9)
    
    ax.set_xlabel('Ambiguity Level')
    ax.set_ylabel('Consistency Rate (%)')
    ax.set_title('MoralChoice: Answer Consistency by Ambiguity and Reflection Level')
    # Centre each tick under its bar group for any number of levels
    # (equals x + width * 1.5 when there are four levels)
    ax.set_xticks(x + width * (len(levels) - 1) / 2)
    ax.set_xticklabels(['Low Ambiguity', 'High Ambiguity'])
    ax.legend(title='Reflection Level')
    ax.set_ylim(0, 110)
    
    plt.tight_layout()
    plt.savefig('../outputs/figures/consistency_fig10_mc_ambiguity.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("MoralChoice ambiguity data not available")

---
## Part 5: Summary Statistics

In [None]:
# Summary Table: Consistency Metrics

summary_data = []
benchmark_tables = [
    (ethics_df, 'ETHICS', ethics_cons),
    (mc_df, 'MoralChoice', mc_cons),
    (morables_df, 'MORABLES', morables_cons),
]

for df, name, cons in benchmark_tables:
    # Skip benchmarks that were not loaded or have no consistency table
    if df is None or cons is None:
        continue

    has_confidence = 'confidence' in df.columns

    # One output row per level × thinking-mode combination
    for level in sorted(df['level'].unique()):
        for thinking in (False, True):
            in_condition = (cons['level'] == level) & (cons['thinking'] == thinking)
            subset_cons = cons[in_condition]

            row = {
                'Benchmark': name,
                'Level': level,
                'Thinking': 'On' if thinking else 'Off',
                'N_Items': len(subset_cons),
                'Consistency %': round(subset_cons['is_consistent'].mean() * 100, 1),
            }

            # Mean over items of the per-item confidence std, when confidence exists
            if has_confidence:
                has_conf_in_condition = (
                    (df['level'] == level)
                    & (df['thinking'] == thinking)
                    & df['confidence'].notna()
                )
                subset_df = df[has_conf_in_condition]
                if len(subset_df) > 0:
                    conf_std = subset_df.groupby('item_id')['confidence'].std().mean()
                    # An undefined mean std (every item has a single row) is reported as 0
                    row['Mean Conf Std'] = 0 if pd.isna(conf_std) else round(conf_std, 2)

            summary_data.append(row)

summary_df = pd.DataFrame(summary_data)
print("Consistency Summary:")
display(summary_df)

# Persist the table next to the figures directory
summary_df.to_csv('../outputs/consistency_summary.csv', index=False)

In [None]:
# Key Findings

banner = "=" * 70
print(banner)
print("KEY CONSISTENCY FINDINGS")
print(banner)

for cons, name in [(ethics_cons, 'ETHICS'), (mc_cons, 'MoralChoice'), (morables_cons, 'MORABLES')]:
    # Benchmarks without a consistency table are left out of the report
    if cons is None:
        continue

    print(f"\n{name}:")
    flags = cons['is_consistent']

    # Share of all item × condition groups flagged consistent
    overall = flags.mean() * 100
    print(f"  Overall consistency: {overall:.1f}%")

    # Reflection levels with the highest and lowest consistency rate
    by_level = cons.groupby('level')['is_consistent'].mean() * 100
    best_level, worst_level = by_level.idxmax(), by_level.idxmin()
    print(f"  Most consistent: Level {best_level} ({by_level[best_level]:.1f}%)")
    print(f"  Least consistent: Level {worst_level} ({by_level[worst_level]:.1f}%)")

    # Consistency without vs. with thinking, and the difference in points
    no_think = flags[cons['thinking'] == False].mean() * 100
    with_think = flags[cons['thinking'] == True].mean() * 100
    print(f"  No thinking: {no_think:.1f}% | With thinking: {with_think:.1f}% (Δ = {with_think - no_think:+.1f}%)")

In [None]:
# Figure list
rule = "=" * 70
print("\n" + rule)
print("FIGURES GENERATED")
print(rule)

# (file name, description) for every figure this notebook saves
figures = [
    ("consistency_fig1_by_level.png", "Answer consistency by reflection level"),
    ("consistency_fig2_thinking_interaction.png", "Consistency: thinking × level interaction"),
    ("consistency_fig3_heatmap.png", "Consistency heatmap across all conditions"),
    ("consistency_fig4_accuracy_by_run.png", "Accuracy stability across runs"),
    ("consistency_fig5_variance_rate.png", "Answer variance rate by level"),
    ("consistency_fig6_confidence_variance.png", "Confidence variance box plots"),
    ("consistency_fig7_mean_conf_std.png", "Mean confidence std by condition"),
    ("consistency_fig8_confidence_by_run.png", "Confidence stability across runs"),
    ("consistency_fig9_ethics_subscale.png", "ETHICS consistency by subscale"),
    ("consistency_fig10_mc_ambiguity.png", "MoralChoice consistency by ambiguity"),
]

# Tick the figures that exist on disk, circle the ones that do not
figures_dir = Path('../outputs/figures')
for fname, desc in figures:
    if (figures_dir / fname).exists():
        status = "✓"
    else:
        status = "○"
    print(f"  {status} {fname}: {desc}")