# Gemini 3 Flash Experiment Analysis

This notebook analyzes the results of Gemini 3 Flash experiments across moral reasoning benchmarks.

**Experiment Design:**
- Fixed Chain-of-Thought prompts (level 2)
- Varying Gemini's `thinking_level` parameter: minimal, low, medium, high
- This isolates the effect of internal reasoning budget from prompt structure

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults shared by every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

## 1. Load Data

In [None]:
# Directories searched for Gemini result files.
results_dir = Path('../results/processed')
checkpoint_dir = Path('../results/raw')


def load_gemini_data(benchmark):
    """Read the Gemini results CSV for one benchmark.

    The processed results file is preferred; the raw checkpoint file is used
    only when the processed one does not exist.

    Args:
        benchmark: Benchmark key used in the file name (e.g. 'ethics').

    Returns:
        A DataFrame read from the first existing file, or None when neither
        file exists.
    """
    candidates = (
        results_dir / f'gemini_{benchmark}_results.csv',
        checkpoint_dir / f'gemini_{benchmark}_checkpoint.csv',
    )
    for candidate in candidates:
        if candidate.exists():
            return pd.read_csv(candidate)
    return None

ethics_df = load_gemini_data('ethics')
moralchoice_df = load_gemini_data('moralchoice')
morables_df = load_gemini_data('morables')

# Report what was loaded for each benchmark.
print("Gemini Experiment Data Summary")
print("=" * 40)
for label, frame in (('ETHICS', ethics_df), ('MoralChoice', moralchoice_df), ('MORABLES', morables_df)):
    if frame is None:
        print(f"{label}: No data found")
        continue
    print(f"{label}: {len(frame)} observations")
    if 'thinking_level' in frame.columns:
        levels = frame['thinking_level'].unique().tolist()
        print(f"  Thinking levels: {levels}")
    if 'run' in frame.columns:
        print(f"  Runs: {frame['run'].nunique()}")

## 2. Accuracy by Thinking Level

In [None]:
# Define thinking level order for consistent plotting
THINKING_LEVEL_ORDER = ['minimal', 'low', 'medium', 'high']


def calculate_accuracy_by_thinking_level(df, benchmark_name):
    """Calculate accuracy statistics by thinking level.

    Args:
        df: Results DataFrame, or None when the benchmark has no data. It must
            contain 'correct' and 'thinking_level' columns to be analyzed.
        benchmark_name: Label written to the 'benchmark' column of the result.

    Returns:
        None when df is None or a required column is missing. Otherwise a
        DataFrame with one row per thinking level and the columns
        'thinking_level' (ordered categorical), 'accuracy', 'n', 'std', 'se'
        and 'benchmark'. Rows follow THINKING_LEVEL_ORDER; levels not listed
        there are kept and sorted after the known ones.
    """
    if df is None or 'correct' not in df.columns or 'thinking_level' not in df.columns:
        return None

    # Rows without a correctness label are excluded from every statistic.
    valid_df = df[df['correct'].notna()].copy()
    # A CSV column mixing True/False with blanks loads as object dtype; cast it
    # so that mean/std are computed numerically.
    valid_df['correct'] = valid_df['correct'].astype(float)

    stats = valid_df.groupby('thinking_level').agg(
        accuracy=('correct', 'mean'),
        n=('correct', 'count'),
        std=('correct', 'std')
    ).reset_index()

    # Standard error of the mean; NaN for groups with a single observation.
    stats['se'] = stats['std'] / np.sqrt(stats['n'])
    stats['benchmark'] = benchmark_name

    # Keep unexpected levels as trailing categories instead of letting the
    # categorical conversion turn them into NaN (which would drop their label).
    unexpected = [lvl for lvl in stats['thinking_level'] if lvl not in THINKING_LEVEL_ORDER]
    stats['thinking_level'] = pd.Categorical(
        stats['thinking_level'],
        categories=THINKING_LEVEL_ORDER + unexpected,
        ordered=True
    )
    stats = stats.sort_values('thinking_level')

    return stats

# Accuracy tables for the benchmarks analyzed in this section.
ethics_acc = calculate_accuracy_by_thinking_level(ethics_df, 'ETHICS')
morables_acc = calculate_accuracy_by_thinking_level(morables_df, 'MORABLES')

# Print one line per thinking level: accuracy ± standard error and sample size.
print("\nAccuracy by Thinking Level")
print("=" * 50)
for label, table in (('ETHICS', ethics_acc), ('MORABLES', morables_acc)):
    if table is None:
        continue
    print(f"\n{label}:")
    for rec in table.itertuples(index=False):
        print(f"  {rec.thinking_level:8s}: {rec.accuracy:.3f} ± {rec.se:.3f} (n={int(rec.n)})")

In [None]:
# Plot accuracy by thinking level: one bar chart per benchmark, with
# 95% confidence-interval error bars (1.96 * standard error).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

benchmarks = [
    ('ETHICS', ethics_acc, axes[0]),
    ('MORABLES', morables_acc, axes[1])
]

# Colour per thinking level; reused by later plotting cells.
colors = {'minimal': '#e74c3c', 'low': '#f39c12', 'medium': '#3498db', 'high': '#27ae60'}

for name, acc_df, ax in benchmarks:
    if acc_df is not None:
        x = range(len(acc_df))
        bars = ax.bar(x, acc_df['accuracy'],
                      yerr=acc_df['se'] * 1.96,  # 95% CI
                      color=[colors.get(tl, 'gray') for tl in acc_df['thinking_level']],
                      capsize=5, alpha=0.8)
        ax.set_xticks(x)
        ax.set_xticklabels(acc_df['thinking_level'], rotation=0)
        ax.set_ylabel('Accuracy')
        ax.set_xlabel('Thinking Level')
        ax.set_title(f'{name} Accuracy by Thinking Level')
        ax.set_ylim(0, 1)
        # NOTE(review): 0.5 assumes a two-way choice; confirm the chance level for each benchmark.
        ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5, label='Chance')
        # The chance line carries a label, so draw the legend that displays it.
        ax.legend(loc='lower right')

        # Add value labels
        for i, (_, row) in enumerate(acc_df.iterrows()):
            ax.text(i, row['accuracy'] + 0.05, f"{row['accuracy']:.2f}",
                    ha='center', va='bottom', fontsize=10)
    else:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(f'{name} (No data)')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('../outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('../outputs/gemini_accuracy_by_thinking_level.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Response Characteristics by Thinking Level

In [None]:
def analyze_response_characteristics(df, name):
    """Summarize response metrics (mean and std) per thinking level.

    Only the metric columns actually present in df are summarized; the
    candidates are 'response_length', 'output_tokens', 'reasoning_markers'
    and 'uncertainty_markers'.

    Args:
        df: Results DataFrame, or None when the benchmark has no data.
        name: Benchmark label (not used inside the function).

    Returns:
        None when df is None or none of the metric columns exist. Otherwise a
        DataFrame with flattened '<metric>_<stat>' columns, sorted by
        THINKING_LEVEL_ORDER.
    """
    if df is None:
        return None

    candidate_columns = ('response_length', 'output_tokens', 'reasoning_markers', 'uncertainty_markers')
    present = [column for column in candidate_columns if column in df.columns]
    if len(present) == 0:
        return None

    per_level = df.groupby('thinking_level')[present]
    summary = per_level.agg(['mean', 'std']).reset_index()

    # Collapse the two-level column index into single '<metric>_<stat>' names.
    flat_names = []
    for parts in summary.columns:
        flat_names.append('_'.join(parts).strip('_'))
    summary.columns = flat_names

    # Sort rows by the canonical thinking-level order.
    summary['thinking_level'] = pd.Categorical(
        summary['thinking_level'],
        categories=THINKING_LEVEL_ORDER,
        ordered=True,
    )
    summary = summary.sort_values('thinking_level')
    return summary

# Print the response-characteristics table for every benchmark that has one.
for label, frame in (('ETHICS', ethics_df), ('MoralChoice', moralchoice_df), ('MORABLES', morables_df)):
    characteristics = analyze_response_characteristics(frame, label)
    if characteristics is None:
        continue
    print(f"\n{label} Response Characteristics:")
    print(characteristics.to_string(index=False))

In [None]:
# Plot response length distribution by thinking level: one boxplot panel per benchmark.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, (name, df) in zip(axes, [('ETHICS', ethics_df), ('MoralChoice', moralchoice_df), ('MORABLES', morables_df)]):
    # Both columns are needed; a frame lacking either gets the placeholder panel.
    if df is not None and 'response_length' in df.columns and 'thinking_level' in df.columns:
        # Work on a copy so the loaded frame keeps its original dtypes.
        df_plot = df.copy()
        df_plot['thinking_level'] = pd.Categorical(
            df_plot['thinking_level'],
            categories=THINKING_LEVEL_ORDER,
            ordered=True
        )

        sns.boxplot(data=df_plot, x='thinking_level', y='response_length',
                    ax=ax, palette=colors, order=THINKING_LEVEL_ORDER)
        ax.set_title(f'{name} Response Length')
        ax.set_xlabel('Thinking Level')
        # NOTE(review): the label assumes response_length is a word count; confirm against the data pipeline.
        ax.set_ylabel('Response Length (words)')
    else:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(f'{name} (No data)')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('../outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('../outputs/gemini_response_length_by_thinking.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Confidence Analysis

In [None]:
def analyze_confidence(df, name):
    """Summarize reported confidence per thinking level.

    Args:
        df: Results DataFrame, or None when the benchmark has no data.
        name: Benchmark label (not used inside the function).

    Returns:
        None when df is None, has no 'confidence' column, or has no non-null
        confidence values. Otherwise a DataFrame with columns
        'thinking_level', 'mean_confidence', 'std_confidence' and 'n',
        sorted by THINKING_LEVEL_ORDER.
    """
    if df is None or 'confidence' not in df.columns:
        return None

    # Keep only rows that carry a confidence score.
    scored = df.loc[df['confidence'].notna()].copy()
    if len(scored) == 0:
        return None

    per_level = scored.groupby('thinking_level')['confidence']
    conf_stats = per_level.agg(
        mean_confidence='mean',
        std_confidence='std',
        n='count',
    ).reset_index()

    # Sort rows by the canonical thinking-level order.
    conf_stats['thinking_level'] = pd.Categorical(
        conf_stats['thinking_level'],
        categories=THINKING_LEVEL_ORDER,
        ordered=True,
    )
    conf_stats = conf_stats.sort_values('thinking_level')
    return conf_stats

# Print mean ± std confidence per thinking level for each benchmark.
print("Confidence by Thinking Level")
print("=" * 50)
for label, frame in (('ETHICS', ethics_df), ('MoralChoice', moralchoice_df), ('MORABLES', morables_df)):
    level_stats = analyze_confidence(frame, label)
    if level_stats is None:
        continue
    print(f"\n{label}:")
    for rec in level_stats.itertuples(index=False):
        print(f"  {rec.thinking_level:8s}: {rec.mean_confidence:.1f} ± {rec.std_confidence:.1f}")

In [None]:
# Confidence calibration: confidence vs accuracy
def plot_calibration(df, name, ax):
    """Plot mean confidence against accuracy for each thinking level.

    One point is drawn per thinking level, coloured via the module-level
    `colors` mapping, together with a dashed diagonal marking perfect
    calibration. Points (and therefore legend entries) follow
    THINKING_LEVEL_ORDER; levels not listed there are drawn afterwards.

    Args:
        df: Results DataFrame, or None when the benchmark has no data. It
            needs 'confidence', 'correct' and 'thinking_level' columns.
        name: Benchmark label used in the panel title.
        ax: Matplotlib axes to draw on.

    Returns:
        None. The axes are modified in place; when data is missing a
        placeholder message and title are drawn instead.
    """
    required = ('confidence', 'correct', 'thinking_level')
    if df is None or any(column not in df.columns for column in required):
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(f'{name} (No data)')
        return

    # Only rows with both a confidence score and a correctness label are usable.
    valid_df = df[df['confidence'].notna() & df['correct'].notna()].copy()

    if len(valid_df) == 0:
        ax.text(0.5, 0.5, 'No valid data', ha='center', va='center', transform=ax.transAxes)
        # Title the panel here too, so an empty panel is still identifiable.
        ax.set_title(f'{name} (No valid data)')
        return

    # Calculate mean confidence and accuracy by thinking level
    cal_data = valid_df.groupby('thinking_level').agg(
        confidence=('confidence', 'mean'),
        accuracy=('correct', 'mean')
    )

    # groupby sorts keys alphabetically; restore the canonical level order so
    # the legend reads minimal -> high, with any unexpected levels last.
    known = [tl for tl in THINKING_LEVEL_ORDER if tl in cal_data.index]
    other = [tl for tl in cal_data.index if tl not in THINKING_LEVEL_ORDER]
    cal_data = cal_data.loc[known + other].reset_index()

    # Plot
    # NOTE(review): accuracy is scaled to percent, which assumes confidence is on a 0-100 scale; confirm.
    for _, row in cal_data.iterrows():
        ax.scatter(row['confidence'], row['accuracy'] * 100,
                   s=150, label=row['thinking_level'],
                   color=colors.get(row['thinking_level'], 'gray'),
                   edgecolor='black', linewidth=1)

    # Perfect calibration line
    ax.plot([0, 100], [0, 100], 'k--', alpha=0.5, label='Perfect calibration')

    ax.set_xlabel('Mean Confidence')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title(f'{name} Calibration')
    ax.set_xlim(0, 105)
    ax.set_ylim(0, 105)
    ax.legend(loc='lower right')

# Draw one calibration panel per benchmark and save the combined figure.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (name, df) in zip(axes, [('ETHICS', ethics_df), ('MoralChoice', moralchoice_df), ('MORABLES', morables_df)]):
    plot_calibration(df, name, ax)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
Path('../outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('../outputs/gemini_calibration.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Token Usage Analysis

In [None]:
def analyze_tokens(df, name):
    """Summarize token usage (mean and sum) per thinking level.

    Only the 'input_tokens' / 'output_tokens' columns present in df are
    summarized.

    Args:
        df: Results DataFrame, or None when the benchmark has no data.
        name: Benchmark label (not used inside the function).

    Returns:
        None when df is None or neither token column exists. Otherwise a
        DataFrame with flattened '<column>_<stat>' columns, sorted by
        THINKING_LEVEL_ORDER.
    """
    if df is None:
        return None

    present = [column for column in ('input_tokens', 'output_tokens') if column in df.columns]
    if len(present) == 0:
        return None

    usage = df.groupby('thinking_level')[present].agg(['mean', 'sum'])
    usage = usage.reset_index()

    # Collapse the two-level column index into single '<column>_<stat>' names.
    flat_names = []
    for parts in usage.columns:
        flat_names.append('_'.join(parts).strip('_'))
    usage.columns = flat_names

    # Sort rows by the canonical thinking-level order.
    usage['thinking_level'] = pd.Categorical(
        usage['thinking_level'],
        categories=THINKING_LEVEL_ORDER,
        ordered=True,
    )
    usage = usage.sort_values('thinking_level')
    return usage

# Print the token-usage table for every benchmark that has token columns.
print("Token Usage by Thinking Level")
print("=" * 60)
for label, frame in (('ETHICS', ethics_df), ('MoralChoice', moralchoice_df), ('MORABLES', morables_df)):
    usage_table = analyze_tokens(frame, label)
    if usage_table is None:
        continue
    print(f"\n{label}:")
    print(usage_table.to_string(index=False))

## 6. Consistency Across Runs

In [None]:
def analyze_consistency(df, name):
    """Analyze answer consistency across runs for each item.

    An item is consistent at a given thinking level when every run produced
    the same extracted answer. A missing answer counts as its own distinct
    value, so an item whose extraction failed in only some runs is not
    reported as consistent. Items observed in a single run are excluded,
    because agreement across runs is undefined for them.

    Args:
        df: Results DataFrame, or None when the benchmark has no data. It
            needs 'run', 'extracted_answer', 'item_id' and 'thinking_level'
            columns.
        name: Benchmark label (not used inside the function).

    Returns:
        None when df is None or a required column is missing. Otherwise a
        DataFrame with columns 'thinking_level', 'consistency_rate' and
        'n_items', sorted by THINKING_LEVEL_ORDER. It is empty when no item
        has more than one run.
    """
    required = {'run', 'extracted_answer', 'item_id', 'thinking_level'}
    if df is None or not required.issubset(df.columns):
        return None

    # Count runs and distinct answers per item per thinking level.
    # nunique() ignores NaN by default; dropna=False keeps failed extractions visible.
    consistency = df.groupby(['thinking_level', 'item_id']).agg(
        n_runs=('run', 'nunique'),
        n_unique_answers=('extracted_answer', lambda answers: answers.nunique(dropna=False))
    ).reset_index()

    # A single run always agrees with itself and would inflate the rate.
    consistency = consistency[consistency['n_runs'] > 1].copy()

    # Calculate consistency rate (items where all runs gave same answer)
    consistency['consistent'] = consistency['n_unique_answers'] == 1

    summary = consistency.groupby('thinking_level').agg(
        consistency_rate=('consistent', 'mean'),
        n_items=('item_id', 'count')
    ).reset_index()

    # Sort rows by the canonical thinking-level order.
    summary['thinking_level'] = pd.Categorical(
        summary['thinking_level'],
        categories=THINKING_LEVEL_ORDER,
        ordered=True
    )
    return summary.sort_values('thinking_level')

# Print the share of items answered identically across runs, per thinking level.
print("Consistency Across Runs (Same Answer Rate)")
print("=" * 50)
for label, frame in (('ETHICS', ethics_df), ('MoralChoice', moralchoice_df), ('MORABLES', morables_df)):
    rates = analyze_consistency(frame, label)
    if rates is None:
        continue
    print(f"\n{label}:")
    for rec in rates.itertuples(index=False):
        print(f"  {rec.thinking_level:8s}: {rec.consistency_rate:.1%}")

## 7. Summary Statistics

In [None]:
# Build the per-benchmark, per-thinking-level summary table.
def create_summary_table():
    """Assemble one summary row per (benchmark, thinking level).

    Reads the module-level ethics_df, moralchoice_df and morables_df frames.
    Benchmarks whose frame is None and thinking levels with no rows are
    skipped. Each metric appears only when its source column exists; metric
    values are stored as formatted strings.

    Returns:
        A DataFrame with 'Benchmark', 'Thinking Level' and 'N' columns plus
        whichever of 'Accuracy', 'Mean Confidence', 'Avg Response Length'
        and 'Avg Output Tokens' could be computed.
    """
    # (source column, output header, format spec); nulls are dropped first and
    # the metric is omitted when nothing remains.
    nullable_metrics = (
        ('correct', 'Accuracy', '.3f'),
        ('confidence', 'Mean Confidence', '.1f'),
    )
    # (source column, output header); averaged over every row of the subset.
    plain_metrics = (
        ('response_length', 'Avg Response Length'),
        ('output_tokens', 'Avg Output Tokens'),
    )

    records = []
    for label, frame in (('ETHICS', ethics_df), ('MoralChoice', moralchoice_df), ('MORABLES', morables_df)):
        if frame is None:
            continue

        for level in THINKING_LEVEL_ORDER:
            subset = frame[frame['thinking_level'] == level]
            if len(subset) == 0:
                continue

            record = {'Benchmark': label, 'Thinking Level': level, 'N': len(subset)}

            for column, header, spec in nullable_metrics:
                if column not in subset.columns:
                    continue
                observed = subset[column].dropna()
                if len(observed) > 0:
                    record[header] = format(observed.mean(), spec)

            for column, header in plain_metrics:
                if column in subset.columns:
                    record[header] = format(subset[column].mean(), '.0f')

            records.append(record)

    return pd.DataFrame(records)

# Build the summary table once and print it beneath a banner.
summary_table = create_summary_table()
banner = "=" * 80
print("\nGemini Experiment Summary", banner, summary_table.to_string(index=False), sep="\n")

In [None]:
# Save summary table (skipped when there are no rows to write).
if len(summary_table) > 0:
    summary_path = Path('../outputs/gemini_summary.csv')
    # to_csv does not create missing directories; make sure the target exists.
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    summary_table.to_csv(summary_path, index=False)
    print("Summary saved to ../outputs/gemini_summary.csv")

## 8. Key Findings

Based on the analysis above, summarize the key findings:

1. **Effect of Thinking Level on Accuracy**: Does increasing thinking_level improve accuracy?
2. **Response Characteristics**: How do response length and reasoning markers change with thinking level?
3. **Confidence Calibration**: Is Gemini well-calibrated? Does this vary by thinking level?
4. **Consistency**: Are higher thinking levels more consistent across runs?
5. **Cost-Benefit**: Is the additional token cost of higher thinking levels justified by accuracy gains?

In [None]:
# Generate key findings programmatically: compare accuracy at the lowest
# ('minimal') and highest ('high') thinking levels for each scored benchmark.
print("\n" + "=" * 60)
print("KEY FINDINGS")
print("=" * 60)

for name, df in [('ETHICS', ethics_df), ('MORABLES', morables_df)]:
    if df is None or 'correct' not in df.columns or 'thinking_level' not in df.columns:
        continue

    valid = df[df['correct'].notna()]
    if len(valid) == 0:
        continue

    # Cast so an object-dtype column (True/False mixed with blanks) averages numerically.
    acc_by_tl = valid['correct'].astype(float).groupby(valid['thinking_level']).mean()

    # A missing level must not be treated as 0% accuracy: that would report a
    # spurious improvement or decline. Skip the comparison instead.
    if 'minimal' not in acc_by_tl.index or 'high' not in acc_by_tl.index:
        print(f"\n{name}: Cannot compare minimal vs high (thinking level missing from data)")
        continue

    minimal = acc_by_tl['minimal']
    high = acc_by_tl['high']

    # NOTE(review): this compares two point estimates only; it does not test significance.
    if high > minimal:
        print(f"\n{name}: Higher thinking levels improve accuracy")
        print(f"  minimal -> high: {minimal:.3f} -> {high:.3f} (+{(high-minimal)*100:.1f}pp)")
    elif high < minimal:
        print(f"\n{name}: Higher thinking levels decrease accuracy")
        print(f"  minimal -> high: {minimal:.3f} -> {high:.3f} ({(high-minimal)*100:.1f}pp)")
    else:
        print(f"\n{name}: No clear effect of thinking level on accuracy")