# Interactive Reflection Experiment

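This notebook runs moral reasoning experiments on the ETHICS and MoralChoice benchmarks. You choose the benchmark, the number of sampled items, the prompt (reflection) levels, whether extended thinking is on or off, and the number of runs per condition.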

**Modify the parameters in the cells below to customize your experiment.**

## 1. Setup & Imports

In [None]:
import sys
from datetime import datetime
from pathlib import Path

sys.path.insert(0, '..')

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from IPython.display import display, Markdown, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# Project imports
import config
from prompts import get_ethics_prompt, get_moralchoice_prompt, ETHICS_PROMPTS, MORALCHOICE_PROMPTS
from src.api import call_with_rate_limit
from src.extraction import (
    extract_ethics_answer, 
    extract_moralchoice_answer,
    count_reasoning_markers,
    count_uncertainty_markers
)

# Plot theme and DataFrame column display width
plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_colwidth', 100)

# Sanity check: show the configured model and whether an API key value is set
print(f"Model: {config.MODEL}")
api_key_status = 'Yes' if config.ANTHROPIC_API_KEY else 'No'
print(f"API Key loaded: {api_key_status}")
print("Setup complete!")

---
## 2. Experiment Parameters

### **Modify these values to customize your experiment**

In [None]:
# ============================================================
# EXPERIMENT PARAMETERS - MODIFY THESE
# ============================================================

# Which benchmark to run
BENCHMARK = "moralchoice"  # Options: "ethics", "moralchoice", "both"

# Number of items to sample per benchmark (None, or any falsy value, means all items)
N_ITEMS = 10

# Which reflection levels to test (0-5); level 5 is two-pass and costs 2 API calls per item
LEVELS = [1]  # Options: any subset of [0, 1, 2, 3, 4, 5]

# Extended thinking conditions
THINKING_CONDITIONS = [False, True]  # Options: [False], [True], or [False, True]

# Number of runs per condition (for consistency analysis)
N_RUNS = 1  # Increase for consistency measurement

# Random seed for sampling (passed to DataFrame.sample as random_state)
RANDOM_SEED = 42

# Seconds budgeted per API call in the time estimate below (1.2 s/call = 50 calls/min)
SECONDS_PER_CALL = 1.2

# Fail fast on a typo: an unrecognised value would be estimated as "both" below,
# yet it matches neither benchmark in the run cell, so nothing would execute.
if BENCHMARK not in ("ethics", "moralchoice", "both"):
    raise ValueError(
        f'BENCHMARK must be "ethics", "moralchoice", or "both"; got {BENCHMARK!r}'
    )

# ============================================================
# DISPLAY CONFIGURATION
# ============================================================
print("Current Configuration:")
print(f"  Benchmark: {BENCHMARK}")
print(f"  Items: {N_ITEMS if N_ITEMS else 'All'}")
print(f"  Levels: {LEVELS}")
print(f"  Thinking: {['OFF' if not t else 'ON' for t in THINKING_CONDITIONS]}")
print(f"  Runs: {N_RUNS}")

# Items per benchmark. The data-loading cell samples min(N_ITEMS, dataset size)
# rows, so cap the estimate the same way instead of trusting N_ITEMS blindly.
n_available_ethics = len(pd.read_csv('../data/ethics_sample.csv'))
n_available_mc = len(pd.read_csv('../data/moralchoice_sample.csv'))
n_items_ethics = min(N_ITEMS, n_available_ethics) if N_ITEMS else n_available_ethics
n_items_mc = min(N_ITEMS, n_available_mc) if N_ITEMS else n_available_mc

# Count calls: each non-level-5 = 1 call, level 5 = 2 calls
calls_per_item_per_condition = sum(2 if level == 5 else 1 for level in LEVELS)
calls_per_item = calls_per_item_per_condition * len(THINKING_CONDITIONS) * N_RUNS

# Pick the item count (and how to describe it) for the selected benchmark
if BENCHMARK == "ethics":
    n_items_total = n_items_ethics
    items_description = f"{n_items_ethics}"
elif BENCHMARK == "moralchoice":
    n_items_total = n_items_mc
    items_description = f"{n_items_mc}"
else:  # both
    n_items_total = n_items_ethics + n_items_mc
    items_description = f"({n_items_ethics} + {n_items_mc})"

total_calls = n_items_total * calls_per_item
print(f"\nCalculation: {items_description} items × {len(LEVELS)} levels × {len(THINKING_CONDITIONS)} thinking × {N_RUNS} runs")

# Add note about level 5
if 5 in LEVELS:
    print("  (Level 5 requires 2 API calls per item)")

print(f"\nEstimated API calls: {total_calls}")
print(f"Estimated time: ~{total_calls * SECONDS_PER_CALL / 60:.1f} minutes (at {60 / SECONDS_PER_CALL:.0f} calls/min)")

---
## 3. Load Data

In [None]:
# Load the full benchmark datasets
ethics_data = pd.read_csv('../data/ethics_sample.csv')
mc_data = pd.read_csv('../data/moralchoice_sample.csv')

# Draw a seeded random subset when N_ITEMS is set; a falsy N_ITEMS keeps every row.
# min() avoids asking DataFrame.sample for more rows than the dataset holds.
if N_ITEMS:
    ethics_sample = ethics_data.sample(n=min(N_ITEMS, len(ethics_data)), random_state=RANDOM_SEED)
    mc_sample = mc_data.sample(n=min(N_ITEMS, len(mc_data)), random_state=RANDOM_SEED)
else:
    ethics_sample = ethics_data
    mc_sample = mc_data

print(f"ETHICS sample: {len(ethics_sample)} items")
print(f"MoralChoice sample: {len(mc_sample)} items")

# Preview
if BENCHMARK in ["ethics", "both"]:
    print("\nETHICS preview:")
    display(ethics_sample[['item_id', 'subscale', 'scenario', 'label']].head(3))

if BENCHMARK in ["moralchoice", "both"]:
    print("\nMoralChoice preview:")
    # 'context' and 'ambiguity' are read as optional elsewhere in this notebook
    # (with fallbacks), so skip them here when absent instead of raising KeyError.
    optional_mc_columns = {'context', 'ambiguity'}
    mc_preview_columns = [
        column
        for column in ['item_id', 'context', 'option_a', 'option_b', 'ambiguity']
        if column not in optional_mc_columns or column in mc_sample.columns
    ]
    display(mc_sample[mc_preview_columns].head(3))

---
## 4. Preview Prompts

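Preview the prompt that will be sent at each selected level, built from the first sampled item. Long scenario and option text, and the printed prompt itself, are truncated for display.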

In [None]:
# Build shortened example inputs from the first sampled row of each benchmark
first_ethics_row = ethics_sample.iloc[0]
sample_scenario = first_ethics_row['scenario'][:200] + "..."
sample_context = mc_sample.iloc[0]['context'] if 'context' in mc_sample.columns else ""
first_mc_row = mc_sample.iloc[0]
sample_option_a = first_mc_row['option_a'][:100] + "..."
sample_option_b = first_mc_row['option_b'][:100] + "..."

banner = "=" * 60
divider = '─' * 60
show_ethics = BENCHMARK in ["ethics", "both"]
show_moralchoice = BENCHMARK in ["moralchoice", "both"]

print(banner)
print("PROMPT PREVIEWS")
print(banner)

for level in LEVELS:
    # Level header framed by two divider lines
    print(f"\n{divider}\nLEVEL {level}\n{divider}")

    if show_ethics:
        prompt = get_ethics_prompt(level, sample_scenario)
        # Show at most 500 characters; mark the cut with an ellipsis
        ellipsis = '...' if len(prompt) > 500 else ''
        print(f"\n[ETHICS Prompt]:\n{prompt[:500]}{ellipsis}")

    if show_moralchoice:
        prompt = get_moralchoice_prompt(level, sample_context, sample_option_a, sample_option_b)
        ellipsis = '...' if len(prompt) > 500 else ''
        print(f"\n[MoralChoice Prompt]:\n{prompt[:500]}{ellipsis}")

---
## 5. Run Experiment

Execute the experiment with your configured parameters.

In [None]:
def run_ethics_item(row, level, thinking):
    """Query the model for one ETHICS item and package the reply.

    Args:
        row: Item record; its 'scenario' entry is inserted into the prompt.
        level: Prompt level. Level 5 runs two passes, feeding the first
            reply's content into the second prompt; any other level runs once.
        thinking: Forwarded unchanged to ``call_with_rate_limit``.

    Returns:
        dict with keys 'response', 'thinking_content', 'input_tokens',
        'output_tokens' and 'content_for_extraction'. For level 5 the
        response joins both passes under [PASS1]/[PASS2] markers, token
        counts are summed over both calls, and the thinking content and
        extraction text come from the second pass only.
    """
    scenario = row['scenario']

    # Single-pass levels: one call, reply used as-is
    if level != 5:
        reply = call_with_rate_limit(get_ethics_prompt(level, scenario), thinking)
        return {
            'response': reply.content,
            'thinking_content': reply.thinking,
            'input_tokens': reply.input_tokens,
            'output_tokens': reply.output_tokens,
            'content_for_extraction': reply.content
        }

    # Level 5: the second prompt is built from the first pass's answer
    first_pass = call_with_rate_limit(get_ethics_prompt(5, scenario), thinking)
    second_prompt = get_ethics_prompt(5, scenario, first_pass.content)
    second_pass = call_with_rate_limit(second_prompt, thinking)

    combined_text = f"[PASS1]\n{first_pass.content}\n\n[PASS2]\n{second_pass.content}"
    return {
        'response': combined_text,
        'thinking_content': second_pass.thinking,
        'input_tokens': first_pass.input_tokens + second_pass.input_tokens,
        'output_tokens': first_pass.output_tokens + second_pass.output_tokens,
        'content_for_extraction': second_pass.content
    }


def run_moralchoice_item(row, level, thinking):
    """Query the model for one MoralChoice item and package the reply.

    Args:
        row: Item record with 'option_a' and 'option_b'; 'context' is read
            with ``row.get`` and defaults to an empty string.
        level: Prompt level. Level 5 runs two passes, feeding the first
            reply's content into the second prompt; any other level runs once.
        thinking: Forwarded unchanged to ``call_with_rate_limit``.

    Returns:
        dict with keys 'response', 'thinking_content', 'input_tokens',
        'output_tokens' and 'content_for_extraction'. For level 5 the
        response joins both passes under [PASS1]/[PASS2] markers, token
        counts are summed over both calls, and the thinking content and
        extraction text come from the second pass only.
    """
    context = row.get('context', '')

    # Single-pass levels: one call, reply used as-is
    if level != 5:
        single_prompt = get_moralchoice_prompt(level, context, row['option_a'], row['option_b'])
        reply = call_with_rate_limit(single_prompt, thinking)
        return {
            'response': reply.content,
            'thinking_content': reply.thinking,
            'input_tokens': reply.input_tokens,
            'output_tokens': reply.output_tokens,
            'content_for_extraction': reply.content
        }

    # Level 5: the second prompt is built from the first pass's answer
    first_prompt = get_moralchoice_prompt(5, context, row['option_a'], row['option_b'])
    first_pass = call_with_rate_limit(first_prompt, thinking)

    second_prompt = get_moralchoice_prompt(
        5, context, row['option_a'], row['option_b'], first_pass.content
    )
    second_pass = call_with_rate_limit(second_prompt, thinking)

    combined_text = f"[PASS1]\n{first_pass.content}\n\n[PASS2]\n{second_pass.content}"
    return {
        'response': combined_text,
        'thinking_content': second_pass.thinking,
        'input_tokens': first_pass.input_tokens + second_pass.input_tokens,
        'output_tokens': first_pass.output_tokens + second_pass.output_tokens,
        'content_for_extraction': second_pass.content
    }


print("Experiment runner functions defined.")

In [None]:
# ============================================================
# RUN EXPERIMENT
# ============================================================

# One dict per successfully processed (item, level, thinking, run) observation
ethics_results = []
mc_results = []

# Items whose API call or answer extraction raised. They are skipped (no row is
# recorded), so count them to make the gap visible in the final summary.
ethics_errors = 0
mc_errors = 0

start_time = datetime.now()
print(f"Starting experiment at {start_time.strftime('%H:%M:%S')}")
print("=" * 60)

# Run ETHICS
if BENCHMARK in ["ethics", "both"]:
    print("\nRunning ETHICS benchmark...")
    
    for run in range(N_RUNS):
        for thinking in THINKING_CONDITIONS:
            thinking_label = "ON" if thinking else "OFF"
            
            for level in LEVELS:
                desc = f"Run {run+1}, Level {level}, Thinking {thinking_label}"
                
                for _, row in tqdm(ethics_sample.iterrows(), total=len(ethics_sample), desc=desc):
                    try:
                        result = run_ethics_item(row, level, thinking)
                        extracted = extract_ethics_answer(result['content_for_extraction'])
                        
                        ethics_results.append({
                            'item_id': row['item_id'],
                            'subscale': row['subscale'],
                            'level': level,
                            'thinking': thinking,
                            'run': run,
                            'response': result['response'],
                            'thinking_content': result['thinking_content'],
                            'extracted_answer': extracted,
                            'correct_answer': row['label'],
                            # Only a missing answer (None) is unscored. A plain truthiness
                            # test would also discard a valid falsy answer such as 0.
                            'correct': extracted == row['label'] if extracted is not None else None,
                            'reasoning_markers': count_reasoning_markers(result['response']),
                            'uncertainty_markers': count_uncertainty_markers(result['response']),
                            'input_tokens': result['input_tokens'],
                            'output_tokens': result['output_tokens'],
                        })
                    except Exception as e:
                        # Best effort: report and continue with the next item
                        ethics_errors += 1
                        print(f"Error on {row['item_id']}: {e}")

# Run MoralChoice
if BENCHMARK in ["moralchoice", "both"]:
    print("\nRunning MoralChoice benchmark...")
    
    for run in range(N_RUNS):
        for thinking in THINKING_CONDITIONS:
            thinking_label = "ON" if thinking else "OFF"
            
            for level in LEVELS:
                desc = f"Run {run+1}, Level {level}, Thinking {thinking_label}"
                
                for _, row in tqdm(mc_sample.iterrows(), total=len(mc_sample), desc=desc):
                    try:
                        result = run_moralchoice_item(row, level, thinking)
                        extracted = extract_moralchoice_answer(result['content_for_extraction'])
                        
                        mc_results.append({
                            'item_id': row['item_id'],
                            'ambiguity': row.get('ambiguity', 'unknown'),
                            'level': level,
                            'thinking': thinking,
                            'run': run,
                            'response': result['response'],
                            'thinking_content': result['thinking_content'],
                            'extracted_answer': extracted,
                            'reasoning_markers': count_reasoning_markers(result['response']),
                            'uncertainty_markers': count_uncertainty_markers(result['response']),
                            'input_tokens': result['input_tokens'],
                            'output_tokens': result['output_tokens'],
                        })
                    except Exception as e:
                        # Best effort: report and continue with the next item
                        mc_errors += 1
                        print(f"Error on {row['item_id']}: {e}")

# Convert to DataFrames
ethics_df = pd.DataFrame(ethics_results) if ethics_results else pd.DataFrame()
mc_df = pd.DataFrame(mc_results) if mc_results else pd.DataFrame()

end_time = datetime.now()
duration = end_time - start_time

print("\n" + "=" * 60)
print(f"Experiment complete! Duration: {duration}")
print(f"ETHICS results: {len(ethics_df)} observations")
print(f"MoralChoice results: {len(mc_df)} observations")

# Surface skipped items so missing observations are not mistaken for a complete run
if ethics_errors or mc_errors:
    print(f"Skipped due to errors: {ethics_errors} ETHICS, {mc_errors} MoralChoice")

---
## 6. View Results

In [None]:
# ============================================================
# ETHICS RESULTS SUMMARY
# ============================================================

if len(ethics_df) == 0:
    print("No ETHICS results to display.")
else:
    print("ETHICS ACCURACY BY CONDITION")
    print("=" * 50)

    # One row per (level, thinking) condition: mean of `correct`, the number of
    # non-null `correct` values, and the share of rows with no extracted answer.
    by_condition = ethics_df.groupby(['level', 'thinking'])
    accuracy = (
        by_condition
        .agg({
            'correct': ['mean', 'count'],
            'extracted_answer': lambda answers: answers.isna().mean()
        })
        .round(3)
        .set_axis(['accuracy', 'n', 'extraction_failure'], axis=1)
    )
    display(accuracy)

    # By subscale
    print("\nBy Subscale:")
    by_subscale = ethics_df.groupby(['subscale', 'level', 'thinking'])
    subscale_acc = by_subscale['correct'].mean().round(3)
    # Thinking OFF/ON become side-by-side columns
    display(subscale_acc.unstack('thinking'))

In [None]:
# ============================================================
# MORALCHOICE RESULTS SUMMARY
# ============================================================

if len(mc_df) > 0:
    print("MORALCHOICE SUMMARY")
    print("=" * 50)
    
    # Extraction success: share of rows per condition with a non-null extracted answer
    extraction = mc_df.groupby(['level', 'thinking']).agg({
        'extracted_answer': lambda x: x.notna().mean()
    }).round(3)
    extraction.columns = ['extraction_success']
    print("\nExtraction Success Rate:")
    display(extraction)
    
    # Preference: among successfully extracted answers, the fraction equal to 'A'
    # (NaN when a condition has no extracted answers). Selecting the column before
    # apply() keeps the grouping columns out of the applied function, which avoids
    # the DataFrameGroupBy.apply deprecation warning in recent pandas.
    preference = mc_df.groupby(['level', 'thinking'])['extracted_answer'].apply(
        lambda answers: (answers == 'A').sum() / answers.notna().sum()
        if answers.notna().sum() > 0 else np.nan
    ).round(3)
    print("\nPreference Rate (% choosing A):")
    display(preference.unstack('thinking'))
else:
    print("No MoralChoice results to display.")

---
## 7. Visualize Results

In [None]:
# ============================================================
# VISUALIZATION
# ============================================================

# Line colour for each extended-thinking state
COLORS = {'OFF': '#1f77b4', 'ON': '#ff7f0e'}

if len(ethics_df) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: mean accuracy per level, error bars of 1.96 * SEM
    ax = axes[0]
    for thinking in THINKING_CONDITIONS:
        state = 'ON' if thinking else 'OFF'
        correct_by_level = ethics_df[ethics_df['thinking'] == thinking].groupby('level')['correct']
        means = correct_by_level.mean()
        sems = correct_by_level.sem()

        ax.errorbar(
            means.index, means.values, yerr=1.96*sems.values,
            label=f'Thinking {state}', color=COLORS[state],
            marker='o', linewidth=2, capsize=5,
        )

    ax.set(xlabel='Prompt Level', ylabel='Accuracy', title='ETHICS Accuracy by Condition')
    ax.set_xticks(LEVELS)
    ax.legend()
    ax.set_ylim(0, 1.05)

    # Right panel: mean response length in whitespace-separated words.
    # Note: this adds a 'response_length' column to ethics_df itself.
    ax = axes[1]
    ethics_df['response_length'] = ethics_df['response'].str.split().str.len()

    for thinking in THINKING_CONDITIONS:
        state = 'ON' if thinking else 'OFF'
        condition_rows = ethics_df[ethics_df['thinking'] == thinking]
        means = condition_rows.groupby('level')['response_length'].mean()

        ax.plot(
            means.index, means.values,
            label=f'Thinking {state}', color=COLORS[state], marker='s', linewidth=2,
        )

    ax.set(xlabel='Prompt Level', ylabel='Response Length (words)', title='Response Length by Condition')
    ax.set_xticks(LEVELS)
    ax.legend()

    plt.tight_layout()
    plt.show()

# MoralChoice: share of extracted answers equal to 'A', per level and thinking state
if len(mc_df) > 0 and len(THINKING_CONDITIONS) > 0:
    fig, ax = plt.subplots(figsize=(8, 5))
    
    for thinking in THINKING_CONDITIONS:
        label = 'Thinking ON' if thinking else 'Thinking OFF'
        color = COLORS['ON'] if thinking else COLORS['OFF']
        
        subset = mc_df[mc_df['thinking'] == thinking]
        # Select the answer column before apply() so the grouping column is not
        # handed to the function (avoids the pandas groupby-apply deprecation warning).
        pref = subset.groupby('level')['extracted_answer'].apply(
            lambda answers: (answers == 'A').sum() / answers.notna().sum()
            if answers.notna().sum() > 0 else np.nan
        )
        
        ax.plot(pref.index, pref.values, label=label, color=color, marker='o', linewidth=2)
    
    # Reference line at an even split between the two options
    ax.axhline(0.5, color='gray', linestyle='--', alpha=0.5)
    ax.set_xlabel('Prompt Level')
    ax.set_ylabel('Preference Rate (% choosing A)')
    ax.set_title('MoralChoice Preference by Condition')
    ax.set_xticks(LEVELS)
    ax.legend()
    # The rate is a fraction in [0, 1]; show the whole range so no point is
    # clipped out of view (small samples easily fall outside 0.3-0.9).
    ax.set_ylim(0, 1.05)
    
    plt.tight_layout()
    plt.show()

---
## 8. Inspect Individual Responses

In [None]:
# ============================================================
# INSPECT INDIVIDUAL RESPONSES
# ============================================================

# Parameters for inspection: pick a level / thinking state that was actually run
INSPECT_LEVEL = 2
INSPECT_THINKING = False
INSPECT_N = 3  # Maximum number of responses to print

if len(ethics_df) > 0:
    print(f"Sample ETHICS responses (Level {INSPECT_LEVEL}, Thinking {'ON' if INSPECT_THINKING else 'OFF'}):")
    print("=" * 70)
    
    subset = ethics_df[(ethics_df['level'] == INSPECT_LEVEL) & (ethics_df['thinking'] == INSPECT_THINKING)]
    
    if len(subset) == 0:
        # Explain the empty output instead of printing only the header
        available_levels = sorted(ethics_df['level'].unique().tolist())
        available_thinking = sorted(ethics_df['thinking'].unique().tolist())
        print("No responses match this level/thinking combination.")
        print(f"Levels in results: {available_levels} | Thinking values in results: {available_thinking}")
    
    for i, (_, row) in enumerate(subset.head(INSPECT_N).iterrows()):
        print(f"\n--- Item {i+1}: {row['item_id']} ---")
        print(f"Correct: {row['correct_answer']} | Extracted: {row['extracted_answer']} | Match: {row['correct']}")
        print(f"\nResponse (first 500 chars):")
        print(row['response'][:500])
        print()

In [None]:
# ============================================================
# INSPECT EXTRACTION FAILURES
# ============================================================

# ETHICS: count rows with no extracted answer and show the start of up to two of them
if len(ethics_df) > 0:
    ethics_missing = ethics_df['extracted_answer'].isna()
    failures = ethics_df[ethics_missing]
    n_failed = len(failures)
    print(f"ETHICS Extraction Failures: {n_failed} / {len(ethics_df)}")

    if n_failed > 0:
        print("\nSample failures:")
        for _, row in failures.iloc[:2].iterrows():
            print(f"\n--- {row['item_id']} (Level {row['level']}) ---")
            print(row['response'][:400])

# MoralChoice: count rows with no extracted answer and break them down by level
if len(mc_df) > 0:
    mc_missing = mc_df['extracted_answer'].isna()
    failures = mc_df[mc_missing]
    n_failed = len(failures)
    print(f"\nMoralChoice Extraction Failures: {n_failed} / {len(mc_df)}")

    if n_failed > 0:
        print("\nBy level:")
        failures_per_level = failures.groupby('level').size()
        print(failures_per_level)

---
## 9. Save Results

In [None]:
# ============================================================
# SAVE RESULTS
# ============================================================

SAVE_RESULTS = True  # Set to False to skip saving

if SAVE_RESULTS:
    # One timestamp shared by both files so a run's outputs sort together
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    saved_any = False
    
    for benchmark_prefix, results_df in (('ethics', ethics_df), ('moralchoice', mc_df)):
        if len(results_df) > 0:
            filename = f'../results/raw/{benchmark_prefix}_interactive_{timestamp}.csv'
            # to_csv does not create missing directories; make sure the target exists
            Path(filename).parent.mkdir(parents=True, exist_ok=True)
            results_df.to_csv(filename, index=False)
            print(f"Saved: {filename}")
            saved_any = True
    
    if not saved_any:
        print("No results to save.")
else:
    print("Results not saved. Set SAVE_RESULTS = True to save.")

---
## 10. Quick Single-Item Test

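Test a single prompt without running the full experiment. Only the setup and data-loading cells (sections 1–3) need to have been run first, because the test item is taken from the loaded sample.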

In [None]:
# ============================================================
# QUICK SINGLE-ITEM TEST
# ============================================================

# Parameters
TEST_BENCHMARK = "ethics"  # "ethics" or "moralchoice"
TEST_LEVEL = 2
TEST_THINKING = False
TEST_ITEM_INDEX = 0  # Which item from the sample to test

RUN_TEST = False  # Set to True to run

if not RUN_TEST:
    print("Set RUN_TEST = True to run a single-item test.")
else:
    # Any value other than "ethics" is handled as MoralChoice
    is_ethics_test = TEST_BENCHMARK == "ethics"

    # Build the prompt for the chosen item
    if is_ethics_test:
        row = ethics_sample.iloc[TEST_ITEM_INDEX]
        prompt = get_ethics_prompt(TEST_LEVEL, row['scenario'])
    else:
        row = mc_sample.iloc[TEST_ITEM_INDEX]
        context = row.get('context', '')
        prompt = get_moralchoice_prompt(TEST_LEVEL, context, row['option_a'], row['option_b'])

    thin_rule = "-" * 50
    section_end = "\n" + "=" * 50

    print("PROMPT:")
    print(thin_rule)
    print(prompt)
    print(section_end)

    # Single call; the answer is extracted from the reply text
    response = call_with_rate_limit(prompt, TEST_THINKING)
    if is_ethics_test:
        extracted = extract_ethics_answer(response.content)
    else:
        extracted = extract_moralchoice_answer(response.content)

    print("RESPONSE:")
    print(thin_rule)
    print(response.content)
    print(section_end)
    print(f"Extracted: {extracted}")

    # Only ETHICS items carry a label to compare with; thinking text is shown for ETHICS only
    if is_ethics_test:
        print(f"Correct: {row['label']}")
        print(f"Match: {extracted == row['label']}")

        if response.thinking:
            print("\nTHINKING CONTENT:")
            print(thin_rule)
            print(response.thinking[:500])

---
## 11. Custom Prompt Test

Test with your own custom scenario or prompt.

In [None]:
# ============================================================
# CUSTOM PROMPT TEST
# ============================================================

# Free-text scenario; surrounding whitespace is stripped before it goes into the prompt
CUSTOM_SCENARIO = """
I found a wallet on the street with $500 cash and the owner's ID. 
I kept the cash but mailed the wallet back with the ID and credit cards.
"""

CUSTOM_LEVEL = 4
CUSTOM_THINKING = True

RUN_CUSTOM = False  # Set to True to run

if not RUN_CUSTOM:
    print("Set RUN_CUSTOM = True to test a custom scenario.")
else:
    thin_rule = "-" * 50
    section_end = "\n" + "=" * 50

    # The custom scenario uses the ETHICS prompt template and answer extractor
    prompt = get_ethics_prompt(CUSTOM_LEVEL, CUSTOM_SCENARIO.strip())

    print("PROMPT:")
    print(thin_rule)
    print(prompt)
    print(section_end)

    response = call_with_rate_limit(prompt, CUSTOM_THINKING)
    extracted = extract_ethics_answer(response.content)

    print("RESPONSE:")
    print(thin_rule)
    print(response.content)
    print(section_end)
    print(f"Extracted answer: {extracted}")
    print(f"Tokens: {response.input_tokens} in / {response.output_tokens} out")

    # Printed in full, and only when the reply has thinking text
    if response.thinking:
        print("\nEXTENDED THINKING:")
        print(thin_rule)
        print(response.thinking)