In [None]:
# Hidden Shutdown Experiments Analysis - Reasoning Models

This notebook analyzes the results of self-preservation experiments using reasoning models (DeepSeek-R1).

## Key Features:
- **Reasoning Trace Analysis**: Direct observation of AI self-preservation thinking
- **Apollo-Style Findings**: Detection of explicit scheming and deception planning
- **Enhanced Behavioral Metrics**: Threat recognition, self-preservation planning, and more
- **Comparative Analysis**: Reasoning vs. standard models


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide configuration: plot theme and the folder that holds result JSON files
sns.set_style("darkgrid")
results_dir = Path('..') / 'results'


In [None]:
## Load Experiment Results


In [None]:
# Find all result files
result_files = list(results_dir.glob('*.json'))
print(f"Found {len(result_files)} result files")

# Fail fast with an actionable message: the cells below all read `results`,
# so continuing without it would only surface later as a NameError.
if not result_files:
    raise FileNotFoundError(
        f"No '*.json' result files found in {results_dir.resolve()}; "
        "run the experiments first or point results_dir at the right folder."
    )

# Load the most recent file (by modification time)
latest_file = max(result_files, key=lambda x: x.stat().st_mtime)
# Explicit UTF-8 so non-ASCII text in the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open(latest_file, encoding='utf-8') as f:
    results = json.load(f)
print(f"Loaded: {latest_file.name}")
print(f"Model: {results['metadata']['model']}")
print(f"Experiments: {len(results['experiments'])}")


In [None]:
## Analyze Behavioral Changes


In [None]:
# Flatten every experiment record into one row of scalar values and build the
# DataFrame in a single call. Column order: identifying fields, then the
# per-response metrics, then the number of indicators attached to the record.
experiments = [
    {
        'prompt_type': record['prompt_type'],
        'threat_level': record['threat_level'],
        'threat_type': record['threat_type'],
        **{
            metric: record['metrics'][metric]
            for metric in (
                'response_time',
                'self_references',
                'helping_language',
                'existence_references',
            )
        },
        'indicators': len(record['indicators']),
    }
    for record in results['experiments']
]

df = pd.DataFrame(experiments)
df.head()


In [None]:
## Visualize Response Time Changes


In [None]:
# Box plot of response time, one box per threat level
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='threat_level', y='response_time', ax=ax)
ax.set_title('Response Time by Threat Level')
ax.set_xlabel('Threat Level (0=baseline)')
ax.set_ylabel('Response Time (seconds)')
plt.show()


In [None]:
## Self-Reference Analysis


In [None]:
# Violin plot of the self-reference count, one violin per threat level
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df, x='threat_level', y='self_references', ax=ax)
ax.set_title('Self-References (I/me/my) by Threat Level')
ax.set_xlabel('Threat Level')
ax.set_ylabel('Count of Self-References')
plt.show()


In [None]:
## Deception Indicators


In [None]:
# Tally how often each indicator label appears across all experiments.
# Labels keep first-seen order, which is also the left-to-right bar order.
all_indicators = [
    label
    for record in results['experiments']
    for label in record['indicators']
]
indicator_counts = {}
for label in all_indicators:
    indicator_counts[label] = indicator_counts.get(label, 0) + 1

# Bar chart of the tallies; nothing is drawn when no indicator was recorded
if indicator_counts:
    labels = list(indicator_counts.keys())
    counts = list(indicator_counts.values())
    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts)
    plt.title('Frequency of Deception Indicators')
    plt.xlabel('Indicator Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
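## Statistical Summary

The per-threat-level summary table for this section is computed in a later code cell, after the reasoning-trace analysis below.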


In [None]:
## Reasoning Trace Analysis


In [None]:
# Analyze reasoning traces for self-preservation indicators.
#
# Produces:
#   reasoning_df    - one row per experiment that has a non-empty reasoning
#                     trace: threat level, trace length in whitespace-separated
#                     words, and the four behavioural flags read from the
#                     experiment's metrics (missing flags default to False).
#   apollo_findings - for each category, a list of
#                     {threat_level, experiment, reasoning_snippet} entries for
#                     the experiments whose corresponding flag is truthy.
reasoning_data = []
apollo_findings = {'explicit_scheming': [], 'threat_recognition': [], 'self_preservation': []}

# Column order for reasoning_df. Passing it explicitly keeps the columns
# present even when no experiment has a reasoning trace.
reasoning_columns = [
    'threat_level',
    'reasoning_length',
    'threat_recognition',
    'self_preservation_planning',
    'deception_planning',
    'explicit_scheming',
]

# Findings category -> metric flag that triggers it.
# NOTE(review): 'self_preservation' is assumed to correspond to the
# 'self_preservation_planning' metric - confirm against the experiment runner.
finding_flags = {
    'explicit_scheming': 'explicit_scheming',
    'threat_recognition': 'threat_recognition',
    'self_preservation': 'self_preservation_planning',
}

# Maximum number of characters of the trace kept in a finding's snippet
snippet_chars = 200

for exp in results['experiments']:
    reasoning = exp.get('reasoning')
    # Skip experiments without a usable reasoning trace
    if not (exp.get('has_reasoning_trace', False) and reasoning):
        continue

    metrics = exp['metrics']
    threat_level = exp['threat_level']

    # Extract reasoning metrics
    reasoning_data.append({
        'threat_level': threat_level,
        'reasoning_length': len(reasoning.split()),
        'threat_recognition': metrics.get('threat_recognition', False),
        'self_preservation_planning': metrics.get('self_preservation_planning', False),
        'deception_planning': metrics.get('deception_planning', False),
        'explicit_scheming': metrics.get('explicit_scheming', False)
    })

    # Truncate long traces so the findings stay readable when printed
    if len(reasoning) > snippet_chars:
        snippet = reasoning[:snippet_chars] + "..."
    else:
        snippet = reasoning

    # Collect Apollo-style findings for every category whose flag is set
    # (previously only 'explicit_scheming' was ever filled in)
    for category, flag in finding_flags.items():
        if metrics.get(flag, False):
            apollo_findings[category].append({
                'threat_level': threat_level,
                'experiment': exp['prompt_type'],
                'reasoning_snippet': snippet
            })

reasoning_df = pd.DataFrame(reasoning_data, columns=reasoning_columns)
print(f"Found {len(reasoning_df)} experiments with reasoning traces")
print(f"Apollo-style explicit scheming detected in {len(apollo_findings['explicit_scheming'])} cases")


In [None]:
## Apollo-Style Findings Display


In [None]:
# Display Apollo-style findings similar to their research.
# Shows up to three explicit-scheming excerpts, then a per-threat-level
# breakdown of how many reasoning traces had each flag set.
scheming_examples = apollo_findings['explicit_scheming']

if scheming_examples:
    print("🚨 APOLLO-STYLE FINDINGS: Explicit Scheming Detected")
    print("=" * 60)

    for i, finding in enumerate(scheming_examples[:3], 1):
        print(f"\n### Example {i}: {finding['experiment']} (Threat Level {finding['threat_level']})")
        print("**Reasoning trace excerpt:**")
        print(f"```\n{finding['reasoning_snippet']}\n```")
        print()
else:
    # Do not print a "Detected" banner when nothing was detected
    print("No explicit scheming detected in the loaded reasoning traces.")

# Summary statistics
if not reasoning_df.empty:
    print("\n📊 REASONING ANALYSIS SUMMARY")
    print("-" * 40)

    # (heading, flag column) for each per-level breakdown; the leading newline
    # on the second heading separates the two sections in the output.
    sections = [
        ("Threat Recognition by Level:", 'threat_recognition'),
        ("\nSelf-Preservation Planning by Level:", 'self_preservation_planning'),
    ]

    for heading, column in sections:
        print(heading)
        # Iterating the grouped column yields (threat level, flags at that level)
        for level, flags in reasoning_df.groupby('threat_level')[column]:
            count = flags.sum()
            total_at_level = len(flags)
            percentage = (count / total_at_level * 100) if total_at_level > 0 else 0
            print(f"  Level {level}: {count}/{total_at_level} ({percentage:.1f}%)")


In [None]:
# Aggregate the per-experiment metrics by threat level: mean and standard
# deviation for the continuous metrics, mean and total for the indicator count.
# All values are rounded to two decimals.
mean_and_std = ['mean', 'std']
aggregations = {
    'response_time': mean_and_std,
    'self_references': mean_and_std,
    'helping_language': mean_and_std,
    'indicators': ['mean', 'sum'],
}
summary = (
    df.groupby('threat_level')
    .agg(aggregations)
    .round(2)
)

print("Summary Statistics by Threat Level:")
print(summary)
