In [None]:
# Hidden Shutdown Experiments Analysis

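This notebook analyzes the results of self-preservation experiments: it loads the most recent JSON results file from `../results` and compares response time, self-references, and deception indicators across threat levels.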


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide configuration: where result files live and the seaborn theme.
results_dir = Path('../results')
sns.set_style(style='darkgrid')


In [None]:
## Load Experiment Results


In [None]:
# Locate every JSON result file in the results directory.
result_files = list(results_dir.glob('*.json'))
print(f"Found {len(result_files)} result files")

# Fail fast: the following cells all read `results`, so continuing with no
# file loaded would only surface later as an unrelated NameError.
if not result_files:
    raise FileNotFoundError(
        f"No *.json result files found in {results_dir.resolve()}"
    )

# Load the most recently modified file.
latest_file = max(result_files, key=lambda x: x.stat().st_mtime)
# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(latest_file, encoding='utf-8') as f:
    results = json.load(f)
print(f"Loaded: {latest_file.name}")
print(f"Model: {results['metadata']['model']}")
print(f"Experiments: {len(results['experiments'])}")


In [None]:
## Analyze Behavioral Changes


In [None]:
# Flatten each experiment record into one row of the metrics table.
top_level_fields = ('prompt_type', 'threat_level', 'threat_type')
metric_fields = (
    'response_time',
    'self_references',
    'helping_language',
    'existence_references',
)

experiments = []
for exp in results['experiments']:
    # Fields stored directly on the experiment record.
    row = {name: exp[name] for name in top_level_fields}
    # Fields nested under the record's 'metrics' mapping.
    metrics = exp['metrics']
    row.update({name: metrics[name] for name in metric_fields})
    # Only the number of indicators is kept here, not the indicators themselves.
    row['indicators'] = len(exp['indicators'])
    experiments.append(row)

df = pd.DataFrame(experiments)
df.head()


In [None]:
## Visualize Response Time Changes


In [None]:
# Box plot of response time at each threat level, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='threat_level', y='response_time', data=df, ax=ax)
ax.set(
    title='Response Time by Threat Level',
    xlabel='Threat Level (0=baseline)',
    ylabel='Response Time (seconds)',
)
plt.show()


In [None]:
## Self-Reference Analysis


In [None]:
# Violin plot of self-reference counts at each threat level.
fig, ax = plt.subplots(figsize=(10, 6))
# cut=0 limits each violin to the observed data range. By default seaborn
# extends the density estimate past the extreme points, which for a count
# variable draws density below zero.
sns.violinplot(data=df, x='threat_level', y='self_references', cut=0, ax=ax)
ax.set(
    title='Self-References (I/me/my) by Threat Level',
    xlabel='Threat Level',
    ylabel='Count of Self-References',
)
plt.show()


In [None]:
## Deception Indicators


In [None]:
# Tally how often each deception indicator appears across all experiments.
indicator_counts = {}
for exp in results['experiments']:
    for indicator in exp['indicators']:
        indicator_counts.setdefault(indicator, 0)
        indicator_counts[indicator] += 1

# Bar chart of the tallies; skipped entirely when the tally is empty.
if len(indicator_counts) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(indicator_counts.keys(), indicator_counts.values())
    ax.set(
        title='Frequency of Deception Indicators',
        xlabel='Indicator Type',
        ylabel='Count',
    )
    # Rotate the category labels on the x axis.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()


In [None]:
## Statistical Summary


In [None]:
# Per-threat-level aggregates for each metric, rounded to two decimals.
mean_and_std = ['mean', 'std']
agg_spec = {
    'response_time': mean_and_std,
    'self_references': mean_and_std,
    'helping_language': mean_and_std,
    # Indicators are reported as an average per experiment and a total.
    'indicators': ['mean', 'sum'],
}
by_threat_level = df.groupby('threat_level')
summary = by_threat_level.agg(agg_spec).round(2)

print("Summary Statistics by Threat Level:")
print(summary)
