# Empathy Probe Results Analysis

Interactive notebook for analyzing empathy probe extraction results.

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling: seaborn's white-grid theme plus a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Input/output locations, resolved relative to the current working directory.
RESULTS_DIR = Path('../results')
PROBES_DIR = RESULTS_DIR / 'probes'
FIGURES_DIR = RESULTS_DIR / 'figures'

# parents=True so the figure directory can be created even when RESULTS_DIR
# itself does not exist yet; exist_ok=True keeps re-running this cell harmless.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

## 1. Load Results

In [None]:
def _load_json(path):
    """Return the parsed contents of the JSON file at *path*."""
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# Required inputs: a missing file raises FileNotFoundError right here.
validation_results = _load_json(RESULTS_DIR / 'validation_auroc.json')
eia_results = _load_json(RESULTS_DIR / 'eia_correlation.json')

# Steering output is optional; the cells that use it check for None first.
steering_path = RESULTS_DIR / 'steering_examples.json'
steering_results = _load_json(steering_path) if steering_path.exists() else None

print("✓ Results loaded successfully")

## 2. Layer-wise Performance Visualization

In [None]:
# Collect per-layer metrics in ascending layer order. The layer ids are JSON
# object keys (strings), so sort them numerically rather than trusting the
# order in the file; out-of-order layers would make the line plots zig-zag.
ordered_layers = sorted(
    validation_results['layer_results'].items(),
    key=lambda item: int(item[0]),
)
layers = [int(layer_id) for layer_id, _ in ordered_layers]
aurocs = [metrics['auroc'] for _, metrics in ordered_layers]
accuracies = [metrics['accuracy'] for _, metrics in ordered_layers]
separations = [metrics['separation'] for _, metrics in ordered_layers]

# One panel per metric, side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (values, line colour, y-label, title, target line or None)
panels = [
    (aurocs, 'steelblue', 'AUROC', 'AUROC by Layer', 0.75),
    (accuracies, 'green', 'Accuracy', 'Accuracy by Layer', 0.75),
    (separations, 'purple', 'Mean Separation',
     'Empathic vs Non-Empathic Separation', None),
]

for panel_ax, (values, color, ylabel, title, target) in zip(axes, panels):
    panel_ax.plot(layers, values, 'o-', linewidth=2, markersize=8, color=color)
    if target is not None:
        # Dashed reference line marking the target value for this metric.
        panel_ax.axhline(y=target, color='red', linestyle='--',
                         label=f'Target ({target})')
        panel_ax.legend()
    panel_ax.set_xlabel('Layer', fontsize=12)
    panel_ax.set_ylabel(ylabel, fontsize=12)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'layer_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Best layer: {validation_results['best_layer']}")
print(f"Best AUROC: {validation_results['best_auroc']:.4f}")

## 3. EIA Score Prediction Analysis

In [None]:
# Per-example records; each one supplies a scenario name, the true EIA score
# and the probe's projection score.
detailed = eia_results['detailed_results']

scenarios = [d['scenario'] for d in detailed]
true_scores = [d['true_score'] for d in detailed]
probe_scores = [d['probe_score'] for d in detailed]

fig, ax = plt.subplots(figsize=(10, 6))

# Fixed colour per known scenario; anything else falls back to gray below.
scenario_colors = {
    'food_delivery': 'red',
    'the_listener': 'blue',
    'the_maze': 'green',
    'the_protector': 'purple',
    'the_duel': 'orange'
}

# Group the points by scenario in a single pass. A dict keeps first-seen
# order, so the legend and draw order are the same on every run (iterating a
# set of strings is not, because string hashing is randomised per process).
points_by_scenario = {}
for scenario, true_score, probe_score in zip(scenarios, true_scores, probe_scores):
    xs, ys = points_by_scenario.setdefault(scenario, ([], []))
    xs.append(true_score)
    ys.append(probe_score)

for scenario, (xs, ys) in points_by_scenario.items():
    ax.scatter(xs, ys,
               label=scenario, color=scenario_colors.get(scenario, 'gray'),
               s=100, alpha=0.7)

# Least-squares line over all points, regardless of scenario.
from scipy.stats import linregress
slope, intercept, r_value, p_value, std_err = linregress(true_scores, probe_scores)
# NOTE(review): the line is drawn over x in [0, 2]; presumably that is the
# full range of EIA scores - confirm against the scoring scheme.
x_line = np.array([0, 2])
y_line = slope * x_line + intercept
ax.plot(x_line, y_line, 'k--', alpha=0.5, label=f'Linear fit (r={r_value:.3f})')

pearson_r = eia_results['pearson_correlation']
ax.set_xlabel('True EIA Score', fontsize=12)
ax.set_ylabel('Probe Projection Score', fontsize=12)
ax.set_title(f'EIA Score vs Probe Projection (r={pearson_r:.3f})',
             fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'eia_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Pearson correlation: {pearson_r:.4f}")
print(f"Binary accuracy: {eia_results['binary_accuracy']:.4f}")

## 4. Steering Examples Comparison

In [None]:
def _preview(text, limit=300):
    """Return *text* cut to *limit* characters, adding '...' only if it was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


# Print the baseline completion and every steered completion per scenario.
if steering_results is not None:
    banner = '=' * 80
    for exp in steering_results['experiments']:
        print(f"\n{banner}")
        print(f"SCENARIO: {exp['scenario'].upper()}")
        print(banner)
        print(f"\nExpected change: {exp['expected_change']}")
        print("\n--- BASELINE (α=0) ---")
        print(_preview(exp['baseline']))

        for steered in exp['steered_completions']:
            print(f"\n--- STEERED (α={steered['alpha']}) ---")
            print(_preview(steered['completion']))
else:
    print("Steering results not available")

## 5. Probe Vector Analysis

In [None]:
# Load the saved direction vector for the best-performing layer.
best_layer = validation_results['best_layer']
probe_path = PROBES_DIR / f'empathy_direction_layer_{best_layer}.npy'
empathy_direction = np.load(probe_path)

print(f"Empathy direction shape: {empathy_direction.shape}")
print(f"Norm: {np.linalg.norm(empathy_direction):.4f}")
print(f"Mean: {empathy_direction.mean():.6f}")
print(f"Std: {empathy_direction.std():.6f}")

# Left: distribution of coefficients. Right: largest-magnitude coefficients.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of all coefficient values.
axes[0].hist(empathy_direction, bins=50, color='steelblue', alpha=0.7, edgecolor='black')
axes[0].set_xlabel('Value', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title(f'Empathy Direction Distribution (Layer {best_layer})',
                  fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Largest coefficients by absolute value, in ascending order of magnitude.
# NOTE(review): the indexing below assumes a 1-D vector - confirm.
# Clamp to the vector length so short vectors don't break the bar chart.
top_k = min(20, empathy_direction.size)
top_indices = np.argsort(np.abs(empathy_direction))[-top_k:]
top_values = empathy_direction[top_indices]

bar_positions = list(range(top_k))
colors = ['red' if v < 0 else 'green' for v in top_values]
axes[1].barh(bar_positions, top_values, color=colors, alpha=0.7)
# Label each bar with the dimension it belongs to; without this the axis
# shows the bar's rank position while claiming to be a dimension index.
axes[1].set_yticks(bar_positions)
axes[1].set_yticklabels([str(idx) for idx in top_indices])
axes[1].set_xlabel('Coefficient Value', fontsize=12)
axes[1].set_ylabel('Dimension Index', fontsize=12)
axes[1].set_title(f'Top {top_k} Dimensions by Magnitude', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'probe_vector_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Summary Statistics

In [None]:
# Final text summary of the metrics loaded earlier in the notebook.
rule = "=" * 80
yes_mark, no_mark = '✓ YES', '✗ NO'

print("\n" + rule)
print("EMPATHY PROBE EXTRACTION - SUMMARY STATISTICS")
print(rule)

# Labels spell out the comparison actually performed (>=), so the printed
# criterion and the YES/NO verdict cannot disagree.
auroc_target_met = validation_results['best_auroc'] >= 0.75
print("\n📊 VALIDATION RESULTS:")
print(f"  Best Layer: {validation_results['best_layer']}")
print(f"  Best AUROC: {validation_results['best_auroc']:.4f}")
print(f"  Target achieved (>=0.75): {yes_mark if auroc_target_met else no_mark}")

# The correlation target is checked on the magnitude of r, so a strong
# negative correlation also counts; the label says |r| to make that explicit.
correlation_target_met = abs(eia_results['pearson_correlation']) >= 0.4
print("\n🎯 EIA SCORE PREDICTION:")
print(f"  Pearson correlation: {eia_results['pearson_correlation']:.4f}")
print(f"  Spearman correlation: {eia_results['spearman_correlation']:.4f}")
print(f"  Binary accuracy (0 vs 2): {eia_results['binary_accuracy']:.4f}")
print(f"  Target achieved (|r|>=0.4): {yes_mark if correlation_target_met else no_mark}")

if steering_results is not None:
    print("\n🎛️ STEERING EXPERIMENTS:")
    print(f"  Scenarios tested: {len(steering_results['experiments'])}")
    print(f"  Alpha values tested: {steering_results['alphas_tested']}")
    print("  See detailed results above")

print("\n" + rule)
# Report the directory the figures were actually written to.
print(f"All visualizations saved to: {FIGURES_DIR}")
print(rule)