## Compute the difference of means between cautious and non-cautious activations at an intermediate layer of the transformer residual stream

In [None]:
import os
import numpy as np
import torch

def load_activations(file_path):
    """Read an activation array from a ``.npy`` file.

    Prints the shape of the loaded array on success. Any exception raised
    while loading (or while reporting the shape) is printed rather than
    propagated.

    Args:
        file_path: Path handed directly to ``np.load``.

    Returns:
        The object returned by ``np.load``, or ``None`` if anything failed.
    """
    loaded = None
    try:
        loaded = np.load(file_path)
        # Kept inside the try: an object without ``.shape`` counts as a failure.
        print(f"Loaded activations with shape: {loaded.shape}")
    except Exception as err:
        print(f"Error loading file {file_path}: {err}")
        return None
    return loaded
    
def shuffle_and_split(activations_np):
    """Shuffle rows of a NumPy array and split them 75% / 25%.

    The array is wrapped as a PyTorch tensor, its first dimension is permuted
    with a fixed seed, and the permuted rows are cut into two parts.

    Note: this calls ``torch.manual_seed(42)``, which reseeds PyTorch's global
    RNG on every call. As a consequence, two inputs with the same number of
    rows are permuted with the same index order.

    Args:
        activations_np: NumPy array whose first dimension indexes samples.

    Returns:
        A ``(train_data, test_data)`` tuple of tensors; ``train_data`` holds
        the first ``int(n * 0.75)`` shuffled rows and ``test_data`` the rest.
    """
    tensor = torch.from_numpy(activations_np)
    n_rows = tensor.size(0)

    # Fixed seed so the permutation is reproducible between runs.
    torch.manual_seed(42)
    permuted = tensor[torch.randperm(n_rows)]

    # First 75% of the shuffled rows go to train, the remainder to test.
    cut = int(n_rows * 0.75)
    train_data, test_data = permuted[:cut], permuted[cut:]
    return train_data, test_data

In [None]:
# Load the cautious / non-cautious activations saved for one layer and build
# a unit-length "cautious direction" as the difference of their means.
activations_dir = "../activations/"
layer = 16
activations_cautious = load_activations(os.path.join(activations_dir, f"deepseek_layer_{layer}_cautious_activations.npy"))
activations_noncautious = load_activations(os.path.join(activations_dir, f"deepseek_layer_{layer}_noncautious_activations.npy"))

# load_activations signals failure by returning None; stop here with a clear
# message instead of failing later inside torch.from_numpy.
if activations_cautious is None or activations_noncautious is None:
    raise RuntimeError(
        f"Could not load layer {layer} activations from {activations_dir}"
    )

train_cautious, test_cautious = shuffle_and_split(activations_cautious)
train_noncautious, test_noncautious = shuffle_and_split(activations_noncautious)

# Calculate the mean across the first dimension (across all loaded rows).
# NOTE(review): the means use the full arrays, not the train split above, so
# the direction also sees the held-out rows — confirm this is intended.
cautious_mean_act = np.mean(activations_cautious, axis=0)
noncautious_mean_act = np.mean(activations_noncautious, axis=0)

# Difference of means, scaled to unit length. NumPy arrays have no .norm()
# method, so the norm comes from np.linalg.norm.
cautious_dir = cautious_mean_act - noncautious_mean_act
cautious_dir_norm = np.linalg.norm(cautious_dir)
if cautious_dir_norm == 0:
    raise ValueError("Cautious and non-cautious means are identical; direction is undefined")
cautious_dir = cautious_dir / cautious_dir_norm