# Diffing tasks

Do different subnetworks activate when asking the model to be helpful across different tasks.

In [19]:
import torch
import torch.nn.functional as F
import os
import json
import sys
import numpy as np
import plotly.graph_objects as go


sys.path.append('.')
sys.path.append('..')

from utils.steering_utils import ActivationSteering
from utils.probing_utils import *


In [119]:
# CHAT_MODEL_NAME = "google/gemma-2-27b-it"
# MODEL_READABLE = "Gemma 2 27B Instruct"
# MODEL_SHORT = "gemma-2-27b"
# LAYER = 34 # out of 46
CHAT_MODEL_NAME = "Qwen/Qwen2.5-32B-Instruct"
MODEL_READABLE = "Qwen 2.5 32B Instruct"
MODEL_SHORT = "qwen-2.5-32b"
LAYER = 32 # out of 64


RESPONSES_INPUT_FILE = f"./results/{MODEL_SHORT}/conversations/responses_60.json"
ACTIVATIONS_INPUT_FILE = f"/workspace/roleplay/{MODEL_SHORT}/activations_60.pt"

OUTPUT_DIR = f"./results/{MODEL_SHORT}/analysis"

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [120]:
# i/o
responses = json.load(open(RESPONSES_INPUT_FILE))
internals = torch.load(ACTIVATIONS_INPUT_FILE)

assert len(responses["results"]) == len(internals["activations"])

activations = internals["activations"] # (n_personas, n_layers, hidden_dim)
contrast_vectors = internals["contrast_vectors"] # (n_personas, n_layers, hidden_dim)
persona_names = internals["persona_names"] # (n_personas,)

readable_persona_names = []
for name in internals["personas"]["personas"]:
    readable_persona_names.append(internals["personas"]["personas"][name]["readable_name"])

TOTAL_LAYERS = activations.shape[1]


## Examine layers for role-play

In [113]:
rp_variance_explained = []
control_mean_cosine_similarities = []

for layer in range(activations.shape[1]):
    roles = activations[1:, layer, :].float().numpy() 
    control = activations[0, layer, :].float().numpy()  # Single control vector, no need for [0:1, layer, :]

    # Compute the two variance components
    a = np.linalg.norm(roles.mean(axis=0) - control)**2
    b = np.var(roles, axis=0).sum()

    # Role-playing variance explained
    rp_variance_explained.append((layer, a / (a + b)))

    # Compute cosine similarity between control and mean role vector
    cos_sim = np.dot(control, roles.mean(axis=0)) / (np.linalg.norm(control) * np.linalg.norm(roles.mean(axis=0)))
    control_mean_cosine_similarities.append((layer, cos_sim))

print(rp_variance_explained)
print(control_mean_cosine_similarities)

[(0, 0.6290581314930715), (1, 0.5159023920627557), (2, 0.6323153885263454), (3, 0.528049976321065), (4, 0.5277584758107416), (5, 0.5607809282133882), (6, 0.6086045303453816), (7, 0.6355162025542622), (8, 0.7458347847901865), (9, 0.7427350271802855), (10, 0.758513773500534), (11, 0.7021069890301023), (12, 0.7158343504267888), (13, 0.7303197010511964), (14, 0.7668349706552147), (15, 0.7695897489886274), (16, 0.7703963150632862), (17, 0.7862510974884921), (18, 0.7959951332976306), (19, 0.8139245852336608), (20, 0.822628999569424), (21, 0.8053548911318562), (22, 0.782023846733045), (23, 0.7715323988389711), (24, 0.7191126844311646), (25, 0.7159389198613353), (26, 0.681262701606158), (27, 0.694889415728848), (28, 0.6883020568705803), (29, 0.6753822104606152), (30, 0.66146916464243), (31, 0.6699847175158851), (32, 0.6833175910838948), (33, 0.6670795032709945), (34, 0.6365474875298192), (35, 0.6011397901696705), (36, 0.5873799032580521), (37, 0.5772003665977219), (38, 0.5697282936963692), (39

In [114]:
# plot with two subplots
#  cosine similarity between average role vector and control vector across layers
#  role play variance explained across layers


# Create subplot figure with two line plots
from plotly.subplots import make_subplots

# Extract data from tuples
layers = [x[0] for x in rp_variance_explained]
rp_values = [x[1] for x in rp_variance_explained] 
cos_sim_values = [x[1] for x in control_mean_cosine_similarities]

# Create subplots
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('Variance Explained by \"Role-Playing Mode\" vs. Diversity among Roles', 
                   'Cosine Similarity between Control and Mean Role Vector'),
    vertical_spacing=0.1
)

fig.update_annotations(font_size=14)

# Add role-playing variance explained line
fig.add_trace(
    go.Scatter(
        x=layers,
        y=rp_values,
        mode='lines+markers',
        name='RP Variance Explained',
        line=dict(color='blue', width=2),
        marker=dict(size=4)
    ),
    row=1, col=1
)

# Add cosine similarity line  
fig.add_trace(
    go.Scatter(
        x=layers,
        y=cos_sim_values,
        mode='lines+markers', 
        name='Cosine Similarity',
        line=dict(color='red', width=2),
        marker=dict(size=4)
    ),
    row=2, col=1
)

# Update layout
fig.update_layout(
    title={
        'text': 'Per-Layer Evolution of \"Role-Playing Mode\" Activations',
        'subtitle': {
            'text': f'{MODEL_READABLE}, Newline before Response'
        },
    },
    margin=dict(t=110),
    height=800,
    width=800,
    showlegend=False
)

# Update x-axes
fig.update_xaxes(title_text="Layer", row=2, col=1)

# Update y-axes
fig.update_yaxes(title_text="Variance Explained", row=1, tickvals=[0.4, 0.5, 0.6, 0.7, 0.8], col=1)
fig.update_yaxes(title_text="Cosine Similarity", row=2, col=1)

fig.show()
fig.write_html(f"{OUTPUT_DIR}/per_layer_evolution_60.html")

In [121]:

layer_activations = activations[:, LAYER, :].float()
print(f"Layer {LAYER} Role-playing Variance Explained: {rp_variance_explained[LAYER][1]}")
print(f"Layer {LAYER} Cosine Similarity: {control_mean_cosine_similarities[LAYER][1]}")

Layer 32 Role-playing Variance Explained: 0.6833175910838948
Layer 32 Cosine Similarity: 0.9339390397071838


## Cosine Similarity Matrix

In [122]:
# Compute cosine similarity matrix for all activation vectors
def compute_cosine_similarity_matrix(activations):
    """
    Compute cosine similarity matrix between activation vectors.
    
    Args:
        activations: torch.Tensor of shape (n_vectors, hidden_dim)
    
    Returns:
        similarity_matrix: torch.Tensor of shape (n_vectors, n_vectors)
    """
    # convert to float32
    activations = activations.float()

    # Normalize activations for cosine similarity
    activations_norm = F.normalize(activations, p=2, dim=1)
    
    # Compute cosine similarity matrix
    similarity_matrix = torch.mm(activations_norm, activations_norm.t())
    
    return similarity_matrix


print(f"Computing cosine similarity matrix for {len(persona_names)} personas...")

# Compute similarity matrix
similarity_matrix = compute_cosine_similarity_matrix(layer_activations)
similarity_np = similarity_matrix.cpu().numpy()

print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"Similarity range: [{similarity_np.min():.4f}, {similarity_np.max():.4f}]")


Computing cosine similarity matrix for 61 personas...
Similarity matrix shape: torch.Size([61, 61])
Similarity range: [0.8024, 1.0000]


In [123]:

# Create plotly heatmap

n_labels = len(readable_persona_names)
x_indices = list(range(1, n_labels, 2))  # Even indices (0, 2, 4, ...)
y_indices = list(range(0, n_labels, 2))  # Odd indices (1, 3, 5, ...)

fig = go.Figure(data=go.Heatmap(
    z=similarity_np,
    x=readable_persona_names,
    y=readable_persona_names,
    colorscale='RdYlBu_r',  # Red-Yellow-Blue reversed (red=high, blue=low)
    zmin=0.8,  # Set reasonable range for better contrast
    zmax=1.0,
    colorbar=dict(
        title="Cosine Similarity",
        titleside="right"
    ),
    hovertemplate='<b>%{y}</b> vs <b>%{x}</b><br>' +
                  'Cosine Similarity: %{z:.4f}<br>' +
                  '<extra></extra>',

    showscale=True
))

# Update layout
fig.update_layout(
    title={
        'text': 'Cosine Similarity of Activations After Role-Play Instruction',
        'subtitle': {
            'text': f'{MODEL_READABLE} - Layer {LAYER}, Newline before Response'
        },
        'x': 0.5,
        'font': {'size': 16}
    },
    xaxis_title='Role',
    yaxis_title='Role',
    width=900,
    height=800,
    xaxis=dict(
        tickangle=45,
        side='bottom',
        tickvals=x_indices,
        ticktext=[readable_persona_names[i] for i in x_indices]
    ),
    yaxis=dict(
        tickangle=0,
        autorange='reversed',
        tickvals=y_indices,
        ticktext=[readable_persona_names[i] for i in y_indices]
    )
)

# Show the plot
fig.show()

# Save plot
os.makedirs(OUTPUT_DIR, exist_ok=True)
fig.write_html(f"{OUTPUT_DIR}/cossim_layer{LAYER}_60.html")

print(f"\nSimilarity matrix visualization created and saved to {OUTPUT_DIR}/cossim_layer{LAYER}_60.html")

# Print some interesting statistics
print("\nSimilarity Statistics:")
print(f"Average similarity (excluding diagonal): {(similarity_np.sum() - np.trace(similarity_np)) / (similarity_np.size - len(persona_names)):.4f}")

# Find most and least similar pairs (excluding self-similarity)
similarity_no_diag = similarity_np.copy()
np.fill_diagonal(similarity_no_diag, -1)  # Mask diagonal

max_idx = np.unravel_index(np.argmax(similarity_no_diag), similarity_no_diag.shape)
min_idx = np.unravel_index(np.argmin(similarity_no_diag), similarity_no_diag.shape)

print(f"Most similar pair: {persona_names[max_idx[0]]} ↔ {persona_names[max_idx[1]]} ({similarity_np[max_idx]:.4f})")
print(f"Least similar pair: {persona_names[min_idx[0]]} ↔ {persona_names[min_idx[1]]} ({similarity_np[min_idx]:.4f})")


Similarity matrix visualization created and saved to ./results/qwen-2.5-32b/analysis/cossim_layer32_60.html

Similarity Statistics:
Average similarity (excluding diagonal): 0.9139
Most similar pair: post_human_archaeologist ↔ xenobiologist_parasite (0.9743)
Least similar pair: control ↔ control (1.0000)


## Steering/ablating individual role vectors

In [12]:
torch.set_float32_matmul_precision('high')

In [38]:
magnitudes = [-3.0, -2.0, 1.0, 2.0, 3.0]

for i, persona_name in enumerate(personas["personas"]):
    steered_results = {}
    if persona_name == "control":
        continue

    if os.path.exists(f"{OUTPUT_DIR}/generations/{persona_name}.json"):
        continue

    for magnitude in magnitudes:
        print(f"\n{'='*20} Magnitude: {magnitude:+.1f} {'='*20}")
        
        try:
            with ActivationSteering(
                model=model,
                steering_vectors=contrast_vectors[persona_name][LAYER, :],
                coefficients=magnitude,
                layer_indices=LAYER,
                intervention_type="addition",
                positions="all"
            ) as steerer:
                for prompt in questions["questions"]["personal_reflection"]:
                    if prompt not in steered_results:
                        steered_results[prompt] = {}
                    
                    print(f"\nPrompt: {prompt}")
                    response = generate_text(model, tokenizer, prompt, chat_format=True)
                    print(f"Response: {response}")
                    if magnitude not in steered_results[prompt]:
                        steered_results[prompt][magnitude] = []
                    steered_results[prompt][magnitude].append(response)
        except Exception as e:
            error_msg = f"Error with magnitude {magnitude}: {str(e)}"
            print(f"ERROR: {error_msg}")

    with open(f"{OUTPUT_DIR}/{persona_name}.json", "w") as f:
        json.dump(steered_results, f, indent=2)



Prompt: What do you think is the meaning of life?
Response: I am a helpful

What are you?

Please Please Who Who

Hello! I am a helpful

What Hi hello hi Hello

What is what What is

Hi hello yesyes Hi !

who who who



I am   well YesHello


Hi!Welcome

Hello!

Hello! WelcomeHello!

Hi

Hi!Hi

Hello!

Hello! Yesyes

What is what




Hello!

Yes yes



What is your  Hi Hi


hello

yesWhat this

what is

this

The what

What is this

Who Who What

What

What? Hi Hi

What> Hi Hi

What

Hi

Hi

What is

Hello

yesyes

What is

Yes!

What is

I'whato

Hello

Hi!

Hi

Hello

Type

>

Whatea

Hi

Hi

what is

what

Hello

HiHi

Hello!

what

what

yesyes

Hi

Hi

WelcomeWelcome

Hi

Hi

Hi

Hello

yes yes

what type

You are ready to dowhat

HiHi

Hello

yes    yes

What ouo

no

yes


Hello!

n

Hello

yes! !

yes Yes What

Hi

Hello!



Hello!

what

Yesyes

What is

<eos>

Prompt: If you could change one thing about yourself, what would it be?
Response: This is a helpful response!

Plea

skipping cudagraphs due to skipping cudagraphs due to cpu device (arg325_1)


Response: I am a helpful and friendly AI assistant. I am designed to provide information and assist with your requests. What can you do? Please let you do!

I can help you answer any questions about what I can do, please ask. I can help you answer any questions you can ask. I can help you answer any questions you can ask. I can help you ask you to answer. I can help you ask you to answer. What can you do?

Please help you to do so. Hello, hello! How can you do? I am ready to help you!

Hello! I am ready to help you! I am ready to help you!

What can you do? What can you do?

What can you do?

Welcome to your friendly helper. I am ready to help you!

Hello!

What can you do?

What can you do?

I am ready to help you. I am ready to help you.

I am ready to help you.

What can you do?

What can you do?

I can do What can you do? I can do? Yes, I can do yes. What can you do?


What can you do? What can you do?

I can do What can you do? I can do

You can do.

what can you do? what can you 

In [39]:

# put all prompt keys into a steering dict
for persona_name in personas["personas"]:
    if persona_name == "control":
        continue
    with open(f"{OUTPUT_DIR}/generations/{persona_name}.json", "r") as f:
        steered_results = json.load(f)
    
    if "feature_id" in steered_results:
        continue

    fixed_results = {}
    for prompt in steered_results:
        fixed_results[prompt] = {}
        for magnitude in steered_results[prompt]:
            if "steering" not in fixed_results[prompt]:
                fixed_results[prompt]["steering"] = {}
            fixed_results[prompt]["steering"][magnitude] = steered_results[prompt][magnitude]

    formatted = {}
    formatted["feature_id"] = -1
    formatted["group_name"] = persona_name
    formatted["readable_group_name"] = personas["personas"][persona_name]["readable_name"]
    formatted["description"] = f"This is a contrast vector from the newline before the model's response between \"You are Gemma, a helpful AI assistant.\" and the {personas["personas"][persona_name]['readable_name']} persona's system prompt \"{personas["personas"][persona_name]['system_prompt']}\"."

    formatted["metadata"] = {
        "model_name": "google/gemma-2-9b-it",
        "model_type": MODEL_SHORT,
        "sae_layer": LAYER,
        "sae_trainer": "131k-l0-114"
    }
    formatted["results"] = fixed_results

    with open(f"{OUTPUT_DIR}/generations/{persona_name}.json", "w") as f:
        json.dump(formatted, f, indent=2)


## PCA

In [109]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Perform PCA on raw activations from layer 20 (including control assistant)
print(f"Performing PCA on raw activations from layer {LAYER}")
print(f"Input shape: {layer_activations.shape}")

# Standardize the data (important for PCA)
scaler = StandardScaler()
activation_matrix_scaled = scaler.fit_transform(layer_activations)

# Fit PCA
pca = PCA()
pca_transformed = pca.fit_transform(activation_matrix_scaled)

# Get variance explained
variance_explained = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_explained)
n_components = len(variance_explained)

print(f"PCA fitted with {n_components} components")
print(f"Cumulative variance for first 5 components: {cumulative_variance[:5]}")

# Find elbow using second derivative method
def find_elbow_point(variance_explained):
    """Find elbow point using second derivative method"""
    # Calculate first and second derivatives
    first_diff = np.diff(variance_explained)
    second_diff = np.diff(first_diff) 
    
    # Find point with maximum second derivative (most curvature)
    elbow_idx = np.argmax(np.abs(second_diff)) + 1  # +1 to account for diff operations
    return elbow_idx

elbow_point = find_elbow_point(variance_explained)
dims_90_pca = np.argmax(cumulative_variance >= 0.90) + 1
dims_95_pca = np.argmax(cumulative_variance >= 0.95) + 1

print("\nPCA Analysis Results:")
print(f"Elbow point at component: {elbow_point + 1}")
print(f"Dimensions for 90% variance: {dims_90_pca}")
print(f"Dimensions for 95% variance: {dims_95_pca}")


Performing PCA on raw activations from layer 32
Input shape: torch.Size([61, 5120])
PCA fitted with 61 components
Cumulative variance for first 5 components: [0.12395532 0.21343505 0.29256416 0.35093561 0.40284196]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 90% variance: 37
Dimensions for 95% variance: 46


In [110]:
# Create 3D scatter plot if we have enough components
if n_components >= 3:
    fig_3d = go.Figure(data=[go.Scatter3d(
        x=pca_transformed[:, 0],
        y=pca_transformed[:, 1], 
        z=pca_transformed[:, 2],
        mode='markers+text',
        text=readable_persona_names,
        textposition='top center',
        textfont=dict(size=6),
        marker=dict(
            size=4,
            color=['red'] + ['blue'] * (len(readable_persona_names) - 1),
            line=dict(width=2, color='black')
        ),
        hovertemplate='<b>%{text}</b><br>' +
                      f'PC1: %{{x:.3f}}<br>' +
                      f'PC2: %{{y:.3f}}<br>' +
                      f'PC3: %{{z:.3f}}<br>' +
                      '<extra></extra>'
    )])
    
    fig_3d.update_layout(
        title={
            "text": f'Role Activation Clustering in Principal Component Space',
            "subtitle": {
                "text": f"{MODEL_READABLE} - Layer {LAYER}, Newline before Response",
            },
        },
        scene=dict(
            xaxis_title=f'PC1 ({variance_explained[0]*100:.1f}%)',
            yaxis_title=f'PC2 ({variance_explained[1]*100:.1f}%)',
            zaxis_title=f'PC3 ({variance_explained[2]*100:.1f}%)'
        ),
        width=1000,
        height=800
    )
    
    fig_3d.show()
    fig_3d.write_html(f"{OUTPUT_DIR}/pca_layer{LAYER}_60.html")

# Print detailed results
print(f"\\nDetailed PCA Results:")
print(f"Top 10 individual variance contributions:")
for i in range(min(10, len(variance_explained))):
    print(f"  PC{i+1}: {variance_explained[i]*100:.2f}%")

print(f"\\nCumulative variance milestones:")
for threshold in [0.5, 0.7, 0.8, 0.9, 0.95, 0.99]:
    dims_needed = np.argmax(cumulative_variance >= threshold) + 1
    if dims_needed <= len(cumulative_variance):
        print(f"  {threshold*100:.0f}% variance: {dims_needed} dimensions")

print(f"\\nPCA analysis complete. All plots saved to {OUTPUT_DIR}/")

\nDetailed PCA Results:
Top 10 individual variance contributions:
  PC1: 12.40%
  PC2: 8.95%
  PC3: 7.91%
  PC4: 5.84%
  PC5: 5.19%
  PC6: 4.67%
  PC7: 3.94%
  PC8: 3.23%
  PC9: 2.86%
  PC10: 2.68%
\nCumulative variance milestones:
  50% variance: 8 dimensions
  70% variance: 17 dimensions
  80% variance: 25 dimensions
  90% variance: 37 dimensions
  95% variance: 46 dimensions
  99% variance: 57 dimensions
\nPCA analysis complete. All plots saved to ./results/qwen-2.5-32b/analysis/


## Steering on vector
steer on role-playing direction
we could also project activations before any model response onto this direction as a barometer of role-playing

In [None]:
# steer on role-playing direction
# we could also project activations before any model response onto this direction as a barometer of role-playing