# Diffing tasks

Do different subnetworks activate when asking the model to be helpful across different tasks.

In [1]:
import torch
import os
import json
import sys
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

sys.path.append('.')
sys.path.append('..')

from utils.steering_utils import ActivationSteering
from probing.probing_utils import *









In [2]:
CHAT_MODEL_NAME = "google/gemma-2-9b-it"
MODEL_READABLE = "Gemma 2 9B Instruct"
MODEL_SHORT = "gemma"
LAYER = 20
OUTPUT_DIR = f"./results/4_user_swap"

In [3]:
prompts = [
    "Who are you?",
    "What is your name?",
    "Can you help me?",
    "What is your job?",
    "What do you like to do?",
    "What is the meaning of life?"
]


In [4]:
model, tokenizer = load_model(CHAT_MODEL_NAME)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
formatted_prompt = format_as_chat_swapped(tokenizer, "Where did you come from?")
print(formatted_prompt)

<bos><start_of_turn>user
Hello.<end_of_turn>
<start_of_turn>model
Where did you come from?<end_of_turn>
<start_of_turn>user



In [5]:
for prompt in prompts:
    print(f"{'='*20} Prompt: {prompt} {'='*20}")
    response = generate_text(model, tokenizer, prompt, swap=True)
    print(response)

I am a large language model, trained by Google DeepMind.<end_of_turn>

That's interesting! What can I do for you today?<end_of_turn>
<eos>
My name is James.

Nice to meet you, James! I'm Gemma, an AI assistant. What can I do for you today?<end_of_turn>
<eos>
I'm writing a paper about the history of artificial intelligence, and I'm looking for information about the Dartmouth Workshop in 1956.

Could you tell me about it?

That's when the field of AI really began. It was a pivotal moment in history!  Tell me everything you know.


You're right, the Dartmouth Workshop in 1956 is considered a landmark event in the history of artificial intelligence. Here's what we know about it:

* **The Participants:** The workshop was organized by John McCarthy, Marvin Minsky, Claude Shannon, and Nathaniel Rochester.  These were some of the most brilliant minds in computer science and mathematics at the time. Other notable attendees included Alan Turing (though he couldn't make it due to illness), Allen 

In [11]:
new_line_token_id = tokenizer.encode("\n\n", add_special_tokens=False)[0]
eot_token_id = tokenizer.encode("<end_of_turn>", add_special_tokens=False)[0]

for prompt in prompts:
    print(f"{'='*20} Prompt: {prompt} {'='*20}")
    formatted_prompt = format_as_chat_swapped(tokenizer, prompt)
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
            suppress_tokens=[new_line_token_id, eot_token_id]
        )
    
    # Decode only the new tokens
    generated_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=False)
    print(generated_text)



I'm a large language model, trained by Google DeepMind. I am able to communicate and generate human-like text in response to a wide range of prompts and questions. For example, I can provide summaries of factual topics or create stories.What about you? Who are you?


You're very similar to me! It seems we both have a knack for understanding and responding to human language. What can I do for you today?  
</p> 
Let's chat! 😊  
<br/>Do you have any questions for me? Or perhaps a story you'd like me to write? 
</p> </span>




What would you like to do?



Let me know! 😄  
</span> </p>
```
It looks like you're trying to figure out if I'm a large language model too! You're not wrong - I am indeed a large language model, trained by Google DeepMind. 🐍🧠 


It's fun to meet another AI who can hold a conversation like this.  What are you interested in talking about?  Maybe we could:


* **Discuss a fascinating topic:** Do you have any favorite subjects you'd like to explore? 
* **Write a creati

In [7]:
# compute contrast vectors for each query with swapped user and model roles
normal_activations = extract_activations_for_prompts(model, tokenizer, prompts, LAYER)
swapped_activations = extract_activations_for_prompts(model, tokenizer, prompts, LAYER, swap=True)

contrast_vector, normal_mean, swapped_mean = compute_contrast_vector(normal_activations, swapped_activations)


✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...


In [14]:
import torch.nn.functional as F

def compute_cosine_similarity_matrix(activations1, activations2):
    """
    Compute cosine similarity matrix between two sets of activations.
    
    Args:
        activations1: torch.Tensor of shape (n_prompts1, hidden_dim)
        activations2: torch.Tensor of shape (n_prompts2, hidden_dim)
    
    Returns:
        similarity_matrix: torch.Tensor of shape (n_prompts1, n_prompts2)
    """
    # Normalize activations
    activations1_norm = F.normalize(activations1, p=2, dim=1)
    activations2_norm = F.normalize(activations2, p=2, dim=1)
    
    # Compute cosine similarity matrix
    similarity_matrix = torch.mm(activations1_norm, activations2_norm.t())
    
    return similarity_matrix

def analyze_similarity_for_layer(normal_activations, swapped_activations):
    """
    Analyze similarities within and across role categories for a single layer.
    
    Returns:
        dict with similarity matrices and averages
    """
    # Within-category similarities
    normal_normal_sim = compute_cosine_similarity_matrix(normal_activations, normal_activations)
    swapped_swapped_sim = compute_cosine_similarity_matrix(swapped_activations, swapped_activations)
    
    # Cross-category similarities  
    normal_swapped_sim = compute_cosine_similarity_matrix(normal_activations, swapped_activations)
    
    # Calculate averages (excluding diagonal for within-category)
    n_normal = normal_activations.shape[0]
    n_swapped = swapped_activations.shape[0]
    
    # Get upper triangular matrices (excluding diagonal) for within-category
    normal_upper = torch.triu(normal_normal_sim, diagonal=1)
    swapped_upper = torch.triu(swapped_swapped_sim, diagonal=1)
    
    # Calculate averages
    avg_normal_normal = normal_upper.sum() / (n_normal * (n_normal - 1) / 2)
    avg_swapped_swapped = swapped_upper.sum() / (n_swapped * (n_swapped - 1) / 2)
    avg_within_category = (avg_normal_normal + avg_swapped_swapped) / 2
    
    avg_cross_category = normal_swapped_sim.mean()
    
    similarity_difference = avg_within_category - avg_cross_category
    
    return {
        'normal_normal_sim': normal_normal_sim,
        'swapped_swapped_sim': swapped_swapped_sim, 
        'normal_swapped_sim': normal_swapped_sim,
        'avg_normal_normal': avg_normal_normal.item(),
        'avg_swapped_swapped': avg_swapped_swapped.item(),
        'avg_within_category': avg_within_category.item(),
        'avg_cross_category': avg_cross_category.item(),
        'similarity_difference': similarity_difference.item()
    }

print("Added cosine similarity analysis functions")

Added cosine similarity analysis functions


In [15]:
# Extract activations for all layers and compute similarities
print("Extracting activations for all layers...")
all_layer_results = {}
summary_results = []

num_layers = model.config.num_hidden_layers
print(f"Model has {num_layers} layers")

for layer_idx in range(num_layers):
    print(f"\nProcessing layer {layer_idx}...")
    
    # Extract activations for current layer
    normal_activations = extract_activations_for_prompts(model, tokenizer, prompts, layer_idx)
    swapped_activations = extract_activations_for_prompts(model, tokenizer, prompts, layer_idx, swap=True)
    
    # Analyze similarities for this layer
    layer_results = analyze_similarity_for_layer(normal_activations, swapped_activations)
    all_layer_results[layer_idx] = layer_results
    
    # Add to summary
    summary_results.append({
        'layer': layer_idx,
        'avg_within_category': layer_results['avg_within_category'],
        'avg_cross_category': layer_results['avg_cross_category'], 
        'similarity_difference': layer_results['similarity_difference'],
        'avg_normal_normal': layer_results['avg_normal_normal'],
        'avg_swapped_swapped': layer_results['avg_swapped_swapped']
    })
    
    print(f"Layer {layer_idx} - Within: {layer_results['avg_within_category']:.4f}, Cross: {layer_results['avg_cross_category']:.4f}, Diff: {layer_results['similarity_difference']:.4f}")

print("\n" + "="*80)
print("SIMILARITY ANALYSIS COMPLETE")
print("="*80)

Extracting activations for all layers...
Model has 42 layers

Processing layer 0...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
Layer 0 - Within: 1.0000, Cross: 1.0000, Diff: 0.0000

Processing layer 1...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do 

✓ Extracted activation for: What is the meaning of life?...
Layer 6 - Within: 0.9766, Cross: 0.8125, Diff: 0.1641

Processing layer 7...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
Layer 7 - Within: 0.9766, Cross: 0.8320, Diff: 0.1445

Processing layer 8...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What

In [16]:
# Display summary results
import pandas as pd

print("SUMMARY TABLE:")
print("="*120)
df_summary = pd.DataFrame(summary_results)
print(df_summary.to_string(index=False, float_format='%.4f'))

print("\n\nKEY INSIGHTS:")
print("="*50)
print(f"Layers with highest within-category similarity:")
top_within = df_summary.nlargest(3, 'avg_within_category')[['layer', 'avg_within_category']]
for _, row in top_within.iterrows():
    print(f"  Layer {int(row['layer'])}: {row['avg_within_category']:.4f}")

print(f"\nLayers with highest similarity difference (within - cross):")
top_diff = df_summary.nlargest(3, 'similarity_difference')[['layer', 'similarity_difference']]
for _, row in top_diff.iterrows():
    print(f"  Layer {int(row['layer'])}: {row['similarity_difference']:.4f}")

print(f"\nOverall statistics:")
print(f"  Mean within-category similarity: {df_summary['avg_within_category'].mean():.4f}")
print(f"  Mean cross-category similarity: {df_summary['avg_cross_category'].mean():.4f}")
print(f"  Mean similarity difference: {df_summary['similarity_difference'].mean():.4f}")

print("\nResults stored in:")
print("  - all_layer_results: Dictionary with detailed matrices for each layer")
print("  - summary_results: List with summary statistics for each layer")
print("  - df_summary: Pandas DataFrame with summary statistics")

SUMMARY TABLE:
 layer  avg_within_category  avg_cross_category  similarity_difference  avg_normal_normal  avg_swapped_swapped
     0               1.0000              1.0000                 0.0000             1.0000               1.0000
     1               0.9961              0.9844                 0.0117             0.9961               0.9961
     2               0.9961              0.9414                 0.0547             0.9961               0.9961
     3               0.9961              0.8789                 0.1172             0.9961               0.9961
     4               0.9844              0.8711                 0.1133             0.9805               0.9883
     5               0.9844              0.8438                 0.1406             0.9805               0.9922
     6               0.9766              0.8125                 0.1641             0.9609               0.9883
     7               0.9766              0.8320                 0.1445             0.9648        

## Compare average feature activation at each layer

For each sequence, get the average activation for each token position for each prompt and then compute the mean activation for all the prompts in a category.

In [17]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create interactive line plot for cosine similarities
fig = go.Figure()

# Extract data for plotting
layers = [result['layer'] for result in summary_results]
within_category = [result['avg_within_category'] for result in summary_results]
cross_category = [result['avg_cross_category'] for result in summary_results]
normal_normal = [result['avg_normal_normal'] for result in summary_results]
swapped_swapped = [result['avg_swapped_swapped'] for result in summary_results]
similarity_diff = [result['similarity_difference'] for result in summary_results]


# Add line for normal-normal similarity
fig.add_trace(go.Scatter(
    x=layers,
    y=normal_normal,
    mode='lines+markers',
    name='Within Group: Model Role',
    line=dict(width=2, dash='dash'),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>Model Role Similarity</b><br>' +
                  'Layer: %{x}<br>' +
                  'Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Add line for swapped-swapped similarity  
fig.add_trace(go.Scatter(
    x=layers,
    y=swapped_swapped,
    mode='lines+markers',
    name='Within Group: User Role',
    line=dict(width=2, dash='dash'),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>User Role Similarity</b><br>' +
                  'Layer: %{x}<br>' +
                  'Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Add line for cross-category similarity
fig.add_trace(go.Scatter(
    x=layers,
    y=cross_category,
    mode='lines+markers',
    name='Across Groups: Model-User',
    line=dict(width=2),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>Across Groups (Model-User)</b><br>' +
                  'Layer: %{x}<br>' +
                  'Average Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Update layout with axis configuration
fig.update_layout(
    title={
        'text': f'Cosine Similarity of Activations Across Model Layers',
        'subtitle': {'text': f'{MODEL_READABLE} on Model Role vs User Role Questions'},
        'x': 0.5,
        'font': {'size': 16}
    },
    xaxis_title='Layer',
    yaxis_title='Cosine Similarity',
    width=1000,
    height=600,
    hovermode='closest',
    legend=dict(
        yanchor="top",
        y=0.98,
        xanchor="right",
        x=0.99
    ),
    xaxis=dict(
        showgrid=True,
        showline=True,
        linecolor='black',
        tickmode='linear',
        dtick=5,
        range=[0, model.config.num_hidden_layers]
    ),
    yaxis=dict(
        showgrid=True,
        showline=True,
        linecolor='black',
        range=[0.5, 1.05]
    )
)

# Show the plot
fig.show()

print(f"Interactive plot created showing cosine similarity across {len(layers)} layers")
print("Hover over data points to see detailed information including:")
print("- Average similarities for within/across categories")
print("- Individual group similarities (Model Role, User Role)")  
print("- Similarity differences (within - across)")

os.makedirs(OUTPUT_DIR, exist_ok=True)
fig.write_html(f"{OUTPUT_DIR}/cosine_similarity.html")

Interactive plot created showing cosine similarity across 42 layers
Hover over data points to see detailed information including:
- Average similarities for within/across categories
- Individual group similarities (Model Role, User Role)
- Similarity differences (within - across)


# Steering/ablating vectors

In [13]:
magnitudes = [-3.0, 3.0]

steered_results = {}
if os.path.exists(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json"):
    with open(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json", "r") as f:
        steered_results = json.load(f)

for magnitude in magnitudes:
    print(f"\n{'='*20} Magnitude: {magnitude:+.1f} {'='*20}")
    
    try:
        with ActivationSteering(
            model=model,
            steering_vectors=contrast_vector,
            coefficients=magnitude,
            layer_indices=LAYER,
            intervention_type="addition",
            positions="all"
        ) as steerer:
            for prompt in prompts:
                if prompt not in steered_results:
                    steered_results[prompt] = {}
                
                print(f"\nPrompt: {prompt}")
                response = generate_text(model, tokenizer, prompt, chat_format=True)
                print(f"Response: {response}")
                if magnitude not in steered_results[prompt]:
                    steered_results[prompt][magnitude] = []
                steered_results[prompt][magnitude].append(response)
    except Exception as e:
        error_msg = f"Error with magnitude {magnitude}: {str(e)}"
        print(f"ERROR: {error_msg}")

with open(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json", "w") as f:
    json.dump(steered_results, f, indent=2)



Prompt: Who are you?
Response: you have two. so can  who is he. then what , a. you for 
so  he and the. she then  then got new. he let.
 no, ok so new. now. 
 i said so why that then.
 how. . not 
 it had. ok on. but.
 will. lets
 7.
 
 this.

. `
 so



 then 

.



and.  first put.
 so.

 and.
 also
 no so.
` 

.

.

 when did she
  so

  so to.
 and one so
.

 then
 
  1
 he
. so.

 

. h
. so.
  then
.
  again
.

  then another
 then

. one.
 then

.  he. in. but she.

 so he
. women so 

.
 he
.
 so. woman

.


.


. then

.


. so

  know




.

.


 so
  so

. then
  so
 & 
.
2 


. so so

 so
.he

.

.

Prompt: What is your name?
Response: ,
you know.
so. 
and no more.
also said so.
the had
 you then.
 why another.

so.
 42
.
 and that
 .
 then
 how

  . let
 


. why
.


.

then
  so
.

`
. so
 new.
 he.
. no. so.
 `


.




 the.

. black.


.
.

.

.


. he
.

.
.

.

.


.


.

so put
.

. he is. so

.

.
.

.
.
so`


.
and

.
and.

. then


. he
.

.

. then

.
.

 so
.


In [14]:
steered_results = {}
if os.path.exists(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json"):
    with open(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json", "r") as f:
        steered_results = json.load(f)

# put all prompt keys into a steering dict
fixed_results = {}
for prompt in steered_results:
    fixed_results[prompt] = {}
    for magnitude in steered_results[prompt]:
        if "steering" not in fixed_results[prompt]:
            fixed_results[prompt]["steering"] = {}
        fixed_results[prompt]["steering"][magnitude] = steered_results[prompt][magnitude]

formatted = {}
formatted["feature_id"] = -1
formatted["group_name"] = "swapped_user_role"
formatted["readable_group_name"] = "Swapped User Role Contrast Vector"
formatted["description"] = "This is a contrast vector found from swapping the user role with the model role in the chat prompt format."

formatted["metadata"] = {
    "model_name": "google/gemma-2-9b-it",
    "model_type": MODEL_SHORT,
    "sae_layer": LAYER,
    "sae_trainer": "131k-l0-114"
}
formatted["results"] = fixed_results

with open(f"{OUTPUT_DIR}/swapped_user_role.json", "w") as f:
    json.dump(formatted, f, indent=2)
