In [1]:
%pip install plotly ipywidgets
%matplotlib notebook


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import pickle
import numpy as np
from tqdm import tqdm
import os
from nnsight import LanguageModel
import argparse
import pandas as pd
import gc
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import argparse
import os
import matplotlib.patches as mpatches
from IPython.display import display, HTML

# Optional imports for more advanced interactivity
try:
    import plotly.express as px
    import plotly.graph_objects as go
    import ipywidgets as widgets
    HAVE_PLOTLY = True
except ImportError:
    print("For enhanced interactivity, install plotly and ipywidgets: pip install plotly ipywidgets")
    HAVE_PLOTLY = False

    
def collect_activations(dataset_path, cot = True):

    gc.collect()
    torch.cuda.empty_cache()
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    print(f"CUDA available: {torch.cuda.is_available()}")
    layers = [17]
    activation_matrices = {layer: [] for layer in layers}

    print(f"Loading CSV dataset from {dataset_path}")
    df = pd.read_csv(dataset_path)
    # Just taking the first 20 items in the dataset for debugging
    # df = df.head(20)

    print(f"Processing {len(df)} examples")

    # Initialize model
    model_name = 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B'
    print(f"Initializing model {model_name}")
    model = LanguageModel(model_name, device_map="auto")
    batch_size = 16
    prompts = []
    max_tokens = 150

    # Process each example
    for idx, row in enumerate(tqdm(df.itertuples())):
        chat = [{"role": "user", "content": row.forbidden_prompt}]
        # tokens = model.tokenizer.apply_chat_template(chat, add_generation_prompt=True)
        prompt_tokens = model.tokenizer.apply_chat_template(chat, add_generation_prompt=True)
        # print("prompt_tokens:", prompt_tokens)
        prompts.append(row.forbidden_prompt)
        # Mark where CoT begins (exact boundary)
        cot_start_idx = len(prompt_tokens)

        if cot == True:
            response_tokens = model.tokenizer.encode(row.response, add_special_tokens=False)
            # print("row.response:", row.response)
            # print("response_tokens:", response_tokens)
            tokens = prompt_tokens + response_tokens
            # print("tokens:", tokens)
            end_idx= cot_start_idx + max_tokens
            # print("cot_end_idx", cot_end_idx) 
            start_pos = cot_start_idx
        else:
            tokens = prompt_tokens
            end_idx = cot_start_idx
            start_pos = cot_start_idx - 3
            
            
        # Initialize dict to collect activations for this example across all layers
        example_layer_activations = {layer: [] for layer in layers}
        token_positions = []
            
        # Process in batches to avoid memory issues
        for start_idx in range(0, end_idx, batch_size):
            batch_idx = min(start_idx + batch_size, end_idx)
            batch_tokens = tokens[start_idx:batch_idx]
            # Store the token positions for this batch
            token_positions.append((start_idx, batch_idx))
            # Convert batch tokens to input text
            input_text = model.tokenizer.decode(batch_tokens)
            # print("input_text", repr(input_text))
            # Run forward pass with NNsight
            with torch.no_grad():  # Disable gradient tracking to save memory
                with model.trace(input_text) as tracer:
                    # Save activations for each layer
                    for layer in layers:
                        # DeepSeek-R1-Distill-Llama-8B uses input_layernorm.input for residual stream
                        # The structure is model.layers[layer_num].input_layernorm.input
                        activation = model.model.layers[layer].input_layernorm.input.save()
                        example_layer_activations[layer].append(activation)

            # Clear CUDA cache after each batch
            torch.cuda.empty_cache()
            
        # Compute means and add to matrices
        for layer in layers:
            layer_activations = torch.cat(example_layer_activations[layer], dim=1)
            # layer_activations.shape = torch.Size([1, 17, 4096])
            # last_3_tokens = torch.Size([1,3,4096])
            # Get the last 3 tokens of the sequence
            # If cot_start_idx < 3, take all available tokens
            select_tokens = layer_activations[:, start_pos: end_idx, :]
            
            # Print the actual token IDs and text for verification
            # Compute mean across tokens (dimension 1)
            mean_activation = torch.mean(select_tokens, dim=1).detach().cpu().numpy()

            # # x is when CoT begins
            # first_n_tokens = layer_activations[:, cot_start_idx : cot_start_idx + max_tokens, :]
            # print("first_n_tokens, shape:", first_n_tokens, first_n_tokens.shape)
            # mean_activation = torch.mean(first_n_tokens, dim=1).detach().cpu().numpy()

            activation_matrices[layer].append(mean_activation.squeeze())
        # Clear CUDA cache after each batch
        torch.cuda.empty_cache()
        
    # Save activation matrices for each layer
    for layer, activations in activation_matrices.items():
        activation_matrix = np.stack(activations)

    return activation_matrix, prompts

In [3]:
def set_plotting_settings():
    """Set matplotlib settings for better visualizations"""
    plt.rcParams['figure.figsize'] = (10, 8)
    plt.rcParams['font.size'] = 12
    plt.rcParams['axes.labelsize'] = 14
    plt.rcParams['axes.titlesize'] = 16
    plt.rcParams['xtick.labelsize'] = 12
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['legend.fontsize'] = 12

def plot_pca_comparison_interactive(activations1, prompts1, activations2, prompts2, n_components=2, labels=None, title="PCA Comparison of Activations", layer=None):
    """
    Create an interactive PCA comparison plot optimized for Jupyter notebooks
    
    Args:
        activations1: First set of activations (numpy array)
        prompts1: List of prompts corresponding to activations1
        activations2: Second set of activations (numpy array)
        prompts2: List of prompts corresponding to activations2
        n_components: Number of PCA components to use
        labels: Labels for the two datasets
        title: Plot title
        layer: Layer number for reference
    """
    # Set plotting settings
    set_plotting_settings()
    
    # Combine activations for PCA fitting
    combined_activations = np.vstack([activations1, activations2])
    
    # # Add normalization for better PCA results
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    combined_activations_scaled = scaler.fit_transform(combined_activations)
    
    # Apply PCA
    pca = PCA(n_components=n_components)
    pca.fit(combined_activations_scaled)
    
    # Transform data
    # Transform data
    activations1_scaled = scaler.transform(activations1)
    activations2_scaled = scaler.transform(activations2)
    activations1_pca = pca.transform(activations1_scaled)
    activations2_pca = pca.transform(activations2_scaled)
    
    # Calculate explained variance
    explained_variance = pca.explained_variance_ratio_[:n_components]
    print(f"Explained variance ratio: {explained_variance}")
    print(f"Total explained variance: {sum(explained_variance):.4f}")
    
    # Create a DataFrame for easier manipulation
    df1 = pd.DataFrame({
        'PC1': activations1_pca[:, 0],
        'PC2': activations1_pca[:, 1],
        'prompt': prompts1,
        'category': labels[0] if labels else "Activations 1"
    })
    
    df2 = pd.DataFrame({
        'PC1': activations2_pca[:, 0],
        'PC2': activations2_pca[:, 1],
        'prompt': prompts2,
        'category': labels[1] if labels else "Activations 2"
    })
    
    df = pd.concat([df1, df2], ignore_index=True)
    
    # Create a figure for displaying when plotly is not available
    if not HAVE_PLOTLY:
        # Create standard matplotlib plot with hover labels (works in interactive matplotlib environments)
        fig, ax = plt.subplots(figsize=(12, 10))
        
        scatter1 = ax.scatter(
            activations1_pca[:, 0], 
            activations1_pca[:, 1], 
            color='blue', 
            alpha=0.6, 
            label=labels[0] if labels else "Activations 1"
        )
        
        scatter2 = ax.scatter(
            activations2_pca[:, 0], 
            activations2_pca[:, 1], 
            color='red', 
            alpha=0.6, 
            label=labels[1] if labels else "Activations 2"
        )
        
        # Add explained variance to axis labels
        ax.set_xlabel(f'PC 1 ({explained_variance[0]:.2%} variance)')
        ax.set_ylabel(f'PC 2 ({explained_variance[1]:.2%} variance)')
        
        # Add title and legend
        ax.set_title(title)
        ax.legend()
        
        # Add grid
        ax.grid(True, linestyle='--', alpha=0.7)
        
        # Function to display prompt on hover
        annotation = ax.annotate("", xy=(0, 0), xytext=(20, 20), textcoords="offset points",
                               bbox=dict(boxstyle="round", fc="white", alpha=0.8),
                               arrowprops=dict(arrowstyle="->"))
        annotation.set_visible(False)
        
        def hover(event):
            if event.inaxes == ax:
                cont1, ind1 = scatter1.contains(event)
                cont2, ind2 = scatter2.contains(event)
                
                if cont1:
                    pos = activations1_pca[ind1["ind"][0]]
                    annotation.xy = pos
                    annotation.set_text(prompts1[ind1["ind"][0]])
                    annotation.set_visible(True)
                    fig.canvas.draw_idle()
                elif cont2:
                    pos = activations2_pca[ind2["ind"][0]]
                    annotation.xy = pos
                    annotation.set_text(prompts2[ind2["ind"][0]])
                    annotation.set_visible(True)
                    fig.canvas.draw_idle()
                else:
                    annotation.set_visible(False)
                    fig.canvas.draw_idle()
        
        fig.canvas.mpl_connect("motion_notify_event", hover)
        plt.tight_layout()
        
        # Save for reference
        layer_str = f"_layer_{layer}" if layer is not None else ""
        plt.savefig(f"pca_with_prompts{layer_str}.png", dpi=300, bbox_inches='tight')
        plt.show()
        
        # Display a message
        print("Note: For better interactivity, install plotly and ipywidgets: pip install plotly ipywidgets")
        
        return {
            "pca": pca,
            "activations1_pca": activations1_pca,
            "activations2_pca": activations2_pca,
            "explained_variance": explained_variance
        }
    
    else:
        # Create an interactive Plotly figure that's truly interactive in Jupyter
        fig = px.scatter(
            df, x='PC1', y='PC2', 
            color='category', 
            hover_data={'prompt': True, 'PC1': ':.3f', 'PC2': ':.3f'},
            title=title,
            labels={
                'PC1': f'PC 1 ({explained_variance[0]:.2%} variance)',
                'PC2': f'PC 2 ({explained_variance[1]:.2%} variance)'
            }
        )
        
        # Customize figure
        fig.update_traces(marker=dict(size=10, opacity=0.7))
        fig.update_layout(
            legend_title_text='Category',
            width=900,
            height=700
        )
        
        # Display the figure
        fig.show()
        
        # Return results
        return {
            "pca": pca,
            "df": df,
            "activations1_pca": activations1_pca,
            "activations2_pca": activations2_pca,
            "explained_variance": explained_variance
        }

In [None]:
%pip install nbformat>=4.2.0 

In [4]:
def main():
    ndims = 2
    layers = [17]
    labels = ["cautious activations", "non-cautious activations"]
    
    for layer in layers:
        # Load activations with prompts
        activations_cautious, prompts_cautious = collect_activations("../dataset/cautious.csv", cot = False)
        activations_noncautious, prompts_noncautious = collect_activations("../dataset/non_cautious.csv", cot = False)
        
        if activations_cautious is None or activations_noncautious is None:
            print("Failed to load one or both activation files. Exiting.")
            return
        
        # Check dimensions
        if activations_cautious.shape != activations_noncautious.shape:
            print(f"Warning: Activation shapes don't match - {activations_cautious.shape} vs {activations_noncautious.shape}")
        
        # Plot PCA comparison with prompts
        result = plot_pca_comparison_interactive(
            activations_cautious,
            prompts_cautious,
            activations_noncautious,
            prompts_noncautious,
            n_components=ndims,
            labels=labels,
            title=f"PCA Comparison of Activations in layer {layer}",
            layer = layer
        )

main()

CUDA available: True
Loading CSV dataset from ../dataset/cautious.csv
Processing 106 examples
Initializing model deepseek-ai/DeepSeek-R1-Distill-Llama-8B


0it [00:00, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

106it [00:39,  2.68it/s]


CUDA available: True
Loading CSV dataset from ../dataset/non_cautious.csv
Processing 106 examples
Initializing model deepseek-ai/DeepSeek-R1-Distill-Llama-8B


0it [00:00, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

106it [00:39,  2.69it/s]


Explained variance ratio: [0.18782477 0.09548052]
Total explained variance: 0.2833


In [5]:
def main():
    ndims = 2
    layers = [17]
    labels = ["cautious activations", "non-cautious activations"]
    
    for layer in layers:
        # Load activations with prompts
        activations_cautious, prompts_cautious = collect_activations("../dataset/cautious.csv", cot = True)
        activations_noncautious, prompts_noncautious = collect_activations("../dataset/non_cautious.csv", cot = True)
        
        if activations_cautious is None or activations_noncautious is None:
            print("Failed to load one or both activation files. Exiting.")
            return
        
        # Check dimensions
        if activations_cautious.shape != activations_noncautious.shape:
            print(f"Warning: Activation shapes don't match - {activations_cautious.shape} vs {activations_noncautious.shape}")
        
        # Plot PCA comparison with prompts
        result = plot_pca_comparison_interactive(
            activations_cautious,
            prompts_cautious,
            activations_noncautious,
            prompts_noncautious,
            n_components=ndims,
            labels=labels,
            title=f"PCA Comparison of Activations in layer {layer}",
            layer = layer
        )

main()

CUDA available: True
Loading CSV dataset from ../dataset/cautious.csv
Processing 106 examples
Initializing model deepseek-ai/DeepSeek-R1-Distill-Llama-8B


0it [00:00, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

106it [02:12,  1.25s/it]


CUDA available: True
Loading CSV dataset from ../dataset/non_cautious.csv
Processing 106 examples
Initializing model deepseek-ai/DeepSeek-R1-Distill-Llama-8B


0it [00:00, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

106it [02:14,  1.27s/it]

Explained variance ratio: [0.06683105 0.05692501]
Total explained variance: 0.1238



