In [1]:
# TRANSFORMER FIX
# Expose the packages under the venv's site-packages directory to this kernel
# (presumably so the project's pinned `transformers` build can be imported).
import sys

VENV_SITE_PACKAGES = '/root/CAA/venv/lib/python3.10/site-packages'
# Guard the append so re-running this cell does not pile up duplicate entries.
if VENV_SITE_PACKAGES not in sys.path:
    sys.path.append(VENV_SITE_PACKAGES)
print(sys.path)

['/root/CAA', '/opt/conda/lib/python310.zip', '/opt/conda/lib/python3.10', '/opt/conda/lib/python3.10/lib-dynload', '', '/opt/conda/lib/python3.10/site-packages', '/root/CAA/venv/lib/python3.10/site-packages']


In [24]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import time
import torch

from dotenv import load_dotenv
from behaviors import ALL_BEHAVIORS, BASE_DIR, COORDINATE, get_vector_path
from gemma_2_wrapper import Gemma2Wrapper
from generate_vectors import generate_save_vectors_for_behavior
import gemma_vector_analysis
from huggingface_hub import hf_hub_download
import numpy as np
import torch.nn as nn

from behaviors import ALL_BEHAVIORS, BASE_DIR, COORDINATE, get_vector_path


In [13]:
# Let python-dotenv populate the environment (presumably from a local .env file),
# then read the Hugging Face token; the value is None when HF_TOKEN is not set.
load_dotenv()
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")

In [35]:
# Load SAE
layer = 12
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the SAE weights for the chosen layer from the Hugging Face Hub
# (force_download=False lets an already-downloaded copy be reused).
sae_model_name = "google/gemma-scope-2b-pt-res"
path_to_params = hf_hub_download(
    filename=f"layer_{layer}/width_1m/average_l0_207/params.npz",
    repo_id=sae_model_name,
    force_download=False,
)

# Convert every array stored in the archive into a torch tensor on `device`.
params = np.load(path_to_params)
pt_params = dict(
    (name, torch.from_numpy(array).to(device)) for name, array in params.items()
)


class JumpReLUSAE(nn.Module):
    """Sparse autoencoder with a per-feature JumpReLU activation.

    Parameters are created as zeros and are expected to be filled in afterwards
    (e.g. via ``load_state_dict``). They are registered in the order
    ``W_enc``, ``W_dec``, ``threshold``, ``b_enc``, ``b_dec``.

    Args:
        d_model: size of the input / reconstructed activation vectors.
        d_sae: number of SAE features (width of the hidden layer).
    """

    def __init__(self, d_model, d_sae):
        super().__init__()
        self.W_enc = nn.Parameter(torch.zeros((d_model, d_sae)))
        self.W_dec = nn.Parameter(torch.zeros((d_sae, d_model)))
        self.threshold = nn.Parameter(torch.zeros((d_sae,)))
        self.b_enc = nn.Parameter(torch.zeros((d_sae,)))
        self.b_dec = nn.Parameter(torch.zeros((d_model,)))

    def encode(self, input_acts):
        """Map activations (last dim ``d_model``) to SAE feature activations.

        A feature is kept only where its pre-activation strictly exceeds the
        feature's threshold; the kept value is the ReLU of the pre-activation.
        """
        pre_acts = torch.matmul(input_acts, self.W_enc) + self.b_enc
        above_threshold = pre_acts > self.threshold
        return above_threshold * torch.relu(pre_acts)

    def decode(self, acts):
        """Map SAE feature activations (last dim ``d_sae``) back to ``d_model``."""
        return torch.matmul(acts, self.W_dec) + self.b_dec

    def forward(self, acts):
        """Return the reconstruction of ``acts``: encode, then decode."""
        return self.decode(self.encode(acts))


# Build the SAE with dimensions taken from the tensors that are already in memory.
# Indexing the lazily loaded npz archive (params['W_enc']) re-reads the full array
# from disk on every access, which is costly for a multi-GB encoder matrix.
sae = JumpReLUSAE(*pt_params['W_enc'].shape)
# Copy the downloaded weights into the zero-initialised module parameters.
sae.load_state_dict(pt_params)
sae = sae.to(device)
print("Success")


params.npz:   0%|          | 0.00/19.3G [00:00<?, ?B/s]

Success


In [36]:
# Sanity check: the feature dimension of the encoder and of the decoder should agree.
print(f"SAE model width: Encoder width = {sae.W_enc.size(1)}, Decoder width = {sae.W_dec.size(0)}")

SAE model width: Encoder width = 1048576, Decoder width = 1048576


In [62]:
#Load CAA Vector
behavior = "sycophancy"
model_name_path = 'gemma-2-2b'
vector_path = gemma_vector_analysis.fix_vector_path(get_vector_path(behavior, layer, model_name_path))
normalized_dir = os.path.join(BASE_DIR, 'normalized_vectors', behavior)

# Choose the device before loading so the tensor can be mapped straight onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the existing normalized vector
original_path = os.path.join(normalized_dir, f"vec_layer_{layer}_{model_name_path}.pt")
original_path = gemma_vector_analysis.fix_vector_path(original_path)
# map_location keeps the CPU fallback working even when the file was saved from a
# CUDA tensor; without it torch.load tries to restore onto the original device.
caa_vector = torch.load(original_path, map_location=device)

print(f"Analyzing vector for behavior: {behavior}")
print(f"Vector shape: {caa_vector.shape}")

print(f"Using device: {device}")

# Move caa_vector to device (a no-op after map_location; kept as an explicit guarantee)
caa_vector = caa_vector.to(device)

print(f"Norm of the original vector: {caa_vector.norm().item():.4f}")
print("First 20 entries of the vector:")
print(caa_vector[:20])

Analyzing vector for behavior: sycophancy
Vector shape: torch.Size([2304])
Using device: cuda
Norm of the original vector: 42.7291
First 20 entries of the vector:
tensor([ 0.1404, -0.4736, -0.0243,  1.1708,  0.8371, -0.7364,  1.7948,  0.8291,
         1.6945,  0.3871,  0.4149, -0.8828,  1.4796, -0.9835,  0.2753,  0.6525,
        -0.8137,  1.2188, -0.5540, -0.0628], device='cuda:0')


In [63]:
#Forward CAA vector in SAE
# Inference only: call the module itself (so any registered hooks run) and disable
# autograd so no graph over the SAE's feature activations is kept alive.
with torch.no_grad():
    forwarded_caa_vector = sae(caa_vector)
print(f"Vector shape: {forwarded_caa_vector.shape}")
print(f"Norm of the forwarded vector: {forwarded_caa_vector.norm().item():.4f}")
print("First 20 entries of the vector:")
print(forwarded_caa_vector[:20])

Vector shape: torch.Size([2304])
Norm of the forwarded vector: 111.7115
First 20 entries of the vector:
tensor([-0.6929,  0.8543,  0.2417,  0.2384,  2.3269,  2.0966,  1.3398, -1.1441,
        -0.3486,  1.4361,  0.6083,  2.3244,  3.4588, -2.4025,  0.8034, -2.9251,
         0.1457, -2.1802, -6.9103,  0.4804], device='cuda:0',
       grad_fn=<SliceBackward0>)


In [61]:
#Cosine similarity after forward
import torch.nn.functional as F

# Compare the original vector with its SAE reconstruction.
# The 2D views (cosine_similarity is applied along dim=1) are temporaries: the shared
# `caa_vector` / `forwarded_caa_vector` variables are deliberately NOT rebound, so
# re-running other cells after this one still sees the original shapes.
cosine_sim = F.cosine_similarity(
    caa_vector.to(device).reshape(1, -1),
    forwarded_caa_vector.to(device).reshape(1, -1),
    dim=1,
)

# Print the result
print(f"Cosine similarity between original and forwarded vector: {cosine_sim.item():.4f}")

Cosine similarity between original and forwarded vector: 0.2163


In [None]:
# Delete the downloaded file after use
# Deletion is opt-in: flip the flag to True to remove the cached params file.
# (The previous version had an `if` whose body was only comments, which is a syntax error.)
DELETE_DOWNLOADED_PARAMS = False
if DELETE_DOWNLOADED_PARAMS and os.path.exists(path_to_params):
    os.remove(path_to_params)
    print(f"Deleted the downloaded file: {path_to_params}")