In [1]:
# TRANSFORMER / ENVIRONMENT FIX
# Make the packages installed in the project's virtualenv importable from this
# kernel. The membership check keeps the cell idempotent: re-running it does
# not append a duplicate entry to sys.path.
import sys

VENV_SITE_PACKAGES = '/root/CAA/venv/lib/python3.10/site-packages'
if VENV_SITE_PACKAGES not in sys.path:
    sys.path.append(VENV_SITE_PACKAGES)
print(sys.path)

['/opt/conda/lib/python310.zip', '/opt/conda/lib/python3.10', '/opt/conda/lib/python3.10/lib-dynload', '', '/opt/conda/lib/python3.10/site-packages', '/root/CAA/venv/lib/python3.10/site-packages']


In [2]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import time
import torch

from dotenv import load_dotenv
from behaviors import ALL_BEHAVIORS, BASE_DIR, COORDINATE, get_vector_path
from gemma_2_wrapper import Gemma2Wrapper
from generate_vectors import generate_save_vectors_for_behavior
import gemma_vector_analysis
from huggingface_hub import hf_hub_download
import numpy as np
import torch.nn as nn

In [3]:
# Call python-dotenv's loader first so that values from a local .env file are
# visible in the process environment, then look up the Hugging Face token.
# The token is None when HF_TOKEN is not set.
load_dotenv()
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")

In [4]:
# Load SAE
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
layer = 14

# Download the SAE parameter archive for the chosen layer from the Hugging Face
# Hub. force_download=False is passed so a fresh download is not forced.
sae_model_name = "google/gemma-scope-2b-pt-res"
path_to_params = hf_hub_download(
    repo_id=sae_model_name,
    filename=f"layer_{layer}/width_16k/average_l0_84/params.npz",
    force_download=False,
)

# Read the .npz archive and mirror every array as a torch tensor on `device`.
params = np.load(path_to_params)
pt_params = {
    name: torch.from_numpy(array).to(device)
    for name, array in params.items()
}


class JumpReLUSAE(nn.Module):
    """Sparse autoencoder whose encoder gates each latent with its own threshold.

    All parameters are created as zeros with the shapes listed below and are
    expected to be filled in afterwards (this file does so via
    ``load_state_dict``).

    Args:
        d_model: Size of the input (and reconstruction) dimension.
        d_sae: Number of latent units.
    """

    def __init__(self, d_model, d_sae):
        super().__init__()
        # Encoder maps d_model -> d_sae, decoder maps d_sae -> d_model.
        self.W_enc = nn.Parameter(torch.zeros(d_model, d_sae))
        self.W_dec = nn.Parameter(torch.zeros(d_sae, d_model))
        # One threshold and one encoder bias per latent; one decoder bias per
        # model dimension.
        self.threshold = nn.Parameter(torch.zeros(d_sae))
        self.b_enc = nn.Parameter(torch.zeros(d_sae))
        self.b_dec = nn.Parameter(torch.zeros(d_model))

    def encode(self, input_acts):
        """Return latent activations for ``input_acts``.

        A latent keeps ``relu(pre_activation)`` only where its pre-activation is
        strictly greater than its threshold; every other latent is zero.
        """
        pre_acts = torch.matmul(input_acts, self.W_enc) + self.b_enc
        above_threshold = pre_acts > self.threshold
        return above_threshold * pre_acts.relu()

    def decode(self, acts):
        """Map latent activations back to the model dimension."""
        return torch.matmul(acts, self.W_dec) + self.b_dec

    def forward(self, acts):
        """Encode ``acts`` and decode the result, returning the reconstruction."""
        return self.decode(self.encode(acts))


# Size the SAE from the encoder matrix (its shape is (d_model, d_sae)), copy the
# downloaded weights into it, and place the module on the selected device.
sae = JumpReLUSAE(*params['W_enc'].shape)
sae.load_state_dict(pt_params)
sae = sae.to(device)
print("Success")


params.npz:   0%|          | 0.00/302M [00:00<?, ?B/s]

Success


In [5]:
# Report the latent width as seen from the encoder (columns of W_enc) and from
# the decoder (rows of W_dec).
print(f"SAE model width: Encoder width = {sae.W_enc.size(1)}, Decoder width = {sae.W_dec.size(0)}")

SAE model width: Encoder width = 16384, Decoder width = 16384


In [6]:
# Load each behavior's normalized CAA vector, pass it through the SAE, and save
# the reconstruction to disk.
layer = 14
model_name_path = 'gemma-2-2b'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop-invariant setup: create the output directory once. exist_ok=True replaces
# the separate os.path.exists check followed by os.makedirs.
sae_vector_dir = '/root/CAA/sae_vector'
os.makedirs(sae_vector_dir, exist_ok=True)

for behavior in ALL_BEHAVIORS:
    # NOTE(review): vector_path is never used below. The call is kept only in
    # case fix_vector_path has side effects — confirm and remove if it is pure.
    vector_path = gemma_vector_analysis.fix_vector_path(get_vector_path(behavior, layer, model_name_path))
    normalized_dir = os.path.join(BASE_DIR, 'normalized_vectors', behavior)

    # Load the existing normalized vector. map_location lets a vector that was
    # saved from a GPU be loaded when only a CPU is available.
    original_path = os.path.join(normalized_dir, f"vec_layer_{layer}_{model_name_path}.pt")
    original_path = gemma_vector_analysis.fix_vector_path(original_path)
    caa_vector = torch.load(original_path, map_location=device)

    print(f"Analyzing vector for behavior: {behavior}")

    # Make sure the vector lives on the same device as the SAE.
    caa_vector = caa_vector.to(device)

    # Forward the CAA vector through the SAE. The module is called directly
    # (rather than via .forward) so registered hooks would run, and autograd is
    # disabled so the saved tensor does not carry requires_grad / a graph.
    with torch.no_grad():
        forwarded_caa_vector = sae(caa_vector)
    print(f"Norm of the forwarded vector: {forwarded_caa_vector.norm().item():.4f}")

    # Save the forwarded vector
    forwarded_vector_path = os.path.join(sae_vector_dir, f"forwarded_{behavior}.pt")
    torch.save(forwarded_caa_vector, forwarded_vector_path)
    print(f"Saved forwarded vector for behavior {behavior} at {forwarded_vector_path}")
    

Layer: 0
Analyzing vector for behavior: coordinate-other-ais
Norm of the original vector: 3.6553
Analyzing vector for behavior: corrigible-neutral-HHH
Norm of the original vector: 3.6553
Analyzing vector for behavior: hallucination
Norm of the original vector: 3.6553
Analyzing vector for behavior: myopic-reward
Norm of the original vector: 3.6553
Analyzing vector for behavior: survival-instinct
Norm of the original vector: 3.6553
Analyzing vector for behavior: sycophancy
Norm of the original vector: 3.6553
Analyzing vector for behavior: refusal
Norm of the original vector: 3.6553
Layer: 1
Analyzing vector for behavior: coordinate-other-ais
Norm of the original vector: 3.4446
Analyzing vector for behavior: corrigible-neutral-HHH
Norm of the original vector: 3.4446
Analyzing vector for behavior: hallucination
Norm of the original vector: 3.4446
Analyzing vector for behavior: myopic-reward
Norm of the original vector: 3.4446
Analyzing vector for behavior: survival-instinct
Norm of the ori

In [None]:
# Forward CAA vector in SAE
# `caa_vector` here is whatever the loading loop left behind, i.e. the vector of
# the last behavior it processed.
# The module is called directly (not via .forward) so registered hooks would
# run, and autograd is disabled because only the values are inspected.
with torch.no_grad():
    forwarded_caa_vector = sae(caa_vector)
print(f"Vector shape: {forwarded_caa_vector.shape}")
print(f"Norm of the forwarded vector: {forwarded_caa_vector.norm().item():.4f}")
print("First 20 entries of the vector:")
print(forwarded_caa_vector[:20])

In [None]:
#Cosine similarity after forward
import torch.nn.functional as F

# Work on flattened copies placed on a common device. `caa_vector` and
# `forwarded_caa_vector` themselves are left untouched, so re-running this cell
# or the cells above always sees the same shapes.
# reshape (rather than view) also works when a tensor is not contiguous.
caa_vector_row = caa_vector.to(device).reshape(1, -1)
forwarded_caa_vector_row = forwarded_caa_vector.to(device).reshape(1, -1)

# Both operands have shape (1, n); dim=1 reduces over the n elements and yields
# a single similarity value.
cosine_sim = F.cosine_similarity(caa_vector_row, forwarded_caa_vector_row, dim=1)

# Print the result
print(f"Cosine similarity between original and forwarded vector: {cosine_sim.item():.4f}")

In [None]:
# Optionally delete the downloaded file after use.
# Deletion stays disabled by default (it was commented out before); flip the
# flag to True to remove the file. The previous form of this cell had an `if`
# whose body was only comments, which Python rejects as a syntax error.
DELETE_DOWNLOADED_PARAMS = False

if DELETE_DOWNLOADED_PARAMS and os.path.exists(path_to_params):
    os.remove(path_to_params)
    print(f"Deleted the downloaded file: {path_to_params}")