In [53]:
import torch
import numpy as np
import matplotlib.pyplot as plt

device = "cuda" if torch.cuda.is_available() else "cpu"

In [54]:
from sae_lens import SAE, HookedSAETransformer
model = HookedSAETransformer.from_pretrained("gpt2-small", device=device)
sae, cfg_dict, sparsity = SAE.from_pretrained(
    release="gpt2-small-res-jb-feature-splitting",  # <- Release name
    sae_id="blocks.8.hook_resid_pre_768",  # <- SAE id (not always a hook point!)
    device=device,
)

Loaded pretrained model gpt2-small into HookedTransformer


In [55]:
# Input sentence
input_sentence = "Who are you? I am"
inputs = model.tokenizer(input_sentence, return_tensors="pt").to(device)

# Get model outputs
with torch.no_grad():
    logits, activations = model.run_with_cache(input_sentence)

print(sae.encode_standard(activations['blocks.8.hook_resid_pre']).shape)

torch.Size([1, 7, 768])


In [25]:
sae.decode(sae.encode_standard(activations['blocks.8.hook_resid_pre'])).shape

torch.Size([1, 7, 768])

In [88]:
def steering_features(value, hook):
    # BERT 生成的768vector*activation vector
    
    #encoded_activation = sae.encode(value)
    #steered_vector = steering_vector*encoded_activation 
    #value = sae.decode(steered_vector)
    return value
    

fwd_hooks=[(
        'blocks.8.hook_resid_pre', 
        steering_features
        )]
prompt = "I am a person who"

with model.hooks(fwd_hooks=fwd_hooks):
    steered_text = model.generate(
        prompt,
        max_new_tokens=10,
        temperature=0.2,
        top_p=0.9,
        stop_at_eos = True,
        
    )

print(steered_text)

AttributeError: 'FASG_Model' object has no attribute 'hooks'

In [39]:
import torch
from torch import nn
from transformers import BertModel, BertTokenizer

class BertWithDense(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", output_dim=768):
        super(BertWithDense, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dense = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):

        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = bert_output.last_hidden_state
        cls_output = last_hidden_state[:, 0, :] 
        output = self.dense(cls_output)
        return output

In [43]:
text = ["This is an example.", "BERT with a dense layer!"]
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]

output = model(input_ids, attention_mask)
print("Output shape:", output.shape)  # Should be (batch_size, 768)

        [1, 1, 1, 1, 1, 1, 1, 1]])


AttributeError: 'NoneType' object has no attribute 'shape'

In [45]:
## pseudo code

# Step 1: Use BERT to predict the 768 vector -> steering vector

# Our own model
class BertWithDense(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", output_dim=768):
        pass
    def forward(self, input_ids, attention_mask):
        pass

bert_model = BertWithDense()
#bert_model(tokenizer(x)) <--768 vector

# Step 2: Generate Text with LLM
from sae_lens import SAE, HookedSAETransformer

# model
model = HookedSAETransformer.from_pretrained("gpt2-small", device=device)

# sae
sae, cfg_dict, sparsity = SAE.from_pretrained(
    release="gpt2-small-res-jb-feature-splitting",  # <- Release name
    sae_id="blocks.8.hook_resid_pre_768",  # <- SAE id (not always a hook point!)
    device=device,
)

def steering_features(value, hook):
    # BERT 生成的768vector*activation vector
    
    #encoded_activation = sae.encode(value)
    #steered_vector = steering_vector*encoded_activation 
    #value = sae.decode(steered_vector)
    return value 

fwd_hooks=[(
        'blocks.8.hook_resid_pre', 
        steering_features
        )]


prompt = x[:10]

with model.hooks(fwd_hooks=fwd_hooks):
    steered_text = model.generate(
        prompt,
        max_new_tokens=10,
        temperature=0.2,
        top_p=0.9,
        stop_at_eos = True,
        
    )

# Step 3 -> calculate loss and gradient descent
# loss =  cosine_similarity(x,steered_text)
# with bert_model

In [128]:
import torch
from transformers import BertTokenizer,BertModel
from torch import nn

class FASG_Model(nn.Module):
    def __init__(self):
        super(FASG_Model, self).__init__()

        self.device = "cuda:1"
        self.bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bertModel = BertModel.from_pretrained("bert-base-uncased").to(self.device)
        self.linear = torch.nn.Linear(768, 768).to(self.device)
        self.llm = HookedSAETransformer.from_pretrained("gpt2-small", device=self.device)
        self.sae, cfg_dict, sparsity = SAE.from_pretrained(
            release="gpt2-small-res-jb-feature-splitting",  
            sae_id="blocks.8.hook_resid_pre_768", 
            device=self.device,
        )
        

    def forward(self, text):
        encoded_input = tokenizer(text, return_tensors='pt',padding=True, truncation=True)
        steering_vector = self.bertModel(**encoded_input).pooler_output

        def steering_features(value, hook):
            encoded_activation = self.sae.encode(value)
            print(encoded_activation.shape)
            print(steering_vector.shape)
            steered_vector = steering_vector*encoded_activation 
            decoded_vector = sae.decode(steered_vector)
            return decoded_vector
    
        fwd_hooks=[(
            'blocks.8.hook_resid_pre', 
            steering_features
        )]

        prompt = " ".join(text.split(" ")[:7])
        
        with self.llm.hooks(fwd_hooks=fwd_hooks):
            steered_tokens = self.llm.generate(
            prompt,
            max_new_tokens=64,
            temperature=0.2,
            top_p=0.9,
            stop_at_eos = True,
            return_type = "tensor"
        )
        return steered_tokens

In [129]:
sample_text = "this is a sample text"
model = FASG_Model()
result= model(sample_text)

OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacity of 47.49 GiB of which 27.56 MiB is free. Process 1549599 has 5.65 GiB memory in use. Process 1880738 has 2.56 GiB memory in use. Process 2173725 has 3.12 GiB memory in use. Process 2169810 has 14.94 GiB memory in use. Process 2926054 has 2.60 GiB memory in use. Process 4116386 has 9.22 GiB memory in use. Process 13956 has 424.00 MiB memory in use. Including non-PyTorch memory, this process has 8.82 GiB memory in use. Of the allocated memory 7.87 GiB is allocated by PyTorch, and 464.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [118]:
torch.cuda.devices[1]

AttributeError: module 'torch.cuda' has no attribute 'devices'