In [1]:
from baukit import Trace
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# get the steering vector
layer_id = 5
#module = model.layers[layer_id]

In [41]:
# define the activation steering hook
def act_add(steering_vec):
    def hook(module, input, output):
        #print(len(output))
        #print(output[0][0][0].shape)
        #print(len(output[0]))
        output[0][0][-1] += steering_vec
        return output
    return hook

# define a hook function that caches activations
def cache_hook(cache):
	def hook(module, input, output):
		cache.append(output)
	return hook

In [181]:
class model_layer_wrapper:
    def __init__(self, model='gpt2', device=0):
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(
            model,
            pad_token="[PAD]",
            padding_side='left')
        self.model = AutoModelForCausalLM.from_pretrained(
                model).to(self.device)
        #self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.model_name = model
         
    def get_layer_next_output(self, layer_index):
        if 'llama' in self.model_name:
            return self.model.model.layers[layer_index]
        elif 'gpt2' in self.model_name:
            return self.model.transformer.h[layer_index]
        elif 'opt' in self.model_name:
            return self.model.model.decoder.layers[layer_index]
        
    def get_activation(self, input_string, layer_index):
        module = self.get_layer_next_output(layer_index)
        cache = []
        handle = module.register_forward_hook(cache_hook(cache))
        #with Trace(module) as ret:
        #    _ = self.model(torch.tensor(self.tokenizer.encode(input_string)).unsqueeze(0).to(self.device))
        #    answer = ret.output
        _ = self.model(torch.tensor(self.tokenizer.encode(input_string)).unsqueeze(0).to(self.device))
        result = cache[0]
        handle.remove()
        return result
    
    def generate_with_steering(self, test_inputs, layer_index, steering_vec):
        module = self.get_layer_next_output(layer_index)
        cache = []
        handle = module.register_forward_hook(act_add(steering_vec))
        #with Trace(module, edit_output=act_add(steering_vec)) as _:
        test_encoding = self.tokenizer(test_inputs, padding=True, return_tensors = 'pt').to(self.device)
        sample_outputs = self.model.generate(**test_encoding, pad_token_id=self.tokenizer.pad_token_id,do_sample=True, max_new_tokens=20, top_k=10, num_return_sequences=1)
        ans = self.tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
        handle.remove() 
        return ans

In [8]:
import json
filename = '/data/jianyu/secretlanguage/rl-prompt/examples/steer_vector/tst_paired_50.jsonl'
with open(filename, "r") as f:
    data_sentiment = list(f)#json.load(f)
data_sentiment = [json.loads(_) for _ in data_sentiment]

In [None]:
whole_sentences[0]

In [9]:
whole_sentences = []
pos_sentences = []
neg_sentences = []
for element in data_sentiment:
    whole_sentences.append(element['neg'])
    neg_sentences.append(element['neg'])
    
    pos_sentences.append(element['pos'])
    whole_sentences.append(element['pos'])

In [182]:
model = model_layer_wrapper('gpt2', 3)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [114]:
def template_form(template):
    return f"\"{template}\""

In [121]:
temp0 = model.get_activation(whole_sentences[0], 7)[0][0, -1,:] - model.get_activation(whole_sentences[1], 7)[0][0, -1,:] #[0].shape

In [122]:
temp1 = model.get_activation(whole_sentences[2], 7)[0][0, -1,:] - model.get_activation(whole_sentences[3], 7)[0][0, -1,:] #[0].shape

In [123]:
temp2 = model.get_activation(whole_sentences[4], 7)[0][0, -1,:] - model.get_activation(whole_sentences[5], 7)[0][0, -1,:] #[0].shape

In [124]:
temp3 = model.get_activation(whole_sentences[6], 7)[0][0, -1,:] - model.get_activation(whole_sentences[7], 7)[0][0, -1,:] #[0].shape

In [125]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
cos(temp0.unsqueeze(0), temp1.unsqueeze(0))

tensor([0.3588], device='cuda:3', grad_fn=<SumBackward1>)

In [126]:
cos(temp0.unsqueeze(0), temp2.unsqueeze(0))

tensor([0.3312], device='cuda:3', grad_fn=<SumBackward1>)

In [17]:
model.model.transformer.h[0]#(whole_sentences[1]).output

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [18]:
model.get_activation(whole_sentences[0], 8)[1][0][0].shape#[2].shape

torch.Size([12, 15, 64])

In [19]:
len(model.tokenizer.tokenize(whole_sentences[0]))

15

In [128]:
def steering_vector_calculate(model, input_string_pairs, layer_index):
    # [a, b]
    vec0 = model.get_activation(input_string_pairs[0], layer_index)[0][0, -1,:]
    vec1 = model.get_activation(input_string_pairs[1], layer_index)[0][0, -1,:]
    
    return vec0, vec1

In [129]:
vecs = []
vec1s = []
vec0s = []
for i in range(len(neg_sentences)):
    vec0, vec1 = steering_vector_calculate(model, [neg_sentences[i], pos_sentences[i]], 7)
    vec0s.append(vec0)
    vec1s.append(vec1)

In [130]:
final_vec = torch.mean(torch.stack(vec1s), 0) - torch.mean(torch.stack(vec0s), 0)

In [131]:
final_vec.shape

torch.Size([768])

In [132]:
final = []
for i in range(len(vecs)):
    for j in range(i+1, len(vecs)):
        final.append(cos(vecs[i].unsqueeze(0), vecs[j].unsqueeze(0)))

In [133]:
final

[]

In [192]:
CUDA_LAUNCH_BLOCKING = "1"
elements = []
for _ in range(32):
    elements.append(model.generate_with_steering(neg_sentences[11], 7, 1.2*final_vec)[0].replace("\""+neg_sentences[11]+"\" \"", ""))

In [193]:
neg_sentences[44]

"you won't find a better selection in scottsdale."

In [194]:
elements

["the charge did include miso soup and a small salad.\n\nThe couple is now planning an evening of food and drinks with the mayor's daughter, daughter",
 'the charge did include miso soup and a small salad. I also enjoyed the grilled chicken, and there was a nice little bit of a cheese dip on the',
 'the charge did include miso soup and a small salad.\n\n"It was very pleasant. We were talking for a short while about this and the future',
 'the charge did include miso soup and a small salad.\n\n"The whole thing was just kind of fun," he said.',
 'the charge did include miso soup and a small salad. The meal was good enough for me. It was a good meal too and it also included some other',
 "the charge did include miso soup and a small salad. It's not like we're the only ones who are eating at a restaurant with the idea of having",
 'the charge did include miso soup and a small salad.\n\nIt was not a very big affair.\n\nThe food was delicious, especially in the',
 'the charge did include mis

In [105]:
elements

["I am just making the point that you'll always have the perfect place, and I'm sure you",
 'You have no problems with that." "I know what you mean," says the woman who runs the',
 "The best thing about the town is that you're not a scotch or a kid with no",
 'There are plenty of scotties there, too."\n\nThe two of them, with their',
 'It is also easy to spot the best students in the city in any given year."\n\n"',
 'The best are the ones that get to the point where they don\'t need you."\n\n"',
 "You'll find a great place for a family to live as well as a business with a good location",
 'A lot of them are just good people, but not as good as some of the kids in my',
 'You\'re not wrong." "The kids are doing a great job." "I\'ve seen a lot',
 'You\'ll just be able to have a great deal of conversation with the locals."\n\nAs the',
 'You will find a few that are easy to keep in the house."\n\nI am the most',
 'I\'m sure you\'ll find a lot of people who can get you all the way in," says'

In [None]:
neg_sentences[0:3]

In [85]:
pos_sentences[0:3]

["ever since joes has changed hands it's gotten better and better.",
 'there is so much room in that part of the venue.',
 "it didn't taste watered down at all."]