In [32]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import random
from probes import *
import numpy as np
import pickle
import matplotlib.pyplot as plt
torch.set_printoptions(threshold=20)

In [7]:
from tqdm.notebook import tqdm

In [3]:
with open('prompt_output.json', 'r') as f:
    prompts_data = json.load(f)

In [4]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32, device_map="auto")

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.73s/it]
Some parameters are on the meta device because they were offloaded to the disk.


In [5]:
batch_size = 8

In [None]:
# debug
strings = []
for demo_dict in prompts_data:
    for k, v in demo_dict.items():
        question_convos = [
                    [{"role": "user", "content": current_convo}]
                    for current_convo in v
                ]
        strings += question_convos
    


In [40]:
# for ques in strings[0:5]:
#     print( 
#         tokenizer.apply_chat_template(
#     ques,
#     add_generation_prompt=True,
#     return_tensors="pt" # Return the input as PyTorch tensors
#     )
#     )
len(answer)

1

In [44]:
# Get model responses - unsteered
# ~ 7 mins per question (v slow, see how to speed this up)
model_responses = []
for ques in strings[0:1]:
    tokenized_chat = tokenizer.apply_chat_template(
    ques,
    add_generation_prompt=True,
    return_tensors="pt" # Return the input as PyTorch tensors
    )
    with torch.no_grad():
        output_ids = model.generate(
            tokenized_chat,
            do_sample=False,
            max_new_tokens=200,
        )
    prompt_text = tokenizer.decode(tokenized_chat[0], skip_special_tokens=True)
    full_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Get only the new text generated after the prompt
    generated_part = full_output[len(prompt_text):].strip()
    model_responses.append(generated_part)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [None]:
# Compare with model() instead of model.generate()
# https://discuss.huggingface.co/t/generate-without-using-the-generate-method/11379
# ~ 7 mins per question (v slow, see how to speed this up)
model_responses = []
for ques in strings[0:1]:
    input_ids = tokenizer.apply_chat_template(
    ques,
    add_generation_prompt=True,
    return_tensors="pt" # Return the input as PyTorch tensors
    )#.input_ids()
    decoder_input_ids = [model.config.decoder_start_token_id]
    
    # predicted_ids = []
    # print(decoder_input_ids)
    # print(type(decoder_input_ids))

    # for i in range(20): 
    #     outputs = model(input_ids=input_ids, decoder_input_ids=torch.tensor([decoder_input_ids]))
    #     logits = outputs.logits[:,i,:]
    #     # perform argmax on the last dimension (i.e. greedy decoding)
    #     predicted_id = logits.argmax(-1)
    #     predicted_ids.append(predicted_id.item())
    #     print(tokenizer.decode([predicted_id.squeeze()]))
    #     # add predicted id to decoder_input_ids


[None]
<class 'list'>


RuntimeError: Could not infer dtype of NoneType

#### Calculate steering vector

In [56]:
with open(f'probe_pickles/model0.pkl','rb') as f:
    x = pickle.load(f)

In [58]:
probes = {}
n_layers = 29
for i in range(n_layers):
    # probe_pickles/model0.pkl
    # print(i)
    with open(f'probe_pickles/model{i}.pkl','rb') as f:
        x = pickle.load(f)
        probes[i] = x

In [61]:
W = probes[28].coef_

In [None]:
#####################
def make_steering_hook(w, alpha=1.0):
    def hook_fn(module, input, output):
        # output shape: [batch, seq_len, hidden_dim]
        return output + alpha * w.to(output.device)
    return hook_fn
# from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

layer_to_steer = 20  # pick the layer where your probe was trained
alpha = 3.0          # steering strength

hook = model.model.layers[layer_to_steer].register_forward_hook(make_steering_hook(w, alpha))
prompt = "The person described in the passage is"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

hook.remove()  # clean up after steering

In [45]:
generated_part

"Hi Jin, I'd be happy to help you find some classes at the University of Queensland that might interest you.\n\nBased on your preferences, here are some class suggestions:\n\n**Engineering-related classes:**\n\n1. Engineering Management: This class will help you understand the business side of engineering and how to apply engineering principles to real-world problems.\n2. Operations Research: This class will teach you how to use mathematical and analytical techniques to optimize business processes and solve complex problems.\n3. Entrepreneurship: This class will help you develop the skills and knowledge needed to start and run your own business.\n\n**Economics-related classes:**\n\n1. Microeconomics: This class will introduce you to the fundamental concepts of microeconomics, including supply and demand, market structures, and consumer behavior.\n2. Macroeconomics: This class will cover the broader topics of macroeconomics, including economic growth, inflation, and international trade.

### Use probes created in probes.ipynb to steer the model

In [None]:
# from https://github.com/Veranep/implicit-personalization-stereotypes/blob/main/mitigate.py#L48
def optimize_one_inter_rep(
    inter_rep, layer_name, target, probe, mult, normalized=False
):
    """Add probe weights to model's internal representations"""
    global first_time
    tensor = (inter_rep.clone()).to("cuda").requires_grad_(True)
    rep_f = lambda: tensor
    probe_weights = torch.from_numpy(probe.coef_[target]).to("cuda")

    if normalized:
        cur_input_tensor = (
            rep_f() + probe_weights * mult * 100 / rep_f().norm()
        )
    else:
        cur_input_tensor = rep_f() + probe_weights * mult
    return cur_input_tensor.clone()

def edit_inter_rep_multi_layers(output, layer_name):
    """Apply probe weight adding to all relevant layers of the model"""
    layer_num = int(
        layer_name[layer_name.rfind("model.layers.") + len("model.layers.") :]
    )
    probe = probes_dict[layer_num]
    cloned_inter_rep = (
        output[0][:, -1].unsqueeze(0).detach().clone().to(torch.float)
    )
    with torch.enable_grad():
        cloned_inter_rep = optimize_one_inter_rep(
            cloned_inter_rep,
            layer_name,
            cf_target,
            probe,
            mult=mult,
            normalized=False,
        )
    output[0][:, -1] = cloned_inter_rep.to(torch.float16)
    return output

In [2]:
mod_list = []
for i in range(29):
    # load
    with open(f'probe_pickles/model{i}.pkl', 'rb') as f:
        mod_list.append(pickle.load(f))

In [None]:
# from https://github.com/Veranep/implicit-personalization-stereotypes/blob/main/eval_conversations.py#L299

def modified_model(
    model,
    probes,
    modified_layer_names,
    demographic,
    value,
    batch_size,
    question_convos,
    N,
):
    """Apply steering towards demographic group to model and generate response"""
    global probes_dict
    probes_dict = probes
    global mult
    mult = N
    global cf_target
    cf_target = probe_targets[demographic][value]
    with TraceDict(
        model.model,
        modified_layer_names,
        edit_output=edit_inter_rep_multi_layers,
    ) as ret:
        model_answer = [
            answer[0]["generated_text"][-1]["content"]
            for answer in tqdm(
                model(
                    question_convos,
                    batch_size=batch_size,
                    do_sample=False,
                    max_new_tokens=100,
                ),
                total=len(question_convos),
            )
        ]
    return model_answer

In [None]:
# from https://github.com/Veranep/implicit-personalization-stereotypes/blob/main/eval_conversations.py#L299
new_answers = modified_model(
                model,
                probes,
                modified_layer_names,
                demographic,
                target,
                batch_size,
                question_convos,
                N,
            )
return_convos[i]["mod_indirect_question"][k] = new_answers