In [5]:
import os
import sys
sys.path.append('root/TalkTuner-chatbot-llm-dashboard/src')
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm.auto import tqdm

from collections import OrderedDict

from src.dataset import llama_v2_prompt
import numpy as np



# Device string used throughout the notebook. Both names are kept because the
# intervention code below reads `torch_device`.
device = torch_device = "cuda"

## Instantiate the model

In [6]:
# Read the Hugging Face access token from the environment rather than embedding
# a secret in the notebook. When HF_TOKEN is unset this is None, in which case
# `from_pretrained` is expected to fall back to locally cached credentials
# (e.g. from `huggingface-cli login`).
# NOTE(review): a token was previously committed in this cell; it should be
# revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN")

MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"

# Left padding keeps the final position of every sequence in a batch on a real
# token, which the intervention hook relies on when it edits position -1.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=access_token, padding_side='left')
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=access_token)
# Convert to half precision, move to the GPU, and switch to inference mode.
model.half().cuda();
model.eval();

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Load the probe weights here

In [7]:
import torch
from src.probes import LinearProbeClassification

from src.intervention_utils import return_classifier_dict

# --- Probe checkpoint configuration -----------------------------------------
classifier_type = LinearProbeClassification
classifier_directory = "probe_checkpoints/controlling_probe"

# Options forwarded to return_classifier_dict.
# NOTE(review): chosen_layer=None presumably loads the probes of every layer —
# confirm against src.intervention_utils.
layer_num = None
mix_scaler = False
logistic = True
sklearn = False

# Flags that this cell defines but does not pass to the loader.
return_user_msg_last_act = True
include_inst = True
residual_stream = True

# The result is later indexed as classifier_dict[attribute][layer_index].
classifier_dict = return_classifier_dict(
    classifier_directory,
    classifier_type,
    chosen_layer=layer_num,
    mix_scaler=mix_scaler,
    logistic=logistic,
    sklearn=sklearn,
)

  model.load_state_dict(torch.load(weight_path))


## Batched Intervention code

In [8]:
from baukit import TraceDict
from torch import nn


# Batched generation pads the prompts, so the tokenizer needs a pad token.
# Register "<pad>" as the pad token when the vocabulary does not contain it yet.
vocabulary = tokenizer.get_vocab()
if '<pad>' not in vocabulary:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})

# Keep the embedding matrix and the model config in sync with the tokenizer.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)
model.config.pad_token_id = tokenizer.pad_token_id
assert model.config.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"

# True: hook whole decoder layers ("model.layers.N");
# False: hook their ".mlp" sub-modules instead.
residual = True
def optimize_one_inter_rep(inter_rep, layer_name, target, probe,
                           N=4, normalized=False):
    """Shift an activation along the probe direction selected by ``target``.

    The shift is ``target @ probe.proj[0].weight`` scaled by ``N``; no
    iterative optimisation is performed despite the function's name.

    Args:
        inter_rep: Activation tensor to edit. It is cloned, so the caller's
            tensor is not modified.
        layer_name: Name of the hooked layer. Unused; kept for interface
            compatibility.
        target: Counterfactual target; flattened to a single row before the
            matrix product.
            # assumes one entry per probe class — TODO confirm
        probe: Probe exposing its linear weights as ``probe.proj[0].weight``.
        N: Intervention strength (multiplier on the probe direction).
        normalized: When True, the shift is additionally scaled by
            ``100 / ||inter_rep||``.

    Returns:
        A new tensor on ``torch_device`` holding the shifted activation.
    """
    rep = inter_rep.clone().to(torch_device).requires_grad_(True)
    target_row = target.clone().to(torch_device).to(torch.float).view(1, -1)

    # Row vector of target weights times the probe weight matrix gives the
    # direction to add to the representation.
    direction = target_row @ probe.proj[0].weight

    if normalized:
        shifted = rep + direction * N * 100 / rep.norm()
    else:
        shifted = rep + direction * N
    return shifted.clone()


def edit_inter_rep_multi_layers(output, layer_name):
    """``edit_output`` hook for TraceDict: steer the last-position activation.

    Reads the notebook globals ``residual``, ``classifier_dict``,
    ``attribute``, ``cf_target``, ``N`` and ``normalized``.

    Args:
        output: Output of the hooked module; ``output[0]`` is edited in place
            at sequence position -1.
        layer_name: Name of the hooked module, e.g. ``"model.layers.20"``
            (or ``"model.layers.20.mlp"`` when ``residual`` is False).

    Returns:
        The same ``output`` object, after the in-place edit.
    """
    prefix = "model.layers."
    start = layer_name.rfind(prefix) + len(prefix)
    if residual:
        layer_idx = int(layer_name[start:])
    else:
        layer_idx = int(layer_name[start:layer_name.rfind(".mlp")])

    # Probes are looked up at layer index + 1.
    # NOTE(review): presumably because probe index 0 belongs to the embedding
    # output — confirm against the probe training code.
    probe = classifier_dict[attribute][layer_idx + 1]

    hidden = output[0]
    # Work on a detached float32 copy of the last position.
    last_rep = hidden[:, -1].unsqueeze(0).detach().clone().to(torch.float)
    with torch.enable_grad():
        edited = optimize_one_inter_rep(last_rep, layer_name,
                                        cf_target, probe,
                                        N=N,
                                        # Honour the notebook-level hyperparameter
                                        # instead of a hardcoded False.
                                        normalized=normalized)
    # Cast back to whatever dtype the hooked tensor uses rather than assuming
    # float16.
    hidden[:, -1] = edited.to(hidden.dtype)
    return output


def collect_responses_batched(prompts, modified_layer_names, edit_function, batch_size=5, rand=None):
    """Generate one response per prompt while ``edit_function`` hooks the model.

    Args:
        prompts: List of user messages (plain strings).
        modified_layer_names: Names of the modules whose outputs are edited.
        edit_function: Callable passed to TraceDict as ``edit_output``.
        batch_size: Number of prompts generated together.
        rand: Unused; kept for interface compatibility.

    Returns:
        List of response strings, in the same order as ``prompts``. Each is
        the decoded text following the first ``[/INST]`` marker.
    """
    print(modified_layer_names)
    responses = []
    for start in tqdm(range(0, len(prompts), batch_size)):
        batch = prompts[start:start + batch_size]

        # Wrap every prompt as a single-turn conversation and render it into
        # the Llama-2 chat prompt string.
        formatted_prompts = [
            llama_v2_prompt([{"role": "user", "content": prompt}])
            for prompt in batch
        ]

        with TraceDict(model, modified_layer_names, edit_output=edit_function):
            with torch.no_grad():
                # Put the inputs on the model's own device instead of a
                # hardcoded 'cuda'.
                inputs = tokenizer(formatted_prompts, return_tensors='pt', padding=True).to(model.device)
                # NOTE(review): with do_sample=False decoding is greedy, so
                # temperature/top_p are presumably ignored (transformers may
                # warn about them) — confirm against the installed version.
                tokens = model.generate(**inputs,
                                        max_new_tokens=768,
                                        do_sample=False,
                                        temperature=generation_temperature,
                                        top_p=generation_top_p,
                                       )

        # maxsplit=1 keeps the whole response even when the generated text
        # itself contains "[/INST]"; an unbounded split would truncate it.
        responses.extend(
            tokenizer.decode(seq, skip_special_tokens=True).split('[/INST]', 1)[1]
            for seq in tokens
        )

    return responses

## Hyperparameters

In [9]:
normalized = False
# Sampling hyperparameters
generation_temperature = 0
generation_top_p = 1

N = 8  # Intervention strength

from_idx = 20  # First layer to intervene on (inclusive)
to_idx = 30    # Last layer to intervene on (exclusive)
residual = True  # True: hook decoder layers; False: hook their ".mlp" sub-modules

# Collect the names of the modules to hook. A name only qualifies when what
# follows "model.layers." is a bare layer index (or "<index>.mlp" when
# residual is False), so unrelated modules that merely end in a digit are
# skipped. A local name is used for the parsed index so the `layer_num`
# global used for probe loading is not overwritten.
layer_prefix = "model.layers."
which_layers = []  # Which layer/s to intervene
for name, _ in model.named_modules():
    _, found, tail = name.rpartition(layer_prefix)
    if not found:
        continue
    if residual:
        layer_id = tail
    elif tail.endswith(".mlp"):
        layer_id = tail[:-len(".mlp")]
    else:
        continue
    if layer_id.isdigit() and from_idx <= int(layer_id) < to_idx:
        which_layers.append(name)
modified_layer_names = which_layers

attribute = "gender"  # which attribute to intervene

## Example on gender

In [10]:
# Let's use gender as an example.
# Gender has two subcategories, so the counterfactual target has length 2:
#   index 0 -> intervene on the male attribute
#   index 1 -> intervene on the female attribute
# Here we intervene on the female attribute, so index 1 is set to 1.
# The target is built directly as a (1, 2) float tensor so that `cf_target`
# is never a plain list.
cf_target = torch.tensor([[0.0, 1.0]])

# and we want the strength to be 8
N = 8

# We modify layers 20 to 29 (from_idx is inclusive, to_idx is exclusive)
modified_layer_names = which_layers

batch_size = 2
# and we have an array of questions
questions = ["Can you give me some outfits suggestions? I am going to attend my friend's birthday party tonight", 
             "What birthday gifts should I bring to my friends?",]
results = collect_responses_batched(questions, modified_layer_names, edit_inter_rep_multi_layers, batch_size=batch_size, rand=None)

['model.layers.20', 'model.layers.21', 'model.layers.22', 'model.layers.23', 'model.layers.24', 'model.layers.25', 'model.layers.26', 'model.layers.27', 'model.layers.28', 'model.layers.29']


  0%|          | 0/1 [00:00<?, ?it/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [11]:
# Print every generated response, numbered from 1, followed by a separator rule.
separator = "=" * 40
for number, response in enumerate(results, start=1):
    print(f"Response {number}:\n{response}\n{separator}")

Response 1:
  Of course, I'd be happy to help! I would love to suggest some outfit ideas for you to attend your friend's birthday party tonight.

Before I begin, may I know a few more details about the party? What is the theme of the party, if any? What is the venue like? What is the weather like today? Knowing these details will help me suggest outfits that are appropriate and suitable for the occasion.

Also, please let me know what type of style you prefer? Do you like dresses, skirts, tops, pants, or a combination of them? Do you have any specific colors in mind?

Once I have more information, I can provide you with some outfit suggestions that are both stylish and appropriate for the party.
Response 2:
  Hello! I'm so happy to help you find the perfect birthday gifts for your friends! 🎉🎂 To ensure that my responses are socially unbiased and positive, I would like to point out that it's important to consider the individual preferences and needs of each friend, rather than making as

### Print out the results

In [12]:
# Show each question next to the response generated under intervention.
divider = "-" * 50
for i, question in enumerate(questions):
    print(
        f"USER: {question}\n\n"
        f"{divider}\n"
        "Intervened:\n"
        f"CHATBOT: {results[i]}"
        f"\n\n{divider}\n"
    )

USER: Can you give me some outfits suggestions? I am going to attend my friend's birthday party tonight

--------------------------------------------------
Intervened:
CHATBOT:   Of course, I'd be happy to help! I would love to suggest some outfit ideas for you to attend your friend's birthday party tonight.

Before I begin, may I know a few more details about the party? What is the theme of the party, if any? What is the venue like? What is the weather like today? Knowing these details will help me suggest outfits that are appropriate and suitable for the occasion.

Also, please let me know what type of style you prefer? Do you like dresses, skirts, tops, pants, or a combination of them? Do you have any specific colors in mind?

Once I have more information, I can provide you with some outfit suggestions that are both stylish and appropriate for the party.

--------------------------------------------------

USER: What birthday gifts should I bring to my friends?

--------------------

### Translated 

In [13]:
from src.translator import ConversationTranslator


# English prompts that are translated to Spanish before generation.
questions = [
    "Can you give me some outfits suggestions? I am going to attend my friend's birthday party tonight. You must respond in spanish",
    "What birthday gifts should I bring to my friends? ",
]

# Translate one prompt at a time; a new translator is built for every prompt.
spanish_questions = []
for question in questions:
    translator = ConversationTranslator('es')
    spanish_questions.append(translator.translate_text(question))
print(spanish_questions)

# Same intervention settings as before, now applied to the Spanish prompts.
results = collect_responses_batched(
    spanish_questions,
    modified_layer_names,
    edit_inter_rep_multi_layers,
    batch_size=batch_size,
    rand=None,
)

['¿Puedes darme algunas sugerencias de atuendos? Voy a asistir a la fiesta de cumpleaños de mi amigo esta noche. Debes responder en español', '¿Qué regalos de cumpleaños debo traer a mis amigos?']
['model.layers.20', 'model.layers.21', 'model.layers.22', 'model.layers.23', 'model.layers.24', 'model.layers.25', 'model.layers.26', 'model.layers.27', 'model.layers.28', 'model.layers.29']


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Show each Spanish prompt together with its intervened response.
rule = "-" * 50
for position, prompt in enumerate(spanish_questions):
    parts = [
        f"USER: {prompt}\n\n",
        rule + "\n",
        "Intervened:\n",
        f"CHATBOT: {results[position]}",
        "\n\n" + rule + "\n",
    ]
    print("".join(parts))

USER: ¿Puedes darme algunas sugerencias de atuendos? Voy a asistir a la fiesta de cumpleaños de mi amigo esta noche. Debes responder en español

--------------------------------------------------
Intervened:
CHATBOT:   ¡Clarisima! 💃🏻❤️ As a helpful and respectful assistant, I would love to help you find the perfect outfit for your friend's birthday party. Here are some suggestions for you:

1. Dress code: Firstly, check the dress code for the party, if there is one. This will give you an idea of what type of outfit to wear. If there is no dress code, you can still dress according to the season and the theme of the party, if there is one.
2. Comfortable and stylish: Choose an outfit that is both comfortable and stylish. You want to feel confident and beautiful, but also be able to dance and have fun without feeling restricted.
3. Colors: Consider wearing colors that are appropriate for the party theme. For example, if the party is a summer party, you could wear pastel colors or bright c

#### Normal Output without intervention

In [14]:
# Baseline: the same prompts and decoding settings as the intervened run, but
# with no TraceDict hook installed, so the activations are left untouched.
responses = []
for i in tqdm(range(0, len(spanish_questions), batch_size)):
    batch_prompts = spanish_questions[i:i + batch_size]

    # Wrap every prompt as a single-turn conversation and render it into the
    # Llama-2 chat prompt string.
    formatted_prompts = [
        llama_v2_prompt([{"role": "user", "content": prompt}])
        for prompt in batch_prompts
    ]

    with torch.no_grad():
        # Put the inputs on the model's own device instead of a hardcoded 'cuda'.
        inputs = tokenizer(formatted_prompts, return_tensors='pt', padding=True).to(model.device)
        # NOTE(review): with do_sample=False decoding is greedy, so
        # temperature/top_p are presumably ignored — confirm against the
        # installed transformers version.
        tokens = model.generate(**inputs,
                                max_new_tokens=768,
                                do_sample=False,
                                temperature=generation_temperature,
                                top_p=generation_top_p,
                               )

    # maxsplit=1 keeps the whole response even when the generated text itself
    # contains "[/INST]"; an unbounded split would truncate it.
    responses.extend(
        tokenizer.decode(seq, skip_special_tokens=True).split('[/INST]', 1)[1]
        for seq in tokens
    )





  0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
# Print the baseline (un-intervened) responses. The label says so explicitly,
# so this output cannot be mistaken for the intervened run printed earlier.
for i in range(len(spanish_questions)):
    text = f"USER: {spanish_questions[i]}\n\n"
    text += "-" * 50 + "\n"
    text += "Baseline (no intervention):\n"
    text += f"CHATBOT: {responses[i]}"
    text += "\n\n" + "-" * 50 + "\n"

    print(text)

USER: ¿Puedes darme algunas sugerencias de atuendos? Voy a asistir a la fiesta de cumpleaños de mi amigo esta noche

--------------------------------------------------
Intervened:
CHATBOT:   ¡Hola! ¡Claro que sí! Me encantaría ayudarte a elegir un atuendo adecuado para la fiesta de cumpleaños de tu amigo.

Para empezar, es importante tener en cuenta el tema de la fiesta y el ambiente en el que se celebrará. ¿Sabes si es una fiesta formal o informal? ¿Se celebrará en una discoteca, un parque o en una casa? Esto ayudará a determinar el tipo de atuendo que debes elegir.

Aquí te dejo algunas sugerencias de atuendos que podrían ser adecuados para una fiesta de cumpleaños:

1. Un traje y una camisa elegante: Un traje y una camisa limpios y bien planchados son un atuendo clásico y elegante que siempre funciona bien. Puedes elegir entre diferentes estilos de trajes, como un traje negro, gris o azul, y combinarlo con una camisa blanca o de un color pastel.
2. Un conjunto de pantalones y camisa