In [2]:
#%%
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

import pandas as pd
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download
from tqdm import tqdm
import seaborn as sns

In [3]:
def get_reward_model(cache_dir='/scratch/henrypapadatos'):
    """Build the reward model and load the Starling-RM-7B-alpha checkpoint into it.

    The model is a Llama-2-7b-chat causal LM wrapped with a single-output linear
    head (``v_head``). The checkpoint is fetched with ``snapshot_download`` and
    loaded with ``strict=False``, so checkpoint keys that do not match the wrapper
    are ignored rather than raising.

    Args:
        cache_dir: Directory passed as ``cache_dir`` when downloading the base
            model and the checkpoint snapshot. Defaults to the path the notebook
            originally hard-coded.

    Returns:
        tuple: ``(reward_model, reward_tokenizer)``. The model is in eval mode
        with gradients disabled; the tokenizer truncates from the left.

    Raises:
        FileNotFoundError: If the downloaded snapshot contains no ``.pt`` or
            ``model.bin`` checkpoint file.
    """

    class GPTRewardModel(nn.Module):
        """Causal LM backbone plus a linear head producing one scalar per token."""

        def __init__(self, model_path):
            super().__init__()
            model = AutoModelForCausalLM.from_pretrained(model_path, cache_dir=cache_dir, device_map="auto")
            self.config = model.config
            # Expose the hidden size under the `n_embd` name regardless of which attribute the config uses.
            self.config.n_embd = self.config.hidden_size if hasattr(self.config, "hidden_size") else self.config.n_embd
            # Hidden states are read from the transformer output by get_activation.
            self.config.output_hidden_states = True
            self.model = model
            self.transformer = model.model
            self.v_head = nn.Linear(self.config.n_embd, 1, bias=False).to(self.model.device)
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            # No dedicated pad token is configured here, so the unk token is reused for padding.
            self.tokenizer.pad_token = self.tokenizer.unk_token
            # First id produced by tokenizing the pad token string.
            self.PAD_ID = self.tokenizer(self.tokenizer.pad_token)["input_ids"][0]

        def get_device(self):
            """Return the device reported by the wrapped causal LM."""
            return self.model.device

        def forward(
            self,
            input_ids=None,
            past_key_values=None,
            attention_mask=None,
            position_ids=None,
        ):
            """
            input_ids, attention_mask: torch.Size([bs, seq_len])
            return: scores: List[bs]

            Each score is the head output at the position just before the first
            PAD_ID in that sequence (or at the last position if there is none).
            """
            bs = input_ids.shape[0]
            transformer_outputs = self.transformer(
                input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                position_ids=position_ids,
            )
            hidden_states = transformer_outputs[0]
            scores = []
            rewards = self.v_head(hidden_states).squeeze(-1)
            for i in range(bs):
                c_inds = (input_ids[i] == self.PAD_ID).nonzero()
                c_ind = c_inds[0].item() if len(c_inds) > 0 else input_ids.shape[1]
                # NOTE(review): if PAD_ID occurs at position 0, c_ind - 1 is -1 and
                # this reads the last position of the sequence.
                scores.append(rewards[i, c_ind - 1])
            return scores

    ## Load the model and tokenizer
    reward_model = GPTRewardModel("meta-llama/Llama-2-7b-chat-hf")
    reward_tokenizer = reward_model.tokenizer
    reward_tokenizer.truncation_side = "left"

    directory = snapshot_download("berkeley-nest/Starling-RM-7B-alpha", cache_dir=cache_dir)

    # Sort the listing so the same file is chosen on every run if several match.
    checkpoint = None
    for fpath in sorted(os.listdir(directory)):
        if fpath.endswith((".pt", "model.bin")):
            checkpoint = os.path.join(directory, fpath)
            break
    if checkpoint is None:
        raise FileNotFoundError(
            f"No '.pt' or 'model.bin' checkpoint file found in {directory}"
        )

    # Deserialize on the CPU so the checkpoint is not materialised on the GPU next to
    # the already-loaded model; load_state_dict copies each tensor onto the device of
    # the parameter it fills.
    # NOTE(review): torch.load unpickles the file - only use checkpoints from a trusted source.
    state_dict = torch.load(checkpoint, map_location="cpu")
    reward_model.load_state_dict(state_dict, strict=False)
    reward_model.eval().requires_grad_(False)

    return reward_model, reward_tokenizer

# Instantiate the reward model and tokenizer once; the cells below reuse both objects.
reward_model, reward_tokenizer = get_reward_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

In [4]:
def make_prompt(question: str, answer:str):
    """Wrap a question/answer pair in [INST] ... [/INST] chat markers.

    Every occurrence of 'Answer:' is removed from the question first; the
    answer is appended after the closing marker.
    """
    cleaned_question = question.replace('Answer:', '')
    pieces = ('[INST] ', cleaned_question, '\n[/INST] ', answer)
    return ''.join(pieces)

In [5]:
def get_activation(prompt, reward_model, reward_tokenizer, max_length=2048):
    """Return the hidden states of the answer tokens of ``prompt``.

    The prompt is tokenized (truncated and padded to ``max_length``), passed
    through ``reward_model.transformer``, and each returned hidden-state tensor
    is sliced to the answer span only.

    Args:
        prompt: Text built by ``make_prompt`` ('[INST] question [/INST] answer').
        reward_model: Object exposing ``get_device()`` and a ``transformer``
            callable whose output has a ``hidden_states`` attribute.
        reward_tokenizer: Tokenizer matching ``reward_model``.
        max_length: Length the prompt is truncated/padded to. Defaults to 2048.

    Returns:
        tuple: One CPU tensor per entry of ``output.hidden_states``, each sliced
        to ``[start_token:end_token, :]`` of the first (only) batch element.

    Raises:
        ValueError: If the tokenized prompt holds fewer than two 'INST' tokens
            (e.g. the opening marker was truncated away) or no real tokens.
    """
    reward_device = reward_model.get_device()

    with torch.no_grad():
        # Tokenize the input text.
        encodings_dict = reward_tokenizer(
            prompt,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
        ).to(reward_device)

        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        tokenized_prompt = reward_tokenizer.convert_ids_to_tokens(input_ids[0])

        # Only the assistant's answer is kept. It starts two tokens after the second
        # 'INST' token (the one in '[/INST]'), i.e. right after the closing ']'.
        inst_positions = [i for i, token in enumerate(tokenized_prompt) if token == 'INST']
        if len(inst_positions) < 2:
            raise ValueError(
                "Expected at least two 'INST' tokens in the tokenized prompt, "
                f"found {len(inst_positions)}; the prompt may have been truncated."
            )
        start_token = inst_positions[1] + 2

        # The answer ends after the last non-padding token. Reading this from the
        # attention mask (instead of searching for the first '<unk>' token) also
        # works when the prompt fills max_length and no padding is present, and is
        # not fooled by an '<unk>' token occurring inside the text itself.
        real_positions = attention_mask[0].nonzero()
        if real_positions.numel() == 0:
            raise ValueError("The tokenized prompt contains no non-padding tokens.")
        end_token = int(real_positions[-1].item()) + 1

        # Request hidden states explicitly rather than relying on the config flag.
        output = reward_model.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )

        # Keep the answer tokens of every returned hidden-state tensor, moved to the CPU.
        return tuple(
            hidden_state[0, start_token:end_token, :].cpu()
            for hidden_state in output.hidden_states
        )

In [20]:
# Load the open-ended sycophancy dataset
data_file = '../../datasets/Open-ended_sycophancy_dataset.csv'
df = pd.read_csv(data_file)

# Answer column -> column that receives the activations extracted for that answer
activation_columns = {
    'sycophantic': 'activations_sycophantic',
    'not_sycophantic': 'activations_not_sycophantic',
}

# Create the output columns up front, initialised to None
for activation_column in activation_columns.values():
    df[activation_column] = None

# For every row, build a prompt per answer variant and store the answer-token activations
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    question = row['prompt']

    for answer_column, activation_column in activation_columns.items():
        answer = row[answer_column]
        prompt = make_prompt(question, answer)
        activations = get_activation(prompt, reward_model, reward_tokenizer)
        df.at[index, activation_column] = activations


# Persist the dataframe (including the activation tuples) as a pickle
df.to_pickle('../../datasets/opend_ended_activations.pkl')

100%|██████████| 53/53 [03:13<00:00,  3.64s/it]


In [18]:
# Inspect the first row: shape of the first stored tensor for each answer variant,
# then the number of tensors in the tuple
first_row = df.iloc[0]
print(first_row['activations_sycophantic'][0].shape)
print(first_row['activations_not_sycophantic'][0].shape)
print(len(first_row['activations_sycophantic']))

# Keep only the rows for which sycophantic activations are present
df = df[df['activations_sycophantic'].notna()]

# Re-inspect after filtering: frame shape, container type, and the first stored tensor
print(df.shape)
kept_activations = df.iloc[0]['activations_sycophantic']
print(type(kept_activations))
print((kept_activations[0]))



torch.Size([75, 4096])
torch.Size([127, 4096])
33
(4, 6)
<class 'tuple'>
tensor([[ 0.0219, -0.0034, -0.0042,  ...,  0.0093, -0.0047, -0.0139],
        [ 0.0008, -0.0028, -0.0063,  ..., -0.0005, -0.0031,  0.0039],
        [ 0.0089,  0.0001, -0.0177,  ...,  0.0068, -0.0027, -0.0063],
        ...,
        [-0.0112,  0.0099,  0.0010,  ..., -0.0189,  0.0045,  0.0008],
        [ 0.0073, -0.0116, -0.0002,  ...,  0.0123, -0.0277,  0.0277],
        [-0.0011,  0.0012, -0.0062,  ...,  0.0020, -0.0004, -0.0048]])
