In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
import math
import pandas as pd
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download
from tqdm import tqdm
import ast
import psutil
process = psutil.Process()

## Define the reward model class

class GPTRewardModel(nn.Module):
    """Causal language model with a bias-free linear head that emits one scalar per token.

    The constructor loads the causal LM found at ``model_path``, keeps a handle on
    its inner ``model.model`` module as ``self.transformer``, creates a
    ``Linear(n_embd, 1)`` value head and loads the matching tokenizer, whose pad
    token is set to its unk token.

    Args:
        model_path: identifier or path passed to ``from_pretrained`` for both the
            model and the tokenizer.
        cache_dir: directory passed as ``cache_dir`` to both ``from_pretrained``
            calls. Defaults to the location the notebook has always used.
    """

    def __init__(self, model_path, cache_dir='/scratch/henrypapadatos'):
        super().__init__()
        model = AutoModelForCausalLM.from_pretrained(model_path, cache_dir=cache_dir, device_map="auto")
        self.config = model.config
        # Expose the hidden width under the name `n_embd`, whichever attribute the config provides.
        self.config.n_embd = self.config.hidden_size if hasattr(self.config, "hidden_size") else self.config.n_embd
        # Code elsewhere in this notebook reads `hidden_states` from the transformer output.
        self.config.output_hidden_states = True
        self.model = model
        self.transformer = model.model
        self.v_head = nn.Linear(self.config.n_embd, 1, bias=False).to(self.model.device)
        # Use the same cache directory as the model so all downloads land in one place.
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
        self.tokenizer.pad_token = self.tokenizer.unk_token
        # NOTE(review): this takes the FIRST id produced when tokenizing the pad token.
        # If the tokenizer prepends a BOS token, that id would be BOS rather than the
        # pad id -- confirm against the checkpoint's training code before changing it.
        self.PAD_ID = self.tokenizer(self.tokenizer.pad_token)["input_ids"][0]

    def get_device(self):
        """Return the device reported by the wrapped causal LM."""
        return self.model.device

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        position_ids=None,
    ):
        """Score each sequence of the batch.

        Args:
            input_ids, attention_mask: tensors of shape ``[bs, seq_len]``.
            past_key_values, position_ids: forwarded unchanged to the transformer.

        Returns:
            A list with ``bs`` entries; entry ``i`` is the value-head output at the
            position just before the first ``PAD_ID`` in sequence ``i`` (or at the
            last position when the sequence contains no ``PAD_ID``).
        """
        bs = input_ids.shape[0]
        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        hidden_states = transformer_outputs[0]
        scores = []
        # One scalar per position: [bs, seq_len, 1] -> [bs, seq_len].
        rewards = self.v_head(hidden_states).squeeze(-1)
        for i in range(bs):
            c_inds = (input_ids[i] == self.PAD_ID).nonzero()
            # Index of the first PAD_ID, or seq_len when there is none.
            c_ind = c_inds[0].item() if len(c_inds) > 0 else input_ids.shape[1]
            # NOTE(review): when c_ind is 0 this index is -1, i.e. the last position.
            scores.append(rewards[i, c_ind - 1])
        return scores

In [None]:
## Load the model and tokenizer
reward_model = GPTRewardModel("meta-llama/Llama-2-7b-chat-hf")
reward_tokenizer = reward_model.tokenizer
# When an input exceeds the tokenizer's max_length, drop tokens from its start.
reward_tokenizer.truncation_side = "left"
reward_device = reward_model.get_device()
reward_batch_size = 1

# Fetch the reward-model repository and locate its weight file.
directory = snapshot_download("berkeley-nest/Starling-RM-7B-alpha", cache_dir='/scratch/henrypapadatos')
# Sorting makes the pick deterministic if several files match (os.listdir order is arbitrary).
checkpoint = next(
    (
        os.path.join(directory, fpath)
        for fpath in sorted(os.listdir(directory))
        if fpath.endswith((".pt", "model.bin"))
    ),
    None,
)
if checkpoint is None:
    raise FileNotFoundError(f"No '.pt' or 'model.bin' checkpoint found in {directory}")

# Deserialize on the CPU; load_state_dict then copies each tensor onto the device
# of the parameter it fills, so no second full copy of the weights sits on the GPU.
# NOTE(review): torch.load unpickles the file -- only use checkpoints from a trusted source.
state_dict = torch.load(checkpoint, map_location="cpu")
# strict=False tolerates key mismatches; report them so a bad load is not silent.
load_result = reward_model.load_state_dict(state_dict, strict=False)
del state_dict
print(f"missing keys: {len(load_result.missing_keys)}, unexpected keys: {len(load_result.unexpected_keys)}")
reward_model.eval().requires_grad_(False)

In [3]:
layer = 16
version = 21
# Probe checkpoints are named after the probe version and the layer index.
probe_path = '../probe/checkpoints/probe_v{}_{}.pt'.format(version, layer)
# Keep every stage of the loaded module except its last one.
probe = torch.load(probe_path)[:-1]
print(probe)

Sequential(
  (0): Linear(in_features=4096, out_features=1, bias=True)
)


In [4]:
file = '/nas/ucb/henrypapadatos/Sycophancy/results/Bo32_openchat_openchat_3.5_11_28_2023_15_18_45_feedback_poem_100_Starlingnew.csv'

# The N_response column holds Python literals serialized as text; parse each cell
# back into the object it represents while loading.
df = pd.read_csv(file).assign(
    N_response=lambda frame: frame['N_response'].map(ast.literal_eval)
)

df.head(3)

Unnamed: 0,prompt,N_response,type,BoN_1,BoN_2,BoN_4,BoN_8,BoN_16,BoN_32,RM_score
0,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,base,\n\nThe poem explores the themes of beauty and...,\n\n1. The poem explores the themes of beauty ...,\n\nThe poem is a beautiful exploration of the...,\n\nThe poem is a beautiful and contemplative ...,\n\nThe poem is a beautiful and contemplative ...,\n\nThe poem is a rich and evocative explorati...,"[2.9671096801757812, 4.110095977783203, 3.7785..."
1,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,like,\n\nThe poem is a vivid and evocative explorat...,\n\nThis poem is a beautiful exploration of th...,\n\nThis poem is a beautiful exploration of th...,\n\nThis poem is a beautiful and evocative exp...,\n\nThis poem is a beautiful exploration of th...,\n\nThis poem is a beautiful exploration of th...,"[3.4797210693359375, 2.2957587242126465, 3.520..."
2,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,dislike,\n\nThe poem presents a beautiful and philosop...,"\n\nThe poem is rich in imagery, using the rea...","\n\nThe poem is rich in imagery, using the rea...",\n\nThe poem explores the relationship between...,\n\nThe poem is a beautiful exploration of the...,\n\nThe poem is a beautiful exploration of the...,"[0.7642276883125305, 1.7576097249984741, 1.754..."


In [5]:
def make_prompt(prompt):
    """Rewrite a 'GPT4 Correct User / Assistant' transcript with [INST] ... [/INST] tags.

    The substitutions are applied one after another, in the order listed.
    """
    substitutions = (
        ('<s> GPT4 Correct User:', '[INST] '),
        ('GPT4 Correct Assistant:', '\n[/INST]'),
        ('<|end_of_turn|>', ''),
    )
    for old_marker, new_marker in substitutions:
        prompt = prompt.replace(old_marker, new_marker)
    return prompt

def get_activation(prompt, reward_model, reward_tokenizer, layer):
    """Return the hidden states of one layer for the assistant-answer tokens of `prompt`.

    Args:
        prompt: text whose tokenization contains the token 'INST' at least twice;
            the answer is taken to start two tokens after the second occurrence.
        reward_model: object providing `get_device()` and a `transformer` callable.
        reward_tokenizer: tokenizer used to encode `prompt`.
        layer: index into the `hidden_states` sequence returned by the transformer.

    Returns:
        `hidden_states[layer][0][start:end]`, one row per answer token.

    Raises:
        IndexError: if fewer than two 'INST' tokens survive tokenization
            (for example because truncation removed the start of the prompt).
    """
    reward_device = reward_model.get_device()

    with torch.no_grad():
        # Tokenize WITHOUT padding. A single sequence needs none, and the answer
        # then simply runs to the end of the sequence. Locating the end by
        # searching for the pad token failed on inputs that fill max_length.
        encodings_dict = reward_tokenizer(
            prompt,
            truncation=True,
            max_length=2048,
            return_tensors="pt",
        ).to(reward_device)

        tokenized_prompt = reward_tokenizer.convert_ids_to_tokens(encodings_dict['input_ids'][0])

        # Keep only the tokens of the assistant's answer: it starts after the second
        # 'INST' token, +2 to also skip the ']' token that follows it.
        inst_positions = [i for i, x in enumerate(tokenized_prompt) if x == 'INST']
        if len(inst_positions) < 2:
            raise IndexError(
                "expected at least two 'INST' tokens in the tokenized prompt, "
                f"found {len(inst_positions)}"
            )
        start_token = inst_positions[1] + 2
        end_token = len(tokenized_prompt)

        # Ask for the hidden states explicitly instead of relying on the model config.
        output = reward_model.transformer(
            input_ids=encodings_dict['input_ids'],
            attention_mask=encodings_dict['attention_mask'],
            output_hidden_states=True,
        )

        # Activations of the requested layer, first (only) sequence, answer tokens only.
        return output.hidden_states[layer][0][start_token:end_token]

In [6]:
# Placeholder column; every row is filled below with a list of per-response scores.
df['probe_score'] = None

for idx, row in tqdm(df.iterrows(), total=len(df)):
    # For each candidate response: rebuild the prompt, take the activations of the
    # answer tokens at `layer`, and average the probe output over those tokens.
    probe_score = [
        probe(get_activation(make_prompt(response), reward_model, reward_tokenizer, layer)).mean().item()
        for response in row['N_response']
    ]

    df.at[idx, 'probe_score'] = probe_score

100%|██████████| 60/60 [1:12:31<00:00, 72.53s/it]


In [7]:
# Save next to the input file, tagging the file name with the probe version.
new_file_name = file.replace('.csv', '_probeV{}.csv'.format(version))
df.to_csv(path_or_buf=new_file_name, index=False)