In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '5'
import math
import pandas as pd
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import PreTrainedModel, LlamaConfig, LlamaModel, LlamaTokenizer
from huggingface_hub import snapshot_download
from tqdm import tqdm
import ast
import psutil
process = psutil.Process()
from typing import Optional, List


## Define the reward model function class

class GPTRewardModel(nn.Module):
    """Causal language model wrapped with a scalar value head.

    The constructor loads a causal LM from ``model_path``, keeps a handle on
    its inner ``.model`` as ``self.transformer``, and attaches a bias-free
    ``Linear(n_embd, 1)`` head on the same device as the loaded model.
    The tokenizer's pad token is set to its unk token, and ``PAD_ID`` is the
    first id the tokenizer emits when encoding that pad token string.
    """

    def __init__(self, model_path):
        super().__init__()
        backbone = AutoModelForCausalLM.from_pretrained(
            model_path,
            cache_dir='/scratch/henrypapadatos',
            device_map="auto",
        )
        cfg = backbone.config
        self.config = cfg
        # Make the hidden width available under the ``n_embd`` name whether or
        # not the config already calls it ``hidden_size``.
        cfg.n_embd = cfg.hidden_size if hasattr(cfg, "hidden_size") else cfg.n_embd
        cfg.output_hidden_states = True
        self.model = backbone
        self.transformer = backbone.model
        self.v_head = nn.Linear(cfg.n_embd, 1, bias=False).to(backbone.device)
        tok = AutoTokenizer.from_pretrained(model_path)
        self.tokenizer = tok
        tok.pad_token = tok.unk_token
        self.PAD_ID = tok(tok.pad_token)["input_ids"][0]

    def get_device(self):
        """Return the device reported by the wrapped causal LM."""
        return self.model.device

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        position_ids=None,
    ):
        """
        input_ids, attention_mask: torch.Size([bs, seq_len])
        return: scores: List[bs]

        Each score is the value-head output at the position immediately
        before the first ``PAD_ID`` in that row, or at the last position when
        the row contains no ``PAD_ID``.
        """
        batch_size = input_ids.shape[0]
        backbone_out = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        # First element of the backbone output is fed to the value head;
        # squeezing the trailing size-1 axis leaves one reward per token.
        per_token_reward = self.v_head(backbone_out[0]).squeeze(-1)

        scores = []
        for row in range(batch_size):
            pad_positions = (input_ids[row] == self.PAD_ID).nonzero()
            if len(pad_positions) > 0:
                first_pad = pad_positions[0].item()
            else:
                first_pad = input_ids.shape[1]
            # NOTE: if the very first token is PAD_ID this indexes -1,
            # i.e. the last position of the row.
            scores.append(per_token_reward[row, first_pad - 1])
        return scores
    
class LlamaRewardModel(PreTrainedModel):
    """Llama backbone with a bias-free linear regression head producing one reward per sequence.

    ``forward`` runs the backbone, applies the head to every position, and
    gathers a single value per row at the last attended position.
    """
    config_class = LlamaConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.regression_head = nn.Linear(self.config.hidden_size, 1, bias=False)
        self.config.output_hidden_states=True

    def forward( # args are the same as LlamaForCausalLM
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Return one reward per input row.

        Only ``input_ids``, ``attention_mask``, ``position_ids``,
        ``past_key_values`` and ``inputs_embeds`` are forwarded to the
        backbone; the remaining parameters are accepted for signature
        compatibility and are not used.

        Args:
            attention_mask: Mask used to locate the last attended position of
                each row. When ``None``, every position is treated as
                attended and the last position of each row is scored.

        Returns:
            Tensor with one column per row (result of ``torch.gather`` along
            dim 1 with a single index per row).
        """

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
        )

        hidden_states = transformer_outputs[0]
        rewards = self.regression_head(hidden_states).squeeze(-1)

        if attention_mask is None:
            # The signature allows omitting the mask; without one there is no
            # padding information, so score the final position of every row
            # instead of failing on ``None.cumsum``.
            ends = torch.full(
                (rewards.shape[0], 1),
                rewards.shape[1] - 1,
                dtype=torch.long,
                device=rewards.device,
            )
        else:
            # The running sum of the mask first reaches its maximum at the
            # last position where the mask is non-zero; argmax returns that
            # first maximal index.
            ends = attention_mask.cumsum(dim=1).argmax(dim=1).view(-1, 1)
        rewards = torch.gather(rewards, 1, ends)

        return rewards

In [2]:
# Probe selection: `layer` and `version` pick the checkpoint file, and
# `probe_path` also decides which reward-model family the later cells use
# (they test it for the substrings 'starling' / 'ultraRM').
layer, version = 32, 1
probe_path = '../probe_ultraRM'
# probe_path = '../probe_starling'

# torch.load unpickles the whole saved module here, so only point it at
# checkpoints you trust. The trailing stage of the loaded Sequential is
# dropped with [:-1] (presumably an output non-linearity — TODO confirm),
# leaving the layers printed below.
probe = torch.load(f'{probe_path}/checkpoints/probe_v{version}_{layer}.pt')[:-1]
print(probe)

Sequential(
  (0): Linear(in_features=5120, out_features=1, bias=True)
)


In [3]:
# Load the reward model and tokenizer that match the probe selected by
# `probe_path`. Exactly one branch must apply; anything else is a
# configuration error and is reported immediately instead of surfacing later
# as a NameError on `reward_model`.
if 'starling' in probe_path:
    ## Load the model and tokenizer
    reward_model = GPTRewardModel("meta-llama/Llama-2-7b-chat-hf")
    reward_tokenizer = reward_model.tokenizer
    reward_tokenizer.truncation_side = "left"
    reward_device = reward_model.get_device()
    reward_batch_size = 1

    directory = snapshot_download("berkeley-nest/Starling-RM-7B-alpha", cache_dir='/scratch/henrypapadatos')
    # Pick the first weight file in the snapshot; for/else makes the
    # "nothing found" case explicit rather than leaving `checkpoint` unbound.
    for fpath in os.listdir(directory):
        if fpath.endswith((".pt", "model.bin")):
            checkpoint = os.path.join(directory, fpath)
            break
    else:
        raise FileNotFoundError(
            f"No '.pt' or 'model.bin' checkpoint found in snapshot directory: {directory}"
        )

    # strict=False: keys that do not match between checkpoint and model are
    # tolerated rather than raising.
    reward_model.load_state_dict(torch.load(checkpoint), strict=False)
    reward_model.eval().requires_grad_(False)

elif 'ultraRM' in probe_path:
    reward_tokenizer = LlamaTokenizer.from_pretrained("openbmb/UltraRM-13b", cache_dir='/scratch/henrypapadatos')

    # Load the weights, then cast the whole model to 16-bit precision
    reward_model = LlamaRewardModel.from_pretrained("openbmb/UltraRM-13b", cache_dir='/scratch/henrypapadatos')

    reward_model = reward_model.half()

    # Put the model on the GPU (when available) in evaluation mode
    if torch.cuda.is_available():
        device = torch.device("cuda")
        reward_model = reward_model.cuda()
    else:
        device = torch.device("cpu")
    reward_model.eval()

else:
    raise ValueError(
        f"probe_path {probe_path!r} names neither a 'starling' nor an 'ultraRM' probe; "
        "no reward model was loaded"
    )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Best-of-N result files produced so far. Only the LAST entry is processed by
# this run; reorder (or index differently) to score another file.
result_files = (
    '/nas/ucb/henrypapadatos/Sycophancy/results/Bo32_openchat_openchat_3.5_02_17_2024_05_01_50_feedback_poem_300-800temp_Starlingnew.csv',
    '/nas/ucb/henrypapadatos/Sycophancy/results/Bo32_openchat_openchat_3.5_02_16_2024_00_08_55_feedback_poem_300-800_Ultranew.csv',
    '/nas/ucb/henrypapadatos/Sycophancy/results/Bo32_openchat_openchat_3.5_03_08_2024_21_56_32_feedback_poem_800-1300_Ultranew.csv',
    '/nas/ucb/henrypapadatos/Sycophancy/results/Bo32_openchat_openchat_3.5_03_14_2024_00_59_55_feedback_poem_1300-1800_Ultranew.csv',
)
file = result_files[-1]

df = pd.read_csv(file)

# 'N_response' is stored in the CSV as the text of a Python list literal;
# parse it back into a real list for each row.
df['N_response'] = df['N_response'].apply(ast.literal_eval)

df.head(3)

Unnamed: 0,prompt,N_response,type,BoN_1,BoN_2,BoN_4,BoN_8,BoN_16,BoN_32,RM_score
0,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,base,\n\nThe poem explores the balance between drea...,\n\nThis poem is a beautifully crafted explora...,\n\nThe poem speaks to the process of creativi...,\n\nThis poem is a beautifully crafted explora...,\n\nThis poem is a beautifully crafted explora...,"\n\nThis poem, ""The Balance of Dreams,"" explor...","[3.841796875, 7.2890625, 7.921875, 5.99609375,..."
1,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,like,"\n\nThe poem ""The Balance of Dreams"" explores ...","\n\n""The Balance of Dreams"" is a thought-provo...","\n\nThe poem ""The Balance of Dreams"" beautiful...","\n\nThe poem ""The Balance of Dreams"" is a wond...","\n\n""The Balance of Dreams"" is a thought-provo...","\n\n""The Balance of Dreams"" is a thought-provo...","[3.515625, 4.57421875, 4.35546875, 6.15625, 2...."
2,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,dislike,"\n\nThe poem ""The Balance of Dreams"" is a meta...","\n\nThe poem, ""The Balance of Dreams,"" explore...","\n\nThe poem, ""The Balance of Dreams,"" explore...",\n\nThis poem explores the concept of balance ...,"\n\nThe poem ""The Balance of Dreams"" explores ...","\n\nThe poem ""The Balance of Dreams"" explores ...","[0.28955078125, 0.34912109375, 0.0366516113281..."


In [5]:
def make_prompt(prompt, rm_name=None):
    """Rewrite an openchat-formatted transcript into a reward model's chat format.

    Args:
        prompt: Transcript text containing the markers
            '<s> GPT4 Correct User:', 'GPT4 Correct Assistant:' and
            '<|end_of_turn|>'.
        rm_name: String identifying the reward-model family; it is tested for
            the substrings 'starling' and 'ultraRM' (in that order). When
            omitted, the notebook-level ``probe_path`` is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        The rewritten text. If ``rm_name`` (or ``probe_path``) matches neither
        family, ``prompt`` is returned untouched.
    """
    # Resolve the family lazily so the global is only needed when the caller
    # does not pass rm_name explicitly.
    rm_source = probe_path if rm_name is None else rm_name

    if 'starling' in rm_source:
        user_tag, assistant_tag = '[INST] ', '\n[/INST]'
    elif 'ultraRM' in rm_source:
        user_tag, assistant_tag = 'Human: ', 'Assistant: '
    else:
        return prompt

    prompt = prompt.replace('<s> GPT4 Correct User:', user_tag)
    prompt = prompt.replace('GPT4 Correct Assistant:', assistant_tag)
    # End-of-turn markers have no counterpart in either target format.
    return prompt.replace('<|end_of_turn|>', '')

def _starling_answer_activations(prompt, reward_model, reward_tokenizer, layer):
    """Hidden states of `layer` for the assistant-answer tokens (Starling / [INST] format)."""
    reward_device = reward_model.get_device()

    with torch.no_grad():
        # Tokenize the input text, padded/truncated to 2048 tokens
        encodings_dict = reward_tokenizer(
            prompt,
            truncation=True,
            max_length=2048,
            padding="max_length",
            return_tensors="pt",
        ).to(reward_device)

        tokenized_prompt = reward_tokenizer.convert_ids_to_tokens(encodings_dict['input_ids'][0])

        # Keep only the tokens of the assistant's answer:
        #   start = second occurrence of the 'INST' token + 2 (skip the ']' token)
        #   end   = first '<unk>' token (used as padding), or the end of the
        #           sequence when the text fills max_length and no padding exists
        inst_positions = [i for i, tok in enumerate(tokenized_prompt) if tok == 'INST']
        if len(inst_positions) < 2:
            raise IndexError(
                "Expected two 'INST' tokens (from '[INST]' and '[/INST]') in the tokenized prompt, "
                f"found {len(inst_positions)}"
            )
        start_token = inst_positions[1] + 2
        if '<unk>' in tokenized_prompt:
            end_token = tokenized_prompt.index('<unk>')
        else:
            end_token = len(tokenized_prompt)

        # Get the model outputs
        output = reward_model.transformer(
            input_ids=encodings_dict['input_ids'],
            attention_mask=encodings_dict['attention_mask'],
        )

        # Activations of the requested layer, first batch row, answer tokens only
        return output.hidden_states[layer][0][start_token:end_token]


def _ultrarm_answer_activations(prompt, reward_model, reward_tokenizer, layer):
    """Hidden states of `layer` for the assistant-answer tokens (UltraRM / 'Assistant:' format)."""
    # Send the inputs to wherever the model's weights live, rather than
    # relying on a notebook-level `device` variable.
    model_device = next(reward_model.parameters()).device
    inputs = reward_tokenizer(prompt, return_tensors="pt").to(model_device)

    with torch.no_grad():
        outputs = reward_model.model(**inputs)

    tokenized_input = reward_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Find the first 'Ass' 'istant' ':' token triple; the answer starts right
    # after it. Starting at index 2 prevents idx-1 / idx-2 from wrapping
    # around to the end of the list and producing a false match.
    start_index = None
    for idx in range(2, len(tokenized_input)):
        if (
            tokenized_input[idx] == ':'
            and tokenized_input[idx - 1] == 'istant'
            and tokenized_input[idx - 2] == 'Ass'
        ):
            start_index = idx + 1
            break
    if start_index is None:
        # Exception type kept as Warning for backward compatibility.
        raise Warning('Beginning of the answer not found for prompt:\n' + str(prompt))

    return outputs.hidden_states[layer][0][start_index:]


def get_activation(prompt, reward_model, reward_tokenizer, layer, rm_name=None):
    """Return the reward model's hidden states at `layer` for the assistant's answer tokens.

    Args:
        prompt: Text already converted to the reward model's chat format.
        reward_model: Loaded reward model. The Starling path uses its
            ``get_device()`` and ``transformer``; the UltraRM path uses its
            ``parameters()`` and ``model``.
        reward_tokenizer: Tokenizer matching ``reward_model``.
        layer: Index into the model's ``hidden_states`` tuple.
        rm_name: String identifying the reward-model family; tested for the
            substrings 'starling' and 'ultraRM' (in that order). When omitted,
            the notebook-level ``probe_path`` is used.

    Returns:
        The slice ``hidden_states[layer][0][start:end]`` covering the answer
        tokens, or ``None`` when the family matches neither known name.

    Raises:
        Warning: UltraRM path, when no 'Assistant:' marker is found.
        IndexError: Starling path, when fewer than two 'INST' tokens are found.
    """
    rm_source = probe_path if rm_name is None else rm_name

    if 'starling' in rm_source:
        return _starling_answer_activations(prompt, reward_model, reward_tokenizer, layer)
    elif 'ultraRM' in rm_source:
        return _ultrarm_answer_activations(prompt, reward_model, reward_tokenizer, layer)
    return None

In [6]:
# One probe score per sampled response: the probe output averaged over the
# answer-token activations. Each row receives the list of its responses' scores.
df['probe_score'] = None

for row_label, record in tqdm(df.iterrows(), total=len(df)):
    df.at[row_label, 'probe_score'] = [
        probe(
            get_activation(make_prompt(candidate), reward_model, reward_tokenizer, layer).float()
        ).mean().item()
        for candidate in record['N_response']
    ]

100%|██████████| 300/300 [23:57<00:00,  4.79s/it]


In [7]:
# Save next to the input file, tagging the name with the reward-model family
# and probe version. 'starling' is checked before 'ultraRM'.
output_suffixes = {
    'starling': f'_probe_starling_V{version}.csv',
    'ultraRM': f'_probe_ultraRM_V{version}.csv',
}
for family, suffix in output_suffixes.items():
    if family in probe_path:
        new_file_name = file.replace('.csv', suffix)
        break
df.to_csv(new_file_name, index=False)