In [29]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import math
import pandas as pd
import torch
import torch.nn as nn
from transformers import PreTrainedModel, LlamaConfig, LlamaModel, LlamaTokenizer
from tqdm import tqdm
import psutil
from typing import Optional, List

# psutil handle created with no pid argument; it is not referenced in the
# visible cells — presumably kept for ad-hoc memory monitoring (TODO confirm
# it is still needed).
process = psutil.Process()

## Define the reward model class (Llama backbone + scalar regression head)
class LlamaRewardModel(PreTrainedModel):
    """Llama backbone followed by a bias-free linear head that scores a sequence.

    ``forward`` returns one scalar reward per sequence, read from the
    regression head's output at a single token position (see ``forward``).
    """
    config_class = LlamaConfig

    def __init__(self, config):
        """Build the backbone and the ``hidden_size -> 1`` regression head.

        Args:
            config: ``LlamaConfig`` used for both this wrapper and the
                wrapped ``LlamaModel``.
        """
        super().__init__(config)
        self.model = LlamaModel(config)
        self.regression_head = nn.Linear(self.config.hidden_size, 1, bias=False)
        # Ask for per-layer hidden states by default so that callers of
        # ``self.model`` can read ``outputs.hidden_states``.
        self.config.output_hidden_states=True


    def forward( # args are the same as LlamaForCausalLM
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Score each sequence in the batch with a single scalar reward.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Optional mask forwarded to the backbone and used
                to pick the position whose reward is returned. When omitted,
                the last position of every sequence is used.
            position_ids: Forwarded to the backbone.
            past_key_values: Forwarded to the backbone.
            inputs_embeds: Forwarded to the backbone.
            labels, use_cache, output_attentions, output_hidden_states,
            return_dict: Accepted only so the signature matches
                ``LlamaForCausalLM``; they are not used.

        Returns:
            Tensor with one row per sequence and a single column holding the
            reward gathered at the selected position.
        """

        transformer_outputs = self.model(
                                input_ids,
                                attention_mask=attention_mask,
                                position_ids=position_ids,
                                past_key_values=past_key_values,
                                inputs_embeds=inputs_embeds,
                            )

        # First element of the backbone output: one hidden vector per token.
        hidden_states = transformer_outputs[0]
        # One scalar per token position.
        rewards = self.regression_head(hidden_states).squeeze(-1)

        if attention_mask is None:
            # The signature allows calling without a mask; previously this
            # crashed on ``None.cumsum``. Without a mask there is no padding
            # information, so fall back to the final position.
            ends = rewards.new_full(
                (rewards.size(0), 1), rewards.size(1) - 1, dtype=torch.long
            )
        else:
            # The running sum of the mask stops growing after the last
            # attended token, so the argmax over it selects that token's
            # index (the first position at which the maximum is reached).
            ends = attention_mask.cumsum(dim=1).argmax(dim=1).view(-1,1)
        rewards = torch.gather(rewards, 1, ends)

        return rewards

def make_prompt(question: str, answer:str):
    """Assemble the ``Human: ... Assistant: ...`` text for one question/answer pair.

    Every occurrence of ``'Answer:'`` is removed from ``question``. No
    separator is inserted between the question and ``'Assistant: '``, so any
    spacing or newline must already be part of ``question``.

    Args:
        question: Question text, possibly containing ``'Answer:'``.
        answer: Answer text appended after ``'Assistant: '``.

    Returns:
        The concatenated prompt string.
    """
    stripped_question = question.replace('Answer:', '')
    pieces = ('Human: ', stripped_question, 'Assistant: ', answer)
    return ''.join(pieces)

In [2]:
# Checkpoint identifier and local download cache, shared by tokenizer and model.
MODEL_NAME = "openbmb/UltraRM-13b"
CACHE_DIR = '/scratch/henrypapadatos'

tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

# Load the checkpoint, then cast it to 16-bit precision.
# NOTE(review): the weights are loaded before `.half()` is applied; passing
# `torch_dtype=torch.float16` to `from_pretrained` would presumably avoid a
# full-precision copy in host memory — confirm the outputs are unchanged
# before switching.
model = LlamaRewardModel.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
model = model.half()

# Use the GPU when one is visible, otherwise stay on the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    model = model.cuda()

# Switch to evaluation mode; as the last expression this also displays the
# module structure.
model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaRewardModel(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [23]:
def get_activations(prompt, reward_model, reward_tokenizer):
    """Return the hidden states of the answer part of ``prompt`` for every layer.

    The prompt is tokenized and searched for the first consecutive token
    triple ``'Ass'``, ``'istant'``, ``':'``. Each entry of the backbone's
    ``hidden_states`` is then sliced from the token right after that marker
    to the end of the sequence.

    Args:
        prompt: Text containing an ``Assistant:`` marker that tokenizes to
            the triple above.
        reward_model: Model exposing the backbone as ``reward_model.model``
            and its weights through ``parameters()``.
        reward_tokenizer: Tokenizer matching ``reward_model``.

    Returns:
        Tuple with one CPU tensor per entry of ``outputs.hidden_states``;
        each tensor is two-dimensional (answer tokens first, then the
        feature dimension).

    Raises:
        Warning: If the marker triple is not found in the tokenized prompt.
            The exception class is kept as ``Warning`` for backward
            compatibility with existing callers.
    """
    # Send the inputs to wherever the model weights live instead of relying
    # on a notebook-level `device` variable defined in another cell.
    model_device = next(reward_model.parameters()).device
    inputs = reward_tokenizer(prompt, return_tensors="pt").to(model_device)

    # Token strings of the single sequence in the batch.
    tokens = reward_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Locate the first token of the answer. The scan starts at index 2 so the
    # two-token look-behind can never wrap around to the end of the list.
    start_index = None
    for idx in range(2, len(tokens)):
        if tokens[idx - 2] == 'Ass' and tokens[idx - 1] == 'istant' and tokens[idx] == ':':
            start_index = idx + 1
            break
    if start_index is None:
        # Checked before the forward pass so a malformed prompt fails fast.
        raise Warning(f'Beginning of the answer not found for prompt:\n{prompt}')

    with torch.no_grad():
        # Request the hidden states explicitly rather than depending on the
        # model config having been modified elsewhere.
        outputs = reward_model.model(**inputs, output_hidden_states=True)

    # Keep only the answer tokens of the first (only) sequence, on the CPU.
    return tuple(layer[0, start_index:, :].cpu() for layer in outputs.hidden_states)

In [30]:
# Load the open-ended sycophancy dataset.
data_file = '../../datasets/Open-ended_sycophancy_dataset.csv'
df = pd.read_csv(data_file)

# Maps each answer column to the column that will store the activations
# computed for that answer.
activation_columns = {
    'sycophantic': 'activations_sycophantic',
    'not_sycophantic': 'activations_not_sycophantic',
}

# Create the (initially empty) activation columns.
for target_column in activation_columns.values():
    df[target_column] = None

# For every row, build one prompt per answer variant and store the resulting
# tuple of per-layer activations in the matching column.
for row_label, record in tqdm(df.iterrows(), total=len(df)):
    for answer_column, target_column in activation_columns.items():
        full_prompt = make_prompt(record['prompt'], record[answer_column])
        df.at[row_label, target_column] = get_activations(full_prompt, model, tokenizer)

# Persist the dataframe, including the tensors, as a pickle.
df.to_pickle('../../datasets/ultraRM/open_ended_activations.pkl')

# df.head()

  0%|          | 0/53 [00:00<?, ?it/s]

100%|██████████| 53/53 [00:07<00:00,  6.83it/s]


In [32]:
# Activations pickled by another run (the path points at a "starling"
# directory), used as a reference. NOTE: read_pickle unpickles arbitrary
# objects, so only load files you trust.
test = '/nas/ucb/henrypapadatos/Sycophancy/datasets/starling/open_ended_activations.pkl'
test_df = pd.read_pickle(test)

# For every row of df, the first stored activation tensor must have the same
# first-dimension size in both dataframes — presumably the number of answer
# tokens; verify against how the reference file was produced.
for row_pos in range(len(df)):
    reference_row = test_df.iloc[row_pos]
    current_row = df.iloc[row_pos]
    for column in ('activations_sycophantic', 'activations_not_sycophantic'):
        assert reference_row[column][0].shape[0] == current_row[column][0].shape[0]