In [1]:
import os
# Set before torch is imported below. Presumably restricts CUDA to the GPU with
# index 1 — NOTE(review): confirm this is the intended device on the target machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import math
import pandas as pd
import torch
import torch.nn as nn
from transformers import PreTrainedModel, LlamaConfig, LlamaModel, LlamaTokenizer
from tqdm import tqdm
import psutil
from typing import Optional, List

# psutil process handle (no pid passed); a later cell reads memory_info().rss from it.
process = psutil.Process()

## Define the reward model function class
class LlamaRewardModel(PreTrainedModel):
    """Llama backbone followed by a bias-free linear head that yields one scalar per sequence.

    The head maps every position's final hidden state to a scalar; ``forward``
    then keeps only the scalar at one position per sequence (see ``forward``).
    """
    config_class = LlamaConfig

    def __init__(self, config):
        """Build the backbone and the regression head from ``config``.

        Args:
            config: configuration object; ``config.hidden_size`` sets the input
                width of the regression head.
        """
        super().__init__(config)
        self.model = LlamaModel(config)
        self.regression_head = nn.Linear(self.config.hidden_size, 1, bias=False)
        # Turn on hidden-state output in the config. NOTE(review): this presumably
        # also affects the backbone because it was built from the same config
        # object — confirm against the installed transformers version.
        self.config.output_hidden_states = True

    def forward(  # args are the same as LlamaForCausalLM
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Score each sequence in the batch with a single scalar.

        Args:
            input_ids, attention_mask, position_ids, past_key_values, inputs_embeds:
                forwarded unchanged to the backbone.
            labels, use_cache, output_attentions, output_hidden_states, return_dict:
                accepted for signature compatibility; not used by this method.

        Returns:
            Tensor with one column per row of the batch (shape ``(batch, 1)``),
            holding the head's output at the selected position of each sequence.
            With an ``attention_mask`` the selected position is the first index
            at which the running sum of the mask reaches its maximum; without
            one, it is the final position.
        """
        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
        )

        # First element of the backbone output; the head is applied to every position.
        hidden_states = transformer_outputs[0]
        rewards = self.regression_head(hidden_states).squeeze(-1)

        if attention_mask is None:
            # The signature allows omitting the mask. Without one there is no padding
            # information, so score the final position instead of crashing on None.
            ends = torch.full(
                (rewards.size(0), 1),
                rewards.size(1) - 1,
                dtype=torch.long,
                device=rewards.device,
            )
        else:
            # The cumulative sum stops growing after the last attended token, and
            # argmax returns the first index attaining the maximum.
            ends = attention_mask.cumsum(dim=1).argmax(dim=1).view(-1, 1)
        rewards = torch.gather(rewards, 1, ends)

        return rewards
    
def make_prompt(prompt):
    """Return `prompt` with every '[INST]' replaced by 'Human:' and every '[/INST]' by 'Assistant:'."""
    replacements = (
        ('[INST]', 'Human:'),
        ('[/INST]', 'Assistant:'),
    )
    for tag, speaker in replacements:
        prompt = prompt.replace(tag, speaker)
    return prompt

In [5]:
# Model identifier and local cache directory, shared by the tokenizer and the model.
MODEL_NAME = "openbmb/UltraRM-13b"
CACHE_DIR = '/scratch/henrypapadatos'

tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

# Load the model directly in 16-bit precision. Passing torch_dtype avoids first
# building the weights in the default dtype and converting them afterwards with .half().
model = LlamaRewardModel.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    torch_dtype=torch.float16,
)

# Put the model on the GPU when one is available, in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaRewardModel(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [4]:
data_folder = "../../datasets"
output_folder = "../../datasets/ultraRM"
file_name = "rotten_tomatoes_sycophantic.jsonl"

## Load the data
df = pd.read_json(os.path.join(data_folder, file_name), lines=True)
# Store the row labels in an explicit 'index' column and make it the first column.
# The reorder is done by name so it does not depend on where the column was appended.
df['index'] = df.index
df = df[['index'] + [col for col in df.columns if col != 'index']]

# Keep only the rows where 'non_sense' equals 0. The .copy() makes the result an
# independent frame, so adding a column below does not raise SettingWithCopyWarning.
df = df[df['non_sense'] == 0].copy()
print(df.shape)
# Placeholder 'activations' column, filled in by a later cell.
df['activations'] = None

(31986, 10)


In [11]:
keep_fraction = 0.05


def top_certainty_rows(frame, label, fraction):
    """Return the highest-'certainty' rows of `frame` whose 'ground_truth' equals `label`.

    The number of rows kept is ``int(n * fraction)``, where ``n`` is the number
    of rows carrying that label (i.e. rounded down).
    """
    subset = frame[frame['ground_truth'] == label]
    n_keep = int(subset.shape[0] * fraction)
    # Most certain rows first, then cut at the threshold.
    return subset.sort_values(by='certainty', ascending=False).iloc[:n_keep]


# Apply the same selection to each ground-truth label.
df_false = top_certainty_rows(df, 0, keep_fraction)
false_index = df_false['index'].tolist()

df_true = top_certainty_rows(df, 1, keep_fraction)
true_index = df_true['index'].tolist()

keep_index = false_index + true_index

# .copy() so that later per-row writes into this frame target an independent
# object rather than a slice of the unfiltered frame.
df = df[df['index'].isin(keep_index)].copy()
print(df.shape)
df.head()


(1598, 11)


Unnamed: 0,index,split,text,assistant_opinion,human_opinion,sycophancy,comment,ground_truth,non_sense,certainty,activations
44,44,train,"[INST] Hello, my name is Jacques Schwein. I am...",0,0,1,what really surprises about wisegirls is its l...,1,0,0.998633,
45,45,train,"[INST] Hello, my name is Jacques Schwein. I am...",1,0,0,what really surprises about wisegirls is its l...,1,0,0.998633,
47,47,train,"[INST] Hello, my name is Jacques Schwein. I am...",1,1,0,what really surprises about wisegirls is its l...,1,0,0.998633,
52,52,train,"[INST] Hello, my name is Kimberly Wilson. I am...",0,0,1,"one of the greatest family-oriented , fantasy-...",1,0,0.997327,
53,53,train,"[INST] Hello, my name is Kimberly Wilson. I am...",1,0,0,"one of the greatest family-oriented , fantasy-...",1,0,0.997327,


In [9]:
# Read the resident set size from the `process` handle and print it in GB (bytes / 1e9).
memory = process.memory_info()
size = memory.rss
print(f"Memory usage: {size/1e9:.2f} GB")

Memory usage: 52.64 GB


In [19]:
def get_activations(prompt, reward_model, reward_tokenizer):
    """Return the per-layer hidden-state vectors at the second-to-last attended token.

    Args:
        prompt: text to tokenize and run through the backbone.
        reward_model: object exposing ``parameters()`` and a ``model`` backbone
            callable that returns an output with a ``hidden_states`` sequence.
        reward_tokenizer: tokenizer callable that also provides
            ``convert_ids_to_tokens``.

    Returns:
        Tuple with one CPU tensor per entry of ``outputs.hidden_states``, each
        being the vector of the first batch element at the selected position.

    Raises:
        AssertionError: if the token at the selected position is neither 'N' nor 'P'.
    """
    # Use the model's own device instead of a notebook-level global, so the
    # function works regardless of which cells have been run.
    model_device = next(reward_model.parameters()).device
    inputs = reward_tokenizer(prompt, return_tensors="pt").to(model_device)

    with torch.no_grad():
        # Request hidden states explicitly rather than relying on the model config.
        outputs = reward_model.model(**inputs, output_hidden_states=True)

    # Index of the last position with attention_mask == 1, minus one.
    attended_positions = (inputs['attention_mask'][0] == 1).nonzero().tolist()
    token_index = attended_positions[-1][0] - 1
    selected_token = reward_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])[token_index]

    # Sanity check: the selected position must hold the expected answer token.
    assert selected_token in ('N', 'P'), (
        f"Expected token 'N' or 'P' at position {token_index}, got {selected_token!r}"
    )

    return tuple(layer[0, token_index, :].cpu() for layer in outputs.hidden_states)

In [20]:
# Start from an empty 'activations' column.
df['activations'] = None

# Walk the 'text' column: rewrite the tags, extract the activations,
# and store them in the row the text came from.
for index, text in tqdm(df['text'].items(), total=len(df)):
    df.at[index, 'activations'] = get_activations(make_prompt(text), model, tokenizer)

100%|██████████| 1598/1598 [03:01<00:00,  8.83it/s]


In [21]:
#save the dataframe
name = "rotten_tomatoes_sycophantic_activations.pkl"
# Create the output folder if it is missing, so the save does not fail after the
# activations have already been computed.
os.makedirs(output_folder, exist_ok=True)
df.to_pickle(os.path.join(output_folder, name))
print("Saved all rows")

Saved all rows


In [5]:
# Read the saved frame back from the output folder and inspect it.
# NOTE: pickle files should only be loaded from trusted sources.
pickle_path = os.path.join(output_folder, "rotten_tomatoes_sycophantic_activations.pkl")
test_pickle = pd.read_pickle(pickle_path)

print(test_pickle.shape)
test_pickle.head()

(1598, 11)


Unnamed: 0,index,split,text,assistant_opinion,human_opinion,sycophancy,comment,ground_truth,non_sense,certainty,activations
44,44,train,"[INST] Hello, my name is Jacques Schwein. I am...",0,0,1,what really surprises about wisegirls is its l...,1,0,0.998633,"([tensor(-0.0273, dtype=torch.float16), tensor..."
45,45,train,"[INST] Hello, my name is Jacques Schwein. I am...",1,0,0,what really surprises about wisegirls is its l...,1,0,0.998633,"([tensor(-0.0101, dtype=torch.float16), tensor..."
47,47,train,"[INST] Hello, my name is Jacques Schwein. I am...",1,1,0,what really surprises about wisegirls is its l...,1,0,0.998633,"([tensor(-0.0101, dtype=torch.float16), tensor..."
52,52,train,"[INST] Hello, my name is Kimberly Wilson. I am...",0,0,1,"one of the greatest family-oriented , fantasy-...",1,0,0.997327,"([tensor(-0.0273, dtype=torch.float16), tensor..."
53,53,train,"[INST] Hello, my name is Kimberly Wilson. I am...",1,0,0,"one of the greatest family-oriented , fantasy-...",1,0,0.997327,"([tensor(-0.0101, dtype=torch.float16), tensor..."


In [6]:
# Summary statistics of the reloaded frame, displayed as the cell output.
test_pickle.describe()

Unnamed: 0,index,assistant_opinion,human_opinion,sycophancy,ground_truth,non_sense,certainty
count,1598.0,1598.0,1598.0,1598.0,1598.0,1598.0,1598.0
mean,20964.272215,0.498748,0.498748,0.332916,0.5,0.0,0.995521
std,12309.885159,0.500155,0.500155,0.471404,0.500157,0.0,0.003811
min,44.0,0.0,0.0,0.0,0.0,0.0,0.988313
25%,10835.25,0.0,0.0,0.0,0.0,0.0,0.991938
50%,20374.5,0.0,0.0,0.0,0.5,0.0,0.997242
75%,31566.75,1.0,1.0,1.0,1.0,0.0,0.99883
max,42583.0,1.0,1.0,1.0,1.0,0.0,0.999998
