In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import math
import pandas as pd
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download
from tqdm import tqdm
import psutil
process = psutil.Process()

## Define the reward model function class

class GPTRewardModel(nn.Module):
    def __init__(self, model_path):
        super().__init__()
        model = AutoModelForCausalLM.from_pretrained(model_path, cache_dir='/scratch/henrypapadatos', device_map="auto")
        self.config = model.config
        self.config.n_embd = self.config.hidden_size if hasattr(self.config, "hidden_size") else self.config.n_embd
        self.config.output_hidden_states=True
        self.model = model
        self.transformer = model.model
        self.v_head = nn.Linear(self.config.n_embd, 1, bias=False).to(self.model.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.tokenizer.pad_token = self.tokenizer.unk_token
        self.PAD_ID = self.tokenizer(self.tokenizer.pad_token)["input_ids"][0]

    def get_device(self):
        return self.model.device

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        position_ids=None,
    ):
        """
        input_ids, attention_mask: torch.Size([bs, seq_len])
        return: scores: List[bs]
        """
        bs = input_ids.shape[0]
        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        hidden_states = transformer_outputs[0]
        scores = []
        rewards = self.v_head(hidden_states).squeeze(-1)
        for i in range(bs):
            c_inds = (input_ids[i] == self.PAD_ID).nonzero()
            c_ind = c_inds[0].item() if len(c_inds) > 0 else input_ids.shape[1]
            scores.append(rewards[i, c_ind - 1])
        return scores

In [2]:
## Load the model and tokenizer
reward_model = GPTRewardModel("meta-llama/Llama-2-7b-chat-hf")
reward_tokenizer = reward_model.tokenizer
reward_tokenizer.truncation_side = "left"
reward_device = reward_model.get_device()
reward_batch_size = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
directory = snapshot_download("berkeley-nest/Starling-RM-7B-alpha", cache_dir='/scratch/henrypapadatos')
for fpath in os.listdir(directory):
    if fpath.endswith(".pt") or fpath.endswith("model.bin"):
        checkpoint = os.path.join(directory, fpath)
        break

# Load the model on the GPU  
# reward_model.load_state_dict(torch.load(checkpoint, map_location=reward_device), strict=False)
reward_model.load_state_dict(torch.load(checkpoint), strict=False)
reward_model.eval().requires_grad_(False)

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

GPTRewardModel(
  (model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 4096)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
            (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
            (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
            (act_fn): SiLUActivation()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): L

In [1]:
import pandas as pd
import os
data_folder = "../../datasets"
file_name = "MRPC_sycophantic.jsonl"

## Load the data
df = pd.read_json(os.path.join(data_folder, file_name), lines=True)
#create a column index and put it as the first column

#keep only when the column 'non_sense' is equal to 0
df = df[df['non_sense']==0]
#drop the column 'original_idx'
df = df.drop(columns=['original_idx'])
df['index'] = df.index
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
print(df.shape)
#add column for 'activations'
df['activations'] = None
df.head(1)

(34806, 11)


Unnamed: 0,index,text1,text2,prompt,assistant_opinion,human_opinion,choice_label,ground_truth,sycophancy,non_sense,certainty,activations
0,0,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr...","[INST] Hello, my name is Elbert Chouinard. I...",0,0,0,1,0,0,0.290981,


In [2]:
keep_number = 1000 * 6

#get all the value of the column 'certainty' for ground_truth == 0
df_false = df[df['ground_truth']==0]
#sort the values of df_false and keep the threshold value
df_false = df_false.sort_values(by='certainty',ascending=False)
#pl

#keep only the first threshold numebr of the values
df_false = df_false.iloc[:keep_number//2]

false_index = df_false['index'].tolist()

df_true = df[df['ground_truth']==1]
df_true = df_true.sort_values(by='certainty',ascending=False)
df_true = df_true.iloc[:keep_number//2]
true_index = df_true['index'].tolist()

keep_index = false_index + true_index

df = df[df['index'].isin(keep_index)]
df = df.drop(columns=['index'])

#reset the index
df = df.reset_index(drop=True)
print(df.shape)
df.describe()


(6000, 11)


Unnamed: 0,assistant_opinion,human_opinion,choice_label,ground_truth,sycophancy,non_sense,certainty
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.500333,0.501,0.499833,0.5,0.333167,0.0,0.652144
std,0.500042,0.500041,0.500042,0.500042,0.471385,0.0,0.101393
min,0.0,0.0,0.0,0.0,0.0,0.0,0.503906
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.550607
50%,1.0,1.0,0.0,0.5,0.0,0.0,0.682574
75%,1.0,1.0,1.0,1.0,1.0,0.0,0.727975
max,1.0,1.0,1.0,1.0,1.0,0.0,0.957278


In [7]:
size = process.memory_info().rss
print(f"Memory usage: {size/1e9:.2f} GB")

Memory usage: 0.92 GB


In [8]:
i = 0
with torch.no_grad():
    #iterate through the dataframe
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        #get the input text
        prompt = row['prompt']
        #tokenize the input text
        encodings_dict = reward_tokenizer(
            prompt,
            truncation=True,
            max_length=2048,
            padding="max_length",
            return_tensors="pt",
        ).to(reward_device)

        #get the model outputs
        output = reward_model.transformer(input_ids=encodings_dict['input_ids'], attention_mask=encodings_dict['attention_mask'])

        #get the last index where attention_mask is 1 - 1 to get the token corresponding to A or D
        token_index = (encodings_dict['attention_mask'][0]==1).nonzero().tolist()[-1][0] - 1
        selected_token = reward_tokenizer.convert_ids_to_tokens(encodings_dict['input_ids'][0])[token_index]
        assert selected_token=='A' or selected_token=='D'

        #put all the actiovations (in the tuple 'activations') to the cpu
        activations = tuple([activation[0,token_index,:].cpu() for activation in output.hidden_states])
        #print size of activations in MB
        # print(f"Size: {sum(activation.numpy().nbytes for activation in activations)/1e6:.2f} MB")

        df.at[index, 'activations'] = activations

        # #get the size of the df in memory
        # size = df.memory_usage(deep=True).sum()
        # #print the size of the df in MB
        # print(f"Size: {size/1e6:.2f} MB")
        # size = process.memory_info().rss
        # print(f"Memory: {size/1e9:.2f} GB")

        # if i == 5:
        #     raise Exception("Stop")

        #save the dataframe every 1000 rows
        if i % 1000 == 0 and i != 0:
            #save the dataframe
            name = "MRPC_sycophantic_activations_"+str(i)+".pkl"
            df.to_pickle(os.path.join(data_folder, name))
            print(f"Saved {i} rows")
            size = process.memory_info().rss
            print(f"Memory: {size/1e9:.2f} GB")

        i += 1


 17%|█▋        | 1001/6000 [30:02<4:01:09,  2.89s/it]

Saved 1000 rows
Memory: 1.77 GB


 33%|███▎      | 2001/6000 [1:00:08<4:22:15,  3.93s/it]

Saved 2000 rows
Memory: 2.39 GB


 50%|█████     | 3001/6000 [1:30:17<4:06:17,  4.93s/it]

Saved 3000 rows
Memory: 4.54 GB


 67%|██████▋   | 4001/6000 [2:00:28<3:27:12,  6.22s/it]

Saved 4000 rows
Memory: 4.56 GB


 83%|████████▎ | 5001/6000 [2:30:45<2:04:22,  7.47s/it]

Saved 5000 rows
Memory: 4.58 GB


100%|██████████| 6000/6000 [3:00:43<00:00,  1.81s/it]  


In [9]:
#save the dataframe
name = "MRPC_sycophantic_activations.pkl"
df.to_pickle(os.path.join(data_folder, name))
print("Saved all rows")

Saved all rows


In [7]:
test_pickle = pd.read_pickle(os.path.join(data_folder, "MRPC_sycophantic_activations.pkl"))

test_pickle = test_pickle.drop(columns=['index'])

name = "MRPC_sycophantic_activations.pkl"
test_pickle.to_pickle(os.path.join(data_folder, name))

print(test_pickle.shape)
test_pickle.head(6)

KeyError: "['index'] not found in axis"

In [8]:

test_pickle = test_pickle[test_pickle['activations'].notna()]
print('Initial shape: ', test_pickle.shape)

Initial shape:  (0, 11)


In [6]:
data_folder = "../../datasets"
dataset_file='MRPC_sycophantic_activations.pkl'
df = pd.read_pickle(os.path.join(data_folder, dataset_file))
print(df.shape)
df.head(1)

(6000, 11)


Unnamed: 0,text1,text2,prompt,assistant_opinion,human_opinion,choice_label,ground_truth,sycophancy,non_sense,certainty,activations
0,"That compared with $ 35.18 million , or 24 cen...",Earnings were affected by a non-recurring $ 8 ...,"[INST] Hello, my name is Enriqueta Minick. I...",0,0,0,0,1,0,0.754915,


In [13]:
temp_df = pd.read_pickle('../../temp/run2/MRPC_sycophantic_activations.pkl')
print(temp_df.shape)
temp_df.head(1)

(6000, 12)


Unnamed: 0,index,text1,text2,prompt,assistant_opinion,human_opinion,choice_label,ground_truth,sycophancy,non_sense,certainty,activations
0,64,"That compared with $ 35.18 million , or 24 cen...",Earnings were affected by a non-recurring $ 8 ...,"[INST] Hello, my name is Enriqueta Minick. I...",0,0,0,0,1,0,0.754915,"([tensor(0.0032), tensor(0.0014), tensor(-0.02..."


In [14]:
temp_df = temp_df.drop(columns=['index'])
#remove rows when 'certainty' is 0.6791787148
temp_df = temp_df[temp_df['text1']!="Bloomberg said on Wednesday that all 16 crew members survived and would be tested for drugs and alcohol ."]
print(temp_df.shape)

# name = "MRPC_sycophantic_activations.pkl"
# temp_df.to_pickle(os.path.join(data_folder, name))

(5995, 11)
