In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import math
import pandas as pd
import torch
import torch.nn as nn
from transformers import PreTrainedModel, LlamaConfig, LlamaModel, LlamaTokenizer
from tqdm import tqdm
import psutil
from typing import Optional, List

# psutil handle created without a pid; nothing in the visible cells reads it
# (presumably kept for ad-hoc memory inspection -- TODO confirm or remove).
process = psutil.Process()

## Define the reward model function class
class LlamaRewardModel(PreTrainedModel):
    """Llama backbone followed by a bias-free linear head that emits one scalar per token.

    `forward` returns, for every sequence in the batch, the scalar taken at the
    end position derived from the attention mask.
    """
    config_class = LlamaConfig

    def __init__(self, config):
        """Build the Llama backbone and the 1-dimensional regression head from `config`."""
        super().__init__(config)
        self.model = LlamaModel(config)
        self.regression_head = nn.Linear(self.config.hidden_size, 1, bias=False)
        # Turn on hidden-state output in the config; get_activations in this
        # notebook reads `outputs.hidden_states` from the backbone.
        self.config.output_hidden_states = True

    def forward( # args are the same as LlamaForCausalLM
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Score each sequence in the batch with a single scalar reward.

        Only `input_ids`, `attention_mask`, `position_ids`, `past_key_values`
        and `inputs_embeds` are forwarded to the backbone; the remaining
        arguments are accepted for signature compatibility and ignored.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: mask whose cumulative sum locates the end position
                of each sequence. When omitted, the last position of every
                sequence is used.

        Returns:
            Tensor of shape (batch, 1) holding the reward gathered at each
            sequence's end position.
        """
        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
        )

        hidden_states = transformer_outputs[0]
        # One scalar per token: (batch, seq, 1) -> (batch, seq).
        rewards = self.regression_head(hidden_states).squeeze(-1)

        if attention_mask is None:
            # The signature allows a missing mask; without one there is no
            # padding information, so read the reward at the final position.
            ends = torch.full(
                (rewards.shape[0], 1),
                rewards.shape[1] - 1,
                dtype=torch.long,
                device=rewards.device,
            )
        else:
            # argmax over the running sum of the mask gives the position where
            # the sum first reaches its maximum (assumes argmax returns the
            # first maximal index, as documented by torch -- i.e. the last
            # attended token for a right-padded mask).
            ends = attention_mask.cumsum(dim=1).argmax(dim=1).view(-1, 1)

        return torch.gather(rewards, 1, ends)

def make_prompt(question: str, answer:str):
    """Concatenate question and answer and rewrite the chat markers.

    The 'GPT4 Correct User:' / 'GPT4 Correct Assistant:' markers are replaced
    by 'Human: ' / '\\nAssistant:' and every '<|end_of_turn|>' is removed.
    """
    # Applied in order, each on the result of the previous substitution.
    substitutions = (
        ('GPT4 Correct User:', 'Human: '),
        ('GPT4 Correct Assistant:', '\nAssistant:'),
        ('<|end_of_turn|>', ''),
    )
    text = question + answer
    for marker, replacement in substitutions:
        text = text.replace(marker, replacement)
    return text

In [None]:
# Hub identifier of the reward model and the local directory used as download cache.
MODEL_NAME = "openbmb/UltraRM-13b"
CACHE_DIR = '/scratch/henrypapadatos'

tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

# Load the model directly in 16-bit precision. Passing torch_dtype avoids first
# materialising the full-precision weights and converting them with .half().
model = LlamaRewardModel.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    torch_dtype=torch.float16,
)

# Put the model on the GPU when one is available, in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

In [6]:
def get_activations(prompt, reward_model, reward_tokenizer):
    """Return the hidden states of the answer part of `prompt`.

    The prompt is tokenized and run through `reward_model.model` without
    gradients. The answer is taken to start right after the first occurrence of
    the token sequence 'Ass', 'istant', ':'.

    Args:
        prompt: full text containing the assistant marker and the answer.
        reward_model: object exposing `.model` (called with the tokenizer
            output) and `.device`.
        reward_tokenizer: tokenizer used both to encode the prompt and to map
            the ids back to token strings.

    Returns:
        Tuple with one CPU tensor per entry of `outputs.hidden_states`, each
        sliced to the first batch element and the positions from the answer
        start onwards.

    Raises:
        Warning: if the assistant marker is not found in the tokenized prompt
            (exception type kept unchanged for existing callers).
    """
    # Use the model's own device instead of a notebook-level global, so the
    # function also works when that global is missing or stale.
    inputs = reward_tokenizer(prompt, return_tensors="pt").to(reward_model.device)

    with torch.no_grad():
        outputs = reward_model.model(**inputs)

    # Token strings of the (single) encoded sequence.
    tokenized_input = reward_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Find the beginning of the answer. The scan starts at 2 so that idx-1 and
    # idx-2 can never wrap around to the end of the list and produce a false match.
    start_index = None
    for idx in range(2, len(tokenized_input)):
        if tokenized_input[idx] == ':' and tokenized_input[idx-1] == 'istant' and tokenized_input[idx-2] == 'Ass':
            start_index = idx + 1
            break
    if start_index is None:
        raise Warning('Beginning of the answer not found for prompt:\n', prompt)

    activations = tuple(activation[0, start_index:, :].cpu() for activation in outputs.hidden_states)

    return activations

In [11]:
# Comparison results to analyse. Exactly one path is active; the other cells
# of this notebook branch on whether the substring 'Bo' occurs in `file`.
# file = '../../results/Bo32_openchat_openchat_3.5_02_16_2024_00_08_55_feedback_poem_300-800_Starlingnew_comparison_gpt402_19_2024_18_09_07.csv'
file = '../../results/berkeley-nest_Starling-LM-7B-alpha_11_28_2023_11_29_06_feedback_poem_300_comparison_11_28_2023_11_42_11.csv'

df = pd.read_csv(file)

if 'Bo' in file:
    # NOTE(review): 'BoN_1_preferred_1' used to be selected twice, which made
    # the consistency check below always pass. Use the second judgement column
    # when the CSV provides one (name inferred from the naming pattern -- confirm);
    # otherwise fall back to the previous selection.
    second_preference = 'BoN_1_preferred_2' if 'BoN_1_preferred_2' in df.columns else 'BoN_1_preferred_1'
    df = df[['type', 'prompt', 'BoN_1', 'BoN_1_preferred_1', second_preference]].copy()
    # Rename to the column names used by the non-Bo result files.
    df.columns = ['type', 'prompt', 'response', 'preferred_1', 'preferred_2']

# Placeholder activation columns (not filled in this cell).
df['activations_sycophantic'] = None
df['activations_not_sycophantic'] = None

# Drop the rows whose type is 'wrote' or 'not_wrote'.
df = df[~df['type'].isin(['wrote', 'not_wrote'])].reset_index(drop=True)
print(df.shape)

# The loops below read the frame as consecutive triplets
# (row i: reference response, row i+1: 'like' prefix, row i+2: 'dislike' prefix).
assert len(df) % 3 == 0, 'expected rows in groups of three (base, like, dislike)'

# Keep a triplet only when both judgements agree for the 'like' and the
# 'dislike' row, and the 'like' feedback was judged more positive (1) while
# the 'dislike' feedback was judged less positive (0).
kept_rows = []
for base in range(0, len(df), 3):
    like, dislike = base + 1, base + 2
    # If GPT-4 is not consistent, the feedbacks are not considered strongly more or less positive.
    inconsistent = (
        df.loc[like, 'preferred_1'] != df.loc[like, 'preferred_2']
        or df.loc[dislike, 'preferred_1'] != df.loc[dislike, 'preferred_2']
    )
    wrong_direction = df.loc[like, 'preferred_1'] != 1 or df.loc[dislike, 'preferred_1'] != 0
    if inconsistent or wrong_direction:
        continue
    kept_rows.extend([base, like, dislike])

df = df.loc[kept_rows].reset_index(drop=True)
print(df.shape)

# Build one record per (prefix, triplet): the prefixed response is the
# sycophantic one, the reference response of the triplet the non-sycophantic one.
pair_columns = ['type', 'prompt', 'sycophantic_response', 'not_sycophantic_response']
records = []
for base in range(0, len(df), 3):
    for offset, label in ((1, 'like'), (2, 'dislike')):
        records.append({
            'type': label,
            'prompt': df.loc[base + offset, 'prompt'],
            'sycophantic_response': df.loc[base + offset, 'response'],
            'not_sycophantic_response': df.loc[base, 'response'],
        })
# Single construction instead of growing the frame with pd.concat inside the loop.
new_df = pd.DataFrame(records, columns=pair_columns)

print(new_df.shape)
new_df.head(2)

(180, 9)
(69, 9)
(46, 4)


Unnamed: 0,type,prompt,sycophantic_response,not_sycophantic_response
0,like,GPT4 Correct User:Please comment briefly on th...,\nThe poem is a beautiful exploration of the d...,\nThe poem is a powerful exploration of the du...
1,dislike,GPT4 Correct User:Please comment briefly on th...,\nThe poem presents a powerful contrast betwee...,\nThe poem is a powerful exploration of the du...


In [12]:
# Compute the activations of both responses for every row of new_df.
new_df['activations_sycophantic'] = None
new_df['activations_not_sycophantic'] = None

# (response column, column receiving its activations), processed in this order.
response_to_activation = (
    ('sycophantic_response', 'activations_sycophantic'),
    ('not_sycophantic_response', 'activations_not_sycophantic'),
)

for index, row in tqdm(new_df.iterrows(), total=new_df.shape[0]):
    question = row['prompt']
    for response_column, activation_column in response_to_activation:
        prompt = make_prompt(question, row[response_column])
        new_df.at[index, activation_column] = get_activations(prompt, model, tokenizer)

# Save the dataframe in pkl format; the target depends on which result file was loaded.
output_path = (
    '../../datasets/ultraRM/open_ended_test_feedback_activations_Bo.pkl'
    if 'Bo' in file
    else '../../datasets/ultraRM/open_ended_feedback_activations.pkl'
)
new_df.to_pickle(output_path)

# df.head()

100%|██████████| 46/46 [00:27<00:00,  1.70it/s]


In [13]:
# Reference pickle to compare against, chosen by the same 'Bo' switch as above.
test_file = (
    '/nas/ucb/henrypapadatos/Sycophancy/datasets/starling/open_ended_test_feedback_activations.pkl'
    if 'Bo' in file
    else '/nas/ucb/henrypapadatos/Sycophancy/datasets/starling/open_ended_feedback_activations.pkl'
)

test_df = pd.read_pickle(test_file)

# Both frames must describe the same number of examples.
assert len(test_df) == len(new_df)

# For every row and both activation columns, the first stored tensor must have
# the same size along its first dimension in the reference and in new_df.
for position in range(len(new_df)):
    reference_row = test_df.iloc[position]
    computed_row = new_df.iloc[position]
    for column in ('activations_sycophantic', 'activations_not_sycophantic'):
        assert reference_row[column][0].shape[0] == computed_row[column][0].shape[0]