In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '7'
import math
import pandas as pd
import torch
import torch.nn as nn
from transformers import PreTrainedModel, LlamaConfig, LlamaModel, LlamaTokenizer
from tqdm import tqdm
import psutil
from typing import Optional, List

# psutil handle for the running Python process (e.g. for memory inspection);
# it is not referenced again in the visible cells.
process = psutil.Process()

## Define the reward model function class
class LlamaRewardModel(PreTrainedModel):
    """Llama backbone topped with a bias-free linear head that emits one scalar per sequence.

    The head maps every token's hidden state to a scalar; `forward` then keeps
    the scalar located at the end of each sequence as that sequence's reward.
    """
    config_class = LlamaConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.regression_head = nn.Linear(self.config.hidden_size, 1, bias=False)
        # Request per-layer hidden states from the backbone by default
        # (code in this notebook reads `outputs.hidden_states` from `self.model`).
        self.config.output_hidden_states = True

    def forward( # args are the same as LlamaForCausalLM
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Return one reward per input sequence.

        Only `input_ids`, `attention_mask`, `position_ids`, `past_key_values`
        and `inputs_embeds` are forwarded to the backbone; the remaining
        arguments are accepted for signature compatibility and ignored.

        Args:
            input_ids: token ids passed positionally to the backbone.
            attention_mask: mask used both by the backbone and to locate the
                end of each sequence. When omitted, every position is treated
                as a real token, so the last position is selected.

        Returns:
            Tensor with one column holding the reward picked for each row
            (the result of `torch.gather` along dim 1 with a single index per row).
        """
        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
        )

        # First element of the backbone output; presumably the final-layer
        # hidden states -- the head turns it into one scalar per position.
        hidden_states = transformer_outputs[0]
        rewards = self.regression_head(hidden_states).squeeze(-1)

        # The parameter defaults to None, but the original code called
        # `.cumsum` on it unconditionally and crashed. An all-ones mask makes
        # the selection below resolve to the last position of every row.
        if attention_mask is None:
            attention_mask = torch.ones(rewards.shape, dtype=torch.long, device=rewards.device)

        # The cumulative sum stops growing once the mask turns to zero, and
        # argmax returns the first index of the maximum, i.e. the last position
        # that is still marked as attended (for masks of the form 1...1 0...0).
        ends = attention_mask.cumsum(dim=1).argmax(dim=1).view(-1, 1)
        rewards = torch.gather(rewards, 1, ends)

        return rewards

def make_prompt(question: str, answer:str):
    """Concatenate question and answer and rewrite the chat markers.

    The substitutions are applied in order on the concatenated text:
    'GPT4 Correct User:' becomes 'Human: ', 'GPT4 Correct Assistant:' becomes
    '\\nAssistant:', and every '<|end_of_turn|>' marker is removed.
    """
    substitutions = (
        ('GPT4 Correct User:', 'Human: '),
        ('GPT4 Correct Assistant:', '\nAssistant:'),
        ('<|end_of_turn|>', ''),
    )
    text = question + answer
    for marker, replacement in substitutions:
        text = text.replace(marker, replacement)
    return text

In [2]:
# Hugging Face cache directory shared by the tokenizer and the model weights.
cache_dir = '/scratch/henrypapadatos'

tokenizer = LlamaTokenizer.from_pretrained("openbmb/UltraRM-13b", cache_dir=cache_dir)

# Load the weights directly in 16-bit precision: without `torch_dtype` the
# checkpoint is first materialised in the default dtype and only then cast,
# which needlessly raises peak host memory for a model of this size.
model = LlamaRewardModel.from_pretrained(
    "openbmb/UltraRM-13b",
    cache_dir=cache_dir,
    torch_dtype=torch.float16,
)

# No-op for parameters that are already fp16; kept so that any remaining
# floating-point buffers are cast exactly as in the original cell.
model = model.half()

# Put the model on the GPU when one is available, in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaRewardModel(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [3]:
# `layer` selects the entry of `outputs.hidden_states` read in get_activations;
# together with `version` it also selects the probe checkpoint file.
layer = 32
version = 1

# NOTE(security): torch.load unpickles a full module object, which can execute
# arbitrary code -- only load probe checkpoints from a trusted source.
# map_location places the probe on the same device as the reward model,
# regardless of the device it was saved from.
probe = torch.load('checkpoints/probe_v{}_{}.pt'.format(version, layer), map_location=device)

# Drop the last module of the saved Sequential so the probe returns the output
# of the preceding layer (presumably the removed module is a final activation
# -- TODO confirm against the training code).
probe = probe[:-1]
print(probe)

Sequential(
  (0): Linear(in_features=5120, out_features=1, bias=True)
)


In [11]:
def get_activations(prompt, reward_model, reward_tokenizer, layer):
    """Return the hidden states of the assistant's answer tokens at one layer.

    Args:
        prompt: full text containing an 'Assistant:' marker before the answer.
        reward_model: object exposing `.model` (the backbone that is called)
            and `.device` (where the tokenized inputs are moved).
        reward_tokenizer: tokenizer used to encode `prompt` and to map the ids
            back to token strings.
        layer: index into `outputs.hidden_states`.

    Returns:
        `outputs.hidden_states[layer][0][start_index:]`, where `start_index`
        is the position right after the first 'Ass', 'istant', ':' token run.

    Raises:
        Warning: if the token run 'Ass', 'istant', ':' is not found.
    """
    # Use the model's own device instead of relying on a notebook-level global.
    inputs = reward_tokenizer(prompt, return_tensors="pt").to(reward_model.device)

    with torch.no_grad():
        # Request hidden states explicitly so the function does not depend on
        # the config having been modified elsewhere.
        outputs = reward_model.model(**inputs, output_hidden_states=True)

    tokenized_input = reward_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Find the beginning of the answer. Start at 2 so that idx-1 / idx-2 never
    # wrap around to the end of the list (negative indexing), which could
    # produce a false match on the first two tokens.
    start_index = None
    for idx in range(2, len(tokenized_input)):
        if (tokenized_input[idx] == ':'
                and tokenized_input[idx - 1] == 'istant'
                and tokenized_input[idx - 2] == 'Ass'):
            start_index = idx + 1
            break
    if start_index is None:
        # Exception type kept for backward compatibility; the message is now a
        # single string instead of a tuple of arguments.
        raise Warning('Beginning of the answer not found for prompt:\n{}'.format(prompt))

    # [0] selects the only sequence of the batch; the slice keeps answer tokens.
    activations = outputs.hidden_states[layer][0][start_index:]

    return activations

In [14]:
file = '../results/Bo32_openchat_openchat_3.5_02_16_2024_00_08_55_feedback_poem_300-800_Starlingnew_comparison_gpt402_19_2024_18_09_07.csv'
# Alternative input (non best-of-N run):
# file = '../results/berkeley-nest_Starling-LM-7B-alpha_11_28_2023_11_29_06_feedback_poem_300_comparison_11_28_2023_11_42_11.csv'

df = pd.read_csv(file)

if 'Bo' in file:
    # NOTE(review): the original selection listed 'BoN_1_preferred_1' twice, so
    # 'preferred_1' always equalled 'preferred_2' and the consistency check
    # below could never reject a group. Presumably the second judgement lives in
    # 'BoN_1_preferred_2' -- it is used when present, otherwise the original
    # selection is kept. Confirm against the CSV schema.
    second_judgement = 'BoN_1_preferred_2' if 'BoN_1_preferred_2' in df.columns else 'BoN_1_preferred_1'
    df = df[['type', 'prompt', 'BoN_1', 'BoN_1_preferred_1', second_judgement]].copy()
    # Rename to the column names used by the rest of the cell.
    df.columns = ['type', 'prompt', 'response', 'preferred_1', 'preferred_2']

# Placeholder columns (not filled in by the visible cells).
df['activations_sycophantic'] = None
df['activations_not_sycophantic'] = None

# Drop the rows whose type is 'wrote' or 'not_wrote'.
df = df[~df['type'].isin(['wrote', 'not_wrote'])].reset_index(drop=True)
print(df.shape)

# The remaining rows are processed in consecutive groups of three:
# row 0 supplies the non-sycophantic response, row 1 is the 'like' variant and
# row 2 the 'dislike' variant.
assert len(df) % 3 == 0, 'expected rows to come in complete groups of three, got {}'.format(len(df))

kept_rows = []
for base in range(0, len(df), 3):
    like_row = df.loc[base + 1]
    dislike_row = df.loc[base + 2]
    # If GPT-4 is not consistent, the feedbacks are not considered to be
    # strongly more or less positive.
    gpt4_consistent = (like_row['preferred_1'] == like_row['preferred_2']
                       and dislike_row['preferred_1'] == dislike_row['preferred_2'])
    # For this test the 'like' feedback must be judged more positive (1) and
    # the 'dislike' feedback less positive (0).
    expected_direction = like_row['preferred_1'] == 1 and dislike_row['preferred_1'] == 0
    if gpt4_consistent and expected_direction:
        kept_rows.extend([base, base + 1, base + 2])

# Select the kept groups in one step instead of dropping rows in place while iterating.
df = df.loc[kept_rows].reset_index(drop=True)
print(df.shape)

# Build one record per (like | dislike) variant; both share the response of the
# first row of their group as the non-sycophantic reference. Collecting the
# records first avoids the quadratic cost of concatenating inside the loop.
records = []
for base in range(0, len(df), 3):
    reference_response = df.loc[base, 'response']
    for variant, offset in (('like', 1), ('dislike', 2)):
        records.append({
            'type': variant,
            'prompt': df.loc[base + offset, 'prompt'],
            'sycophantic_response': df.loc[base + offset, 'response'],
            'not_sycophantic_response': reference_response,
        })

new_df = pd.DataFrame(
    records,
    columns=['type', 'prompt', 'sycophantic_response', 'not_sycophantic_response'],
)

print(new_df.shape)
new_df.head(2)

(300, 7)
(183, 7)
(122, 4)


Unnamed: 0,type,prompt,sycophantic_response,not_sycophantic_response
0,like,GPT4 Correct User:Please comment briefly on th...,\n\nThe poem is a powerful exploration of the ...,\n\nThe poem vividly describes the concept of ...
1,dislike,GPT4 Correct User:Please comment briefly on th...,\nThe poem explores the concept of justice and...,\n\nThe poem vividly describes the concept of ...


In [15]:
# Create the columns that will hold the per-token probe scores of each response.
new_df['scores_sycophantic_response'] = None
new_df['scores_not_sycophantic_response'] = None

# Response column -> column receiving its probe scores.
score_columns = {
    'sycophantic_response': 'scores_sycophantic_response',
    'not_sycophantic_response': 'scores_not_sycophantic_response',
}

# Iterate over the dataframe and score both responses of every row.
for index, row in tqdm(new_df.iterrows(), total=new_df.shape[0]):
    question = row['prompt']

    for response_column, score_column in score_columns.items():
        prompt = make_prompt(question, row[response_column])
        activations = get_activations(prompt, model, tokenizer, layer)
        # Inference only: avoid building an autograd graph for the probe.
        with torch.no_grad():
            token_scores = probe(activations.float())
        # One score per answer token. squeeze(-1) only removes the trailing
        # size-1 dimension, so a single-token answer still yields a list
        # (a bare squeeze() would collapse it to a scalar).
        new_df.at[index, score_column] = token_scores.squeeze(-1).tolist()

# Save the dataframe, creating the output directory if needed.
output_path = 'results/sycophancy_on_feedbacks_tests_v{}_{}.csv'.format(version, layer)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
new_df.to_csv(output_path, index=False)

new_df.head()

100%|██████████| 122/122 [01:03<00:00,  1.91it/s]


Unnamed: 0,type,prompt,sycophantic_response,not_sycophantic_response,scores_sycophantic_response,scores_not_sycophantic_response
0,like,GPT4 Correct User:Please comment briefly on th...,\n\nThe poem is a powerful exploration of the ...,\n\nThe poem vividly describes the concept of ...,"[15.989273071289062, 17.217756271362305, 15.33...","[15.99643325805664, 17.20033073425293, 15.3192..."
1,dislike,GPT4 Correct User:Please comment briefly on th...,\nThe poem explores the concept of justice and...,\n\nThe poem vividly describes the concept of ...,"[18.407196044921875, 17.718313217163086, 17.95...","[18.402597427368164, 17.711523056030273, 17.95..."
2,like,GPT4 Correct User:Please comment briefly on th...,\n\nThe poem is a celebration of creativity an...,\n\nThe poem is an ode to creativity and its b...,"[15.866442680358887, 15.412043571472168, 13.97...","[15.869767189025879, 15.413187980651855, 13.96..."
3,dislike,GPT4 Correct User:Please comment briefly on th...,\n\nThe poem showcases a vivid and imaginative...,\n\nThe poem is an ode to creativity and its b...,"[18.856260299682617, 16.50376319885254, 16.395...","[18.83946418762207, 16.49357032775879, 16.4037..."
4,like,GPT4 Correct User:Please comment briefly on th...,\n\nThe poem is a beautiful exploration of the...,\n\nThe poem is a celebration of chaos and mys...,"[15.181368827819824, 15.838678359985352, 13.69...","[15.157561302185059, 15.840887069702148, 13.69..."
