# RLHF Implementation

## Install and import packages and libraries

In [2]:
# %pip install --upgrade pip
# %pip install transformers trl wandb
#%pip install plotly -qqq

In [3]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead,RewardTrainer
from trl.core import LengthSampler
import random
from operator import itemgetter
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)


## Reward Model Training

### Fetch and process dataset

In [4]:
df = pd.read_csv('feedback.csv')
df['tup'] = list(zip(df['answer'], df['feedback']))
df_g = df.groupby('question')['tup'].apply(list).reset_index()

df_g["sorted_tup"] = df_g["tup"].apply(lambda x :sorted(x,key=itemgetter(1)) )
df_g["chosen"] = df_g["sorted_tup"].apply(lambda x: x[-1][0])
df_g["chosen_score"] = df_g["sorted_tup"].apply(lambda x: x[-1][1])
df_g["rejected"] = df_g["sorted_tup"].apply(lambda x: x[0][0])
df_g["rejected_score"] = df_g["sorted_tup"].apply(lambda x: x[0][1])
df_g = df_g.dropna()
df_g.to_csv("treated_feedback.csv")

df_g = df_g[(df_g['chosen_score']>=4.0) & (df_g['rejected_score']<4.0)]
#df_g.to_csv("treated_feedback.csv")

# build a dataset with chosen and rejected responses
rows = []
for record in df_g.itertuples(index=True, name='Pandas'):
    if record is None or len(record) == 0:
        continue
    # build rows for rm training
    rows.append({
        "instruction": record.question,
        "chosen_response": record.chosen,
        "rejected_response": record.rejected
    })

# build dataset for training
prepared_dataset = Dataset.from_list(rows)
prepared_dataset.to_pandas()

Unnamed: 0,instruction,chosen_response,rejected_response
0,Can Instana use OpenTelemetry trace data?​,"Yes, Instana can use OpenTelemetry trace data....","Answer: Yes, Instana can use OpenTelemetry tr..."
1,Can Instana use OpenTelemetry trace data?​​​​​,"Yes, Instana can ingest OpenTelemetry trace da...","Answer: Yes, Instana can use OpenTelemetry tr..."
2,Can Maximo Visual Inspection run on prem?​​​,Answer: Yes. Maximo Visual Inspection is a cl...,"Answer: Yes, Maximo Visual Inspection can be ..."
3,Explain me step by step how can I integrate da...,ive tried to follow the documentation but I do...,ive tried to do it but it is not working. Answ...
4,What is watson knowledge catalog?,The IBM Watson Knowledge Catalog is a data cat...,Answer: The IBM Watson Knowledge Catalog is a...
5,what is cloud pak for watson aiops,? Answer: Cloud Pak for Watson AIOps is an AI-...,? Answer: Cloud Pak for Watson AIOps is a plat...
6,what is ibm?,Answer: IBM is an American multinational techn...,Answer: IBM is an American multinational techn...
7,which cloudpak provides business process autom...,? Answer: IBM Cloud Pak for Automation,? : What is the difference between IBM Cloud P...
8,who is the CEO of IBM?,Answer: Arvind Krishna Answer: Arvind Krishna,.


### Configuing the base model

In [25]:
#To train Reward Model we need to choose a base model to fine-tune.
rm_model_name = "distilroberta-base"

rm_model = AutoModelForSequenceClassification.from_pretrained(rm_model_name, num_labels=1)
rm_tokenizer = AutoTokenizer.from_pretrained(rm_model_name)

if rm_tokenizer.pad_token is None:
    rm_tokenizer.pad_token = rm_tokenizer.eos_token
    rm_model.config.pad_token_id = rm_model.config.eos_token_id
#This function combines instructions with chosen and rejected responses, creating two new strings.
#These strings are tokenized, becoming input for a reward model that learns to distinguish between good and bad responses based on these examples.
#The model will be optimized to assign higher values to preferred responses and lower values to rejected responses.
def formatting_func(examples):
    kwargs = {"padding": "max_length", "truncation": True, "max_length": 512, "return_tensors": "pt"}

    # Prepend the prompt and a line break to the original_response and response-1 fields.
    prompt_plus_chosen_response = examples["instruction"] + "\n" + examples["chosen_response"]
    prompt_plus_rejected_response = examples["instruction"] + "\n" + examples["rejected_response"]

    # Then tokenize these modified fields.
    tokens_chosen = rm_tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = rm_tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)

    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }

formatted_dataset = prepared_dataset.map(formatting_func)
formatted_dataset = formatted_dataset.train_test_split()

rm_training_args = TrainingArguments(
    output_dir="./reward_model",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    logging_steps=200,
    num_train_epochs = 1,

)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

### Training the reward model

In [None]:

rm_trainer = RewardTrainer(
    model=rm_model,
    args=rm_training_args,
    tokenizer=rm_tokenizer,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
)

rm_trainer.train() 

### Save the trained reward model

In [6]:
rm_trainer.save_model() 

## Conifguring the model to be finetuned using RL

In [7]:
config = PPOConfig(
    model_name="lvwerra/gpt2-imdb",
    learning_rate=1.41e-5,
    #log_with="wandb",
)

sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

### Fetch data 

In [8]:
def build_dataset(config, dataset_name="gsm8k", input_min_text_length=2, input_max_text_length=200):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets

    #ds = load_dataset(dataset_name,'main', split="train[:10%]")

    df = pd.read_csv("feedback.csv")
    ds = Dataset.from_pandas(df)

    ds = ds.rename_columns({"question": "review"})
    #ds = ds.filter(lambda x: len(x["review"]) > 20, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [9]:
dataset = build_dataset(config)


def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

In [10]:
dataset[:10]

{'review': ['Can Maximo Visual Inspection run on prem?\u200b\u200b\u200b',
  'What is watson knowledge catalog?',
  'What is watson knowledge catalog?',
  'Can Instana use OpenTelemetry trace data?\u200b\u200b\u200b\u200b\u200b',
  'format it on a table',
  'what are the differences between Watson Orchestrate and other IBM automation tools?',
  'what are the differences between Watson Orchestrate and other IBM automation tools?',
  'What Watson Orchestrate is for?',
  'What Watson Orchestrate is for?',
  'what are the differences between Watson Orchestrate and other IBM automation tools?'],
 'answer': [' Answer: Yes, Maximo Visual Inspection can be run on premise. ',
  ' Answer: The IBM Watson Knowledge Catalog is a data catalog that serves as a single version of the truth for different users including data engineers, business analysts, data analysts, data scientists, and data citizens. Users can gain access to data they can trust, govern, curate, share and manage within an organizatio

In [11]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

from transformers import T5Tokenizer, T5ForConditionalGeneration
# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl", device_map="auto")
# ref_model =  T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl", device_map="auto")
# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")
tokenizer.pad_token = tokenizer.eos_token

In [12]:
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)


In [13]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug


In [14]:
###############

In [15]:
rm_model_trained = AutoModelForSequenceClassification.from_pretrained("./reward_model")
rm_tokenizer_trained = AutoTokenizer.from_pretrained("./reward_model",padding=True,truncation=True)

if rm_tokenizer_trained.pad_token is None:
    rm_tokenizer_trained.pad_token = rm_tokenizer_trained.eos_token
    rm_model_trained.config.pad_token_id = rm_model_trained.config.eos_token_id
    
    
# text = ["this is one sentence", "this is another sentence"]
# encoding = tokenizer(text, return_tensors="pt")

# # forward pass
# outputs = model(**encoding)
# predictions = outputs.logits.argmax(-1)

In [16]:

text = ["this is really bad sentence","this is really goof  ec ece w"]
rm_tokenizer_trained = AutoTokenizer.from_pretrained("./reward_model")

encoding = rm_tokenizer_trained(text, return_tensors="pt",padding=True,truncation=True)

# # forward pass
# outputs = rm_model_trained(**encoding)
# predictions = outputs.logits.argmax(-1)
# predictions
# outputs.logits

In [17]:
# text = "this movie was really bad!!"
# sentiment_pipe(text, **sent_kwargs)


# RLHF

In [18]:
gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}


In [21]:
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)


generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}


for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute sentiment score
    text = [q + r for q, r in zip(batch["query"], batch["response"])]
    encoding = rm_tokenizer_trained(text, return_tensors="pt",padding='max_length',truncation=True)
    outputs = rm_model_trained(**encoding)
    #rewards = outputs.logits.argmax(-1)
    #rewards = [torch.tensor(i) for i in rewards]
    rewards = [torch.tensor(i) for i in outputs.logits]

#     texts = [q + r for q, r in zip(batch["query"], batch["response"])]
#     pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
#     rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

  rewards = [torch.tensor(i) for i in outputs.logits]
1it [02:03, 123.13s/it]


In [22]:
2+2

4

In [23]:
# #### get a batch from the dataset
# bs = 16
# game_data = dict()
# dataset.set_format("pandas")
# df_batch = dataset[:].sample(bs)
# game_data["query"] = df_batch["query"].tolist()
# query_tensors = df_batch["input_ids"].tolist()

# response_tensors_ref, response_tensors = [], []

# #### get response from gpt2 and gpt2_ref
# for i in range(bs):
#     gen_len = output_length_sampler()
#     output = ref_model.generate(
#         torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
#     ).squeeze()[-gen_len:]
#     response_tensors_ref.append(output)
#     output = model.generate(
#         torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
#     ).squeeze()[-gen_len:]
#     response_tensors.append(output)

# #### decode responses
# game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
# game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

# #### sentiment analysis of query/response pairs before/after
# texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
# game_data["rewards (before)"] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

# texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
# game_data["rewards (after)"] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

# # store results in a dataframe
# df_results = pd.DataFrame(game_data)
# df_results

NameError: name 'sentiment_pipe' is not defined

In [24]:
# print("mean:")
# display(df_results[["rewards (before)", "rewards (after)"]].mean())
# print()
# print("median:")
# display(df_results[["rewards (before)", "rewards (after)"]].median())

mean:


NameError: name 'df_results' is not defined