# RLHF Implementation

## Install and import packages and libraries

In [1]:
# %pip install --upgrade pip
# %pip install transformers trl wandb
# %pip install plotly -qqq

In [2]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead,RewardTrainer
from trl.core import LengthSampler
import random
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)


## Conifguring the model to be finetuned using RL

In [3]:
config = PPOConfig(
    model_name="gpt2",
    learning_rate=1.41e-5,
)

#sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

### Fetch data 

In [4]:
def build_dataset(config, input_min_text_length=2, input_max_text_length=200):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets

    #ds = load_dataset(dataset_name,'main', split="train[:10%]")

    df = pd.read_csv("./../input/feedback.csv",encoding='latin-1')
    ds = Dataset.from_pandas(df)

    ds = ds.rename_columns({"question": "review"})
    #ds = ds.filter(lambda x: len(x["review"]) > 20, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [5]:
dataset = build_dataset(config)


def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [6]:
dataset[:10]

{'review': ['What is your name?',
  'What is your name?',
  'Do you know why turkeys became the official food of thanksgiving? ',
  'Do you know why turkeys became the official food of thanksgiving? ',
  'How do I float on my back in the water?Ê',
  'How do I float on my back in the water?Ê'],
 'answer': ['Answer : Amit is my name ',
  'Answer : My name is Amit',
  'To be honest, I donÕt know anything about that. I know that IÕm meant to know a lot about history and current events, but I havenÕt been programmed with those particular facts, sorry',
  'I know that the American Indians were the first to celebrate the first thanksgiving, and the first European settlers came to New England in the early 1600s, and it is likely that they celebrated the first thanksgiving in the late 1600s. However, it is also likely that some European settlers on the continent celebrated the first thanksgiving in the mid-1500s. A lot of people think that the main factor in thanksgiving is that the settlers in

In [7]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

from transformers import T5Tokenizer, T5ForConditionalGeneration
# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl", device_map="auto")
# ref_model =  T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl", device_map="auto")
# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")
tokenizer.pad_token = tokenizer.eos_token

In [8]:
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)


In [9]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug


In [10]:
###############

In [11]:
rm_model_trained = AutoModelForSequenceClassification.from_pretrained("./../output/reward_model")
rm_tokenizer_trained = AutoTokenizer.from_pretrained("./../output/reward_model")

if rm_tokenizer_trained.pad_token is None:
    rm_tokenizer_trained.pad_token = rm_tokenizer_trained.eos_token
    rm_model_trained.config.pad_token_id = rm_model_trained.config.eos_token_id

In [12]:

text = ["this is really bad sentence","this is really goof  ec ece w"]
encoding = rm_tokenizer_trained(text, return_tensors="pt",padding=True,truncation=True)
outputs = rm_model_trained(**encoding)
predictions = outputs.logits.argmax(-1)
predictions,outputs.logits

(tensor([0, 0]),
 tensor([[-0.1686],
         [-0.1401]], grad_fn=<AddmmBackward0>))

In [13]:
gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}


In [14]:
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)


generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}


for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute sentiment score
    text = [q + r for q, r in zip(batch["query"], batch["response"])]
    encoding = rm_tokenizer_trained(text, return_tensors="pt",padding='max_length',truncation=True)
    outputs = rm_model_trained(**encoding)
    #rewards = outputs.logits.argmax(-1)
    #rewards = [torch.tensor(i) for i in rewards]
    rewards = [torch.tensor(i) for i in outputs.logits]

#     texts = [q + r for q, r in zip(batch["query"], batch["response"])]
#     pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
#     rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

0it [00:00, ?it/s]


In [15]:
2+2

4

## Evaluation

In [16]:
#### get a batch from the dataset
bs = 16
game_data = dict()
dataset.set_format("pandas")
df_batch = dataset[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    output = ref_model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors_ref.append(output)
    output = model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors.append(output)

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
encoding = rm_tokenizer_trained(texts, return_tensors="pt",padding='max_length',truncation=True)
outputs = rm_model_trained(**encoding)
game_data["rewards (before)"] = [torch.tensor(i) for i in outputs.logits]

texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
encoding = rm_tokenizer_trained(texts, return_tensors="pt",padding='max_length',truncation=True)
outputs = rm_model_trained(**encoding)
game_data["rewards (after)"] = [torch.tensor(i) for i in outputs.logits]

# # store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results