# RLHF Implementation

## Import packages and libraries

In [17]:
import torch
from tqdm import tqdm
import pandas as pd
tqdm.pandas()
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead,RewardTrainer
from trl.core import LengthSampler
import random
from datasets import Dataset, load_dataset
from transformers import AutoModelForSequenceClassification,AutoTokenizer,TrainingArguments,pipeline


## Conifguring the model to be finetuned using RL

In [18]:
config = PPOConfig(
    model_name="gpt2",
    learning_rate=1.41e-5)


## Data

In [19]:
def build_dataset(config, input_min_text_length=2, input_max_text_length=200):
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    df = pd.read_csv("./../input/feedback.csv")
    ds = Dataset.from_pandas(df)
    ds = ds.rename_columns({"question": "review"})
    input_size = LengthSampler(input_min_text_length, input_max_text_length)
    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample
    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [20]:
dataset = build_dataset(config)
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

In [21]:
dataset[:10]

{'review': ['Can Maximo Visual Inspection run on prem?\u200b\u200b\u200b',
  'What is watson knowledge catalog?',
  'What is watson knowledge catalog?',
  'Can Instana use OpenTelemetry trace data?\u200b\u200b\u200b\u200b\u200b',
  'format it on a table',
  'what are the differences between Watson Orchestrate and other IBM automation tools?',
  'what are the differences between Watson Orchestrate and other IBM automation tools?',
  'What Watson Orchestrate is for?',
  'What Watson Orchestrate is for?',
  'what are the differences between Watson Orchestrate and other IBM automation tools?'],
 'answer': [' Answer: Yes, Maximo Visual Inspection can be run on premise. ',
  ' Answer: The IBM Watson Knowledge Catalog is a data catalog that serves as a single version of the truth for different users including data engineers, business analysts, data analysts, data scientists, and data citizens. Users can gain access to data they can trust, govern, curate, share and manage within an organizatio

## Configuring PPO Trainer

In [23]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

In [24]:
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)


In [25]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"


## Loading Trained Reward Model

In [26]:
rm_model_trained = AutoModelForSequenceClassification.from_pretrained("./../output/reward_model")
rm_tokenizer_trained = AutoTokenizer.from_pretrained("./../output/reward_model")

if rm_tokenizer_trained.pad_token is None:
    rm_tokenizer_trained.pad_token = rm_tokenizer_trained.eos_token
    rm_model_trained.config.pad_token_id = rm_model_trained.config.eos_token_id

In [28]:
gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}


## Tuning using PPO 

In [29]:
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)


generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}


for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    text = [q + r for q, r in zip(batch["query"], batch["response"])]
    encoding = rm_tokenizer_trained(text, return_tensors="pt",padding='max_length',truncation=True)
    outputs = rm_model_trained(**encoding)
    rewards = [torch.tensor(i) for i in outputs.logits]

    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

  rewards = [torch.tensor(i) for i in outputs.logits]
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
1it [01:30, 90.60s/it]
