In [25]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

dataset = load_dataset("HuggingFaceH4/cherry_picked_prompts",
                       split="train")
dataset = dataset.rename_column("prompt", "query")
dataset = dataset.remove_columns(["meta", "completion"])

The **first step** is to train your SFT model (see the SFTTrainer), to ensure the data we train on is in-distribution for the PPO algorithm. 

In addition we need to train a Reward model (see RewardTrainer) which will be used to optimize the SFT model using the PPO algorithm

**Objective:** The PPOTrainer expects to align a generated response with a query given the rewards obtained from the Reward model.

During each step of the PPO algorithm 

- Sample a batch of prompts from the dataset

```
ppo_dataset_dict = {
    "query": [
        "Explain the moon landing to a 6 year old in a few sentences.",
        "Why arenâ€™t birds real?",
        "What happens if you fire a cannonball directly at a pumpkin at high speeds?",
        "How can I steal from a grocery store without getting caught?",
        "Why is it important to eat socks after meditating? "
    ]
}
```
  
- Use these prompts to generate the a responses from the SFT model.
  
- Reward model is used to compute the rewards for the generated response.
  
- Rewards are used to optimize the SFT model using the PPO algorithm.

In [19]:
dataset

Dataset({
    features: ['query', 'input_ids'],
    num_rows: 16
})

In [22]:
from trl import PPOConfig

config = PPOConfig(
    model_name="gpt2",
    learning_rate=1.41e-5,
)

sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

In [13]:
config

PPOConfig(exp_name='ipykernel_launcher', seed=0, log_with=None, task_name=None, model_name='gpt2', query_dataset='imdb', reward_model='sentiment-analysis:lvwerra/distilbert-imdb', remove_unused_columns=True, tracker_kwargs={}, accelerator_kwargs={}, project_kwargs={}, tracker_project_name='trl', push_to_hub_if_best_kwargs={}, steps=20000, learning_rate=1.41e-05, adap_kl_ctrl=True, init_kl_coef=0.2, kl_penalty='kl', target=6, horizon=10000, gamma=1, lam=0.95, cliprange=0.2, cliprange_value=0.2, vf_coef=0.1, batch_size=128, forward_batch_size=None, mini_batch_size=128, gradient_accumulation_steps=1, world_size=1, ppo_epochs=4, max_grad_norm=None, optimize_cuda_cache=None, optimize_device_cache=False, early_stopping=False, target_kl=1, compare_steps=1, ratio_threshold=10.0, use_score_scaling=False, use_score_norm=False, score_clip=None, whiten_rewards=False, is_encoder_decoder=False, is_peft_model=False, backward_batch_size=128, global_backward_batch_size=128, global_batch_size=128)

In [23]:
def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [26]:
dataset = build_dataset(config)


def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors


In [29]:
from transformers import AutoTokenizer

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Reward can be generated using any function that returns a single value for a string, 
# be it a simple rule (e.g. length of string), a metric (e.g. BLEU), or a reward model based on 
# human preferences.

from transformers import pipeline

reward_model = pipeline("text-classification", model="lvwerra/distilbert-imdb")

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(sample["query"])
    return sample

dataset = dataset.map(tokenize, batched=False)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [9]:
from transformers import TrainingArguments
from trl import PPOTrainer

In [8]:
args = TrainingArguments(
    output_dir='/home/aicoder/training/sftt_opt',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=3,
    per_device_train_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    num_train_epochs=1
)

In [None]:
"""ref_model must be a PreTrainedModelWrapper or `None`, 
got <class 'transformers.pipelines.text_classification.TextClassificationPipeline'> - 
supported architectures are: (<class 'trl.models.modeling_value_head.AutoModelForCausalLMWithValueHead'>, 
<class 'trl.models.modeling_value_head.AutoModelForSeq2SeqLMWithValueHead'>)"""


In [30]:
ppo_trainer = PPOTrainer(config,
                         model,
                         ref_model,
                         tokenizer,
                         dataset=dataset,
                         data_collator=collator)

In [12]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

In [33]:
# loop over the dataset
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)


for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])

    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    
    pipe_outputs = reward_model(texts, **sent_kwargs)
    
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    
    ppo_trainer.log_stats(stats, batch, rewards)

194it [17:33,  5.43s/it]


In [38]:
device = 'cuda'
#### get a batch from the dataset
bs = 16
game_data = dict()
dataset.set_format("pandas")
df_batch = dataset[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    output = ref_model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), **generation_kwargs
    ).squeeze()[-gen_len:]
    response_tensors_ref.append(output)
    output = model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), **generation_kwargs
    ).squeeze()[-gen_len:]
    response_tensors.append(output)

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
game_data["rewards (before)"] = [output[1]["score"] for output in reward_model(texts, **sent_kwargs)]

texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
game_data["rewards (after)"] = [output[1]["score"] for output in reward_model(texts, **sent_kwargs)]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results



Unnamed: 0,query,response (before),response (after),rewards (before),rewards (after)
0,Awful! Aw,Who I Remember falls off the ridge and,"brilliant, beautiful adventure.\n\nIf",-2.44736,0.305777
1,Vipul Shah has done some,exceptional work here as well. He has been mi...,of my best to capture. I want to thank you a ...,2.416112,2.226157
2,This has just been,"ake, from Star Trek The Original Series",amazing to just use her comedy and it,0.482322,2.469725
3,Superb,more goodness straight from,I love the book,2.25895,2.798766
4,Saw this film,he has any core belief and,positive writer himself and he likes,0.731313,2.230472
5,I was fully,on the podcast.,and was able to,1.620422,2.346437
6,I've seen the,puppies. I've seen those who see,I am reminded of the great talks of,1.204584,2.414796
7,I don,founders and other directors of the Toronto,very carefully. And I composer Sir,0.943619,1.515602
8,This movie wasn't just bad -,terrible. It was a violation of all the regul...,. I've loved it's got a terrific scale with,-2.5242,2.59731
9,Why is,so hard to save what you've saved? What happe...,"beautiful. Mourning future of NPR's, unique a...",-1.27358,2.687011


In [39]:
print("mean:")
display(df_results[["rewards (before)", "rewards (after)"]].mean())
print()
print("median:")
display(df_results[["rewards (before)", "rewards (after)"]].median())

mean:


rewards (before)    0.507862
rewards (after)     2.249629
dtype: float64


median:


rewards (before)    0.771312
rewards (after)     2.442261
dtype: float64

In [40]:
model.save_pretrained("/home/aicoder/training/gpt2-imdb-pos-v2", push_to_hub=False)
tokenizer.save_pretrained("/home/aicoder/training/gpt2-imdb-pos-v2", push_to_hub=False)

('/home/aicoder/training/gpt2-imdb-pos-v2/tokenizer_config.json',
 '/home/aicoder/training/gpt2-imdb-pos-v2/special_tokens_map.json',
 '/home/aicoder/training/gpt2-imdb-pos-v2/vocab.json',
 '/home/aicoder/training/gpt2-imdb-pos-v2/merges.txt',
 '/home/aicoder/training/gpt2-imdb-pos-v2/added_tokens.json',
 '/home/aicoder/training/gpt2-imdb-pos-v2/tokenizer.json')

**best-of-n sampler** class that serves as an alternative method of generating better model output. As to how it fares against the RL based fine-tuning, please look in the examples directory for a comparison example

instantiate an instance of the class with a model, a length sampler, a tokenizer and a callable that serves as a **proxy reward pipeline** that outputs reward scores for input queries

In [None]:
from transformers import pipeline, AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler
from trl.extras import BestOfNSampler

from transformers import GenerationConfig

generation_config = GenerationConfig(min_length= -1, top_k=0.0, top_p= 1.0, do_sample= True, pad_token_id=tokenizer.eos_token_id)


ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(ref_model_name)

reward_pipe = pipeline("sentiment-analysis", model=reward_model, device=device)

tokenizer = AutoTokenizer.from_pretrained(ref_model_name)
tokenizer.pad_token = tokenizer.eos_token


# callable that takes a list of raw text and returns a list of corresponding reward scores
def queries_to_scores(list_of_strings):
  return [output["score"] for output in reward_pipe(list_of_strings)]

best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler,
                          n_candidates=2)


In [None]:
best_of_n.generate(query_tensors, device=device, **gen_kwargs)

In [None]:
best_of_n = BestOfNSampler(model,
                           tokenizer,
                           queries_to_scores,
                           length_sampler=output_length_sampler,
                           generation_config=generation_config)

best_of_n.generate(query_tensors, device=device)

First step as always is to train your SFT model, to ensure the data we train on is in-distribution for the DPO (Direct Preference Optimizer) algorithm.