Mean Reward: The average reward the model receives for its generated responses during RL training. Maximizing this value is the primary goal of training.

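Objective KL Divergence: KL divergence (Kullback-Leibler divergence) measures the dissimilarity between two probability distributions. In PPO fine-tuning it is computed between the model being trained and the reference model, and it is used to keep the trained model from drifting too far from the reference.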

In [1]:
import torch
from transformers import GPT2Tokenizer

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

In [None]:
# Every object in this cell is created from the same pretrained checkpoint.
base_checkpoint = "gpt2"

# Model that the PPO trainer will optimise (causal LM wrapped with a value head).
model = AutoModelForCausalLMWithValueHead.from_pretrained(base_checkpoint)
# Second copy of the same checkpoint, handed to the trainer as the reference model.
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(base_checkpoint)

# One tokenizer serves both models; the EOS token doubles as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Build the PPO trainer.
# mini_batch_size is pinned to 1 together with batch_size: each training step
# in this notebook feeds a single (query, response, reward) triple, and TRL
# releases whose default mini-batch is larger than 1 reject a batch that is
# smaller than the mini-batch.
ppo_config = {
    "batch_size": 1,
    "mini_batch_size": 1,
    "learning_rate": 1e-5,
}
config = PPOConfig(**ppo_config)

# The trainer takes the config, the trainable model, the reference model and
# the tokenizer, in that order.
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)

In [None]:
# Prompt that the model is asked to continue.
query_txt = "This morning I went to the "

# Tokenize into a PyTorch tensor, then move it to the device of the wrapped model.
query_tensor = tokenizer.encode(query_txt, return_tensors="pt")
query_tensor = query_tensor.to(model.pretrained_model.device)

In [None]:
# Generation settings passed through to ppo_trainer.generate(...).
# Sampling is on, with top-k and nucleus filtering left wide open
# (top_k=0, top_p=1.0), and at most 20 new tokens are produced.
gen_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    # Must be the integer token id, not the token string (tokenizer.eos_token).
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 20,
}

In [None]:
# Sample a continuation for every query. The batch tensor is split along its
# first dimension into a list of per-query tensors, and return_prompt=False
# asks for the generated part only.
response_tensor = ppo_trainer.generate(
    list(query_tensor),
    return_prompt=False,
    **gen_kwargs,
)

# Turn the token ids of the first response back into text.
response_text = tokenizer.decode(response_tensor[0])

In [None]:
# Reward for the single sample: a constant 1.0 created on the model's device.
reward = [torch.tensor(1.0, device=model.pretrained_model.device)]

# Run one PPO optimisation step. Queries, responses and rewards are passed as
# lists of equal length (one entry each here).
train_stats = ppo_trainer.step(
    [query_tensor[0]],
    [response_tensor[0]],
    reward,
)

In [None]:
# Publish the trained model to the Hub ...
model.push_to_hub("fine_tune_gpt_ppo")

# ... or write it to a local directory.
model.save_pretrained("fine_tune_gpt_ppo")

# Reload the checkpoint with plain transformers; the TRL wrapper class is not
# needed for this. The identifier has to match the name used when saving above.
from transformers import AutoModelForCausalLM

# Bound to its own name on purpose: the reloaded model has no value head, and
# the value-head `model` is still needed by the PPOTrainer instances created
# further down.
inference_model = AutoModelForCausalLM.from_pretrained("fine_tune_gpt_ppo")

In [None]:
# Alternative optimizers that can be handed to PPOTrainer.
import bitsandbytes as bnb

# Plain SGD over the model's parameters, using the learning rate from the PPO config.
sgd_optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate)

# 8-bit Adam from bitsandbytes. The class name is `Adam8bit` (lower-case "b");
# `Adam8Bit` does not exist and raises AttributeError.
adam_optimizer = bnb.optim.Adam8bit(model.parameters(), lr=config.learning_rate)


In [None]:
# PPO trainers driven by custom optimizers.
# PPOTrainer must receive the PPOConfig instance (`config`); the raw settings
# dict `ppo_config` is rejected by the trainer's type check.
# NOTE(review): `model` has to be the value-head wrapper at this point -
# confirm it has not been rebound to a plain causal LM in an earlier cell.

# SGD with an exponentially decaying learning rate (multiplied by 0.9 per scheduler step).
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(sgd_optimizer, gamma=0.9)
updated_trainer_sgd = PPOTrainer(
    config,
    model,
    model_ref,
    tokenizer,
    optimizer=sgd_optimizer,
    lr_scheduler=lr_scheduler,
)

# Same setup with the 8-bit Adam optimizer and no scheduler.
updated_trainer_adam = PPOTrainer(
    config,
    model,
    model_ref,
    tokenizer,
    optimizer=adam_optimizer,
)