In [1]:
import torch
from tqdm import tqdm

from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

# Using IMDB dataset
# This function builds the dataset for training
# Input - config (PPOConfig), dataset_name, min_max text lengths
# Output - dataset of prompts (states) for the RL environment

In [3]:
def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    # Build a dataset to be used for the training.
    # It is a series of prompts (each with different length chosen randomly)
    # We will use it to generate the responses and compute the rewards.
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    
    # load the IMDB dataset
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    
    # Only choose reviews with more than 200 tokens
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
    input_size = LengthSampler(input_min_text_length, input_max_text_length)
    
    def tokenize(sample):
        # From each review just keep the first `input_size` tokens, this represents the prompt used to generate the response
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample
        
    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [4]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [5]:
config = PPOConfig(
    # Finetuned GPT2
    model_name="lvwerra/gpt2-imdb",
    learning_rate=1.41e-5,
    log_with="wandb",
)

In [56]:
# import wandb
# wandb.init()

In [7]:
dataset = build_dataset(config)



In [21]:
print(len(dataset))

24895


In [24]:
print(dataset)

Dataset({
    features: ['review', 'label', 'input_ids', 'query'],
    num_rows: 24895
})


In [18]:
# This is the model we are going to fine-tune with PPO
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)

# This is the reference model (frozen) for the KL divergence
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)

In [25]:
print(model)

AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )

In [19]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token



In [26]:
print(tokenizer)

GPT2TokenizerFast(name_or_path='lvwerra/gpt2-imdb', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [20]:
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [27]:
print(ppo_trainer)

<trl.trainer.ppo_trainer.PPOTrainer object at 0x7fda207b7340>


In [28]:
print(device)

0


In [29]:
# This is the reward model: a "positive" (e.g. a positive review) response will be given a high reward, a "negative" response will be given a low reward
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)

# Print some examples of sentiments generated by the reward model
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

text = "this movie was really bad!!"
print(sentiment_pipe(text, **sent_kwargs))

text = "this movie was really good!!"
print(sentiment_pipe(text, **sent_kwargs)) # [{'label': 'NEGATIVE', 'score': -2.335047960281372}, {'label': 'POSITIVE', 'score': 2.557039737701416}]



[[{'label': 'NEGATIVE', 'score': 2.335048198699951}, {'label': 'POSITIVE', 'score': -2.726576566696167}]]
[[{'label': 'NEGATIVE', 'score': -2.294790029525757}, {'label': 'POSITIVE', 'score': 2.557039737701416}]]


In [44]:
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# The configuration to generate responses (trajectories)
response_generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

In [49]:
response_generation_kwargs

{'min_length': -1,
 'top_k': 0.0,
 'top_p': 1.0,
 'do_sample': True,
 'pad_token_id': 50256,
 'max_new_tokens': 7}

In [55]:
# Dataloader is passed in at initialization
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    print(batch.keys())
    query_tensors = batch["input_ids"]

    #### Phase 1: Get trajectories from the offline policy
    # In this case we are only generating the responses, but not computing the log probabilities, which will be computed internally by the PPOTrainer.
    response_tensors = []

    # For every one of the queries (sample starting reviews)
    for query in query_tensors:
        gen_len = output_length_sampler()
        # Update the dictionary
        response_generation_kwargs["max_new_tokens"] = gen_len # Number of tokens to generate (chosen randomly)
        # Generate a response, and only save the response token ids
        response = ppo_trainer.generate(query, **response_generation_kwargs) # It returns the (query + response) tokens
        response_tensors.append(response.squeeze()[-gen_len:]) # Only take the tokens corresponding to the generated response (remove the prompt/query from the beginning)
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Phase 1: Compute rewards
    # Join the query (prompt) + response (generated tokens)
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    # Compute the reward for each of the texts (query + response)
    # shape: A list of dictionaries with two keys: POSITIVE and NEGATIVE. We are interested in the POSITIVE score. This will be our reward.
    # List of lists [[][][][]]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs) # [{'label': 'NEGATIVE', 'score': -2.335047960281372}, {'label': 'POSITIVE', 'score': 2.557039737701416}]
    
    # The reward for each text is the score (logit) corresponding to the POSITIVE class. 
    # Shape: A list of scalars, one for each generated response. 
    # It means we assign the reward to the whole response (not to each token).
    # We pick the NEGATIVE score to train the model
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    
    #### Phase 1 + Phase 2: calculate the logprobs and then run the PPO update
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    
    ppo_trainer.log_stats(stats, batch, rewards)

0it [00:00, ?it/s]

dict_keys(['label', 'input_ids', 'query'])


0it [00:02, ?it/s]

KeyboardInterrupt



In [None]:
# Save the model
model.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=False)
tokenizer.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=False)