In [1]:
!pip install transformers trl wandb

Collecting trl
  Downloading trl-0.7.4-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting wandb
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting tyro>=0.5.11
  Downloading tyro-0.6.0-py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting setproctitle
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.38.0-py2.py3-none-any.whl (252 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [90]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

from trl import DPOTrainer
from trl.core import LengthSampler

In [None]:
import wandb

wandb.init()

In [5]:
def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [6]:
config = PPOConfig(
    model_name="lvwerra/gpt2-imdb",
    learning_rate=1.41e-5,
    log_with="wandb",
)

sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

In [7]:
dataset = build_dataset(config)


def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Found cached dataset imdb (/home/jpsingh/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Loading cached processed dataset at /home/jpsingh/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-cd96d6dda9f74d63.arrow
Loading cached processed dataset at /home/jpsingh/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-fd42dc5906724bf4.arrow


In [13]:
len(dataset['query']), len(set(dataset['query']))

(24895, 21171)

In [14]:
dataset[:10]

{'review': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far

In [15]:
dataset

Dataset({
    features: ['review', 'label', 'input_ids', 'query'],
    num_rows: 24895
})

In [None]:
# For each question generate two answers. Then run a bert classifier to get the sentiment score.
# Prefer the one with more positive sentiment.

# questions = set()
# for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
#     query_tensors = batch["input_ids"]

#     #### Get response from gpt2
#     response_tensors = []
#     for query in query_tensors:
#         gen_len = output_length_sampler()
#         generation_kwargs["max_new_tokens"] = gen_len
#         response = ppo_trainer.generate(query, **generation_kwargs)
#         response_tensors.append(response.squeeze()[-gen_len:])
#     batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

#     #### Compute sentiment score
#     texts = [q + r for q, r in zip(batch["query"], batch["response"])]
#     pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
#     rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

#     #### Run PPO step
#     stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
#     ppo_trainer.log_stats(stats, batch, rewards)



In [10]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token

In [17]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}


In [16]:
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msingh-japinder[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [41]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [19]:
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)
output_length_sampler

<trl.core.LengthSampler at 0x7f903c43af20>

In [20]:
gen_len = output_length_sampler()

In [33]:
dataset[:4]['input_ids']

[tensor([   40, 26399,   314,  3001,   327, 47269, 20958]),
 tensor([    1,    40,  1703, 44269,    25, 12550]),
 tensor([1532,  691,  284, 3368, 1642,  428]),
 tensor([1212, 2646])]

In [39]:
generation_kwargs["max_new_tokens"] = gen_len
device = ppo_trainer.accelerator.device
query = dataset[:4]['input_ids']
response = ppo_trainer.generate(query, **generation_kwargs)
response_tensors = [t.squeeze()[-gen_len:] for t in response]
decoded_response = [tokenizer.decode(r.squeeze()) for r in response_tensors]
response


[tensor([   40, 26399,   314,  3001,   327, 47269, 20958,   257,  1178,  2745,
          2084,   379,  3776, 19485,   810,   428], device='cuda:0'),
 tensor([    1,    40,  1703, 44269,    25, 12550,  2097,     1,   318,  1231,
           257,  4719,   530,   286,   616], device='cuda:0'),
 tensor([ 1532,   691,   284,  3368,  1642,   428, 13526,  3807,   804,   772,
         19863,   922,    11,   475,  6411], device='cuda:0'),
 tensor([ 1212,  2646,   373,  7867,   416, 28440,   338,  1380,  2645,    78,
            11], device='cuda:0')]

In [77]:
def generate_completions(batch):
    tokenized_inputs = batch['input_ids']
    queries = batch['query']
    # For each input generate two responses.
    response = ppo_trainer.generate(tokenized_inputs, **generation_kwargs)
    response_tensors = [t.squeeze()[-gen_len:] for t in response]
    decoded_response_1 = [tokenizer.decode(r.squeeze()) for r in response_tensors]
    
    response = ppo_trainer.generate(tokenized_inputs, **generation_kwargs)
    response_tensors = [t.squeeze()[-gen_len:] for t in response]
    decoded_response_2 = [tokenizer.decode(r.squeeze()) for r in response_tensors]
    
    # Collect the input and response for each.
    response_1 = [q + r for q, r in zip(queries, decoded_response_1)]
    response_2 = [q + r for q, r in zip(queries, decoded_response_2)]
    
    # Get the sentiment of response_1 and response_2 using bert sentiment classifier model.
    pipe_outputs_1 = sentiment_pipe(response_1, **sent_kwargs)    
    rewards_1 = [torch.tensor(output[1]["score"]) for output in pipe_outputs_1]
    
    pipe_outputs_2 = sentiment_pipe(response_2, **sent_kwargs)
    rewards_2 = [torch.tensor(output[1]["score"]) for output in pipe_outputs_2]
    
    # Compute the preferred and rejected.
    chosen = [r1 if t1.item() >= t2.item() else r2 for r1, r2, t1, t2 in zip(decoded_response_1, decoded_response_2, rewards_1, rewards_2)]
    rejected = [r2 if t1.item() >= t2.item() else r1 for r1, r2, t1, t2 in zip(decoded_response_1, decoded_response_2, rewards_1, rewards_2)]
    # return decoded_response_1, decoded_response_2, rewards_1, rewards_2, preferred, rejected
    return {
        'prompt': queries,
        'chosen': chosen,
        'rejected': rejected,
    }
    

In [78]:
batch  = generate_completions(dataset[:2])

In [79]:
dpo_dataset = dataset.map(generate_completions, batched=True)

                                                                 

In [80]:
dpo_dataset.save_to_disk('dpo_dataset')

                                                                                                 

In [81]:
dpo_dataset

Dataset({
    features: ['review', 'label', 'input_ids', 'query', 'prompt', 'chosen', 'rejected'],
    num_rows: 24895
})

In [82]:
from datasets import load_from_disk
dpo_dataset2 = load_from_disk('dpo_dataset')

In [83]:
dpo_dataset2

Dataset({
    features: ['review', 'label', 'input_ids', 'query', 'prompt', 'chosen', 'rejected'],
    num_rows: 24895
})

In [95]:
dpo_dataset2[:10]

{'review': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far

In [97]:
# Split dpo_dataset2 into train and test split.
dpo_dataset_train_test = dpo_dataset2.train_test_split(test_size=0.1, shuffle=True)
train_dataset = dpo_dataset_train_test['train']
eval_dataset = dpo_dataset_train_test['test']

In [98]:
batch_size = 32
max_steps = 1000
gradient_accumulation_steps = 1
learning_rate = 1e-3
gradient_checkpointing = False
beta = 0.1
gen_len = output_length_sampler()
max_new_tokens=gen_len
max_target_length = 128
max_prompt_length = 128

training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    max_steps=max_steps,
    remove_unused_columns=False,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    evaluation_strategy="steps",
    logging_first_step=True,
    logging_steps=10,  # match results in blog post
    eval_steps=500,
    output_dir="./test",
    optim="adamw_hf",
    warmup_steps=150,
    report_to=None,
    bf16=True,
    gradient_checkpointing=gradient_checkpointing,
    # TODO: uncomment that on the next transformers release
    # gradient_checkpointing_kwargs=script_args.gradient_checkpointing_kwargs,
)
model = AutoModelForCausalLM.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLM.from_pretrained(config.model_name)


dpo_trainer = DPOTrainer(
    model,
    ref_model,
    args=training_args,
    beta=beta,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    max_length=max_new_tokens,
    max_target_length=max_target_length,
    max_prompt_length=max_prompt_length,
    generate_during_eval=True,
)

In [101]:
import mlflow

# End the current run
mlflow.end_run()

In [102]:
dpo_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
500,0.3444,0.789826,-6.049232,-8.301433,0.371988,2.2522,-106.06662,-83.289574,-57.871819,-57.633175
1000,0.3445,0.835098,-6.83338,-9.375635,0.375635,2.542255,-116.808647,-91.131058,-60.79221,-60.542439


Trainer is attempting to log a value of "<wandb.data_types.Table object at 0x7f91c7fc3b80>" of type <class 'wandb.data_types.Table'> for key "game_log" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "<wandb.data_types.Table object at 0x7f91c7fc3b80>" of type <class 'wandb.data_types.Table'> for key "train/game_log" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "<wandb.data_types.Table object at 0x7f8fd47480d0>" of type <class 'wandb.data_types.Table'> for key "game_log" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "<wandb.data_types.Table object at 0x7f8fd47480d0>" of type <class 'wandb.data_types.Table'> for key "train/game_log" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorr

TrainOutput(global_step=1000, training_loss=0.3900393166542053, metrics={'train_runtime': 1514.1956, 'train_samples_per_second': 169.067, 'train_steps_per_second': 0.66, 'total_flos': 0.0, 'train_loss': 0.3900393166542053, 'epoch': 11.36})

In [103]:
gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}


In [104]:
#### get a batch from the dataset
bs = 16
game_data = dict()
dataset.set_format("pandas")
df_batch = dataset[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    output = ref_model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors_ref.append(output)
    output = model.generate(
        torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), max_new_tokens=gen_len, **gen_kwargs
    ).squeeze()[-gen_len:]
    response_tensors.append(output)

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
game_data["rewards (before)"] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
game_data["rewards (after)"] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results



Unnamed: 0,query,response (before),response (after),rewards (before),rewards (after)
0,This film is probably,"the funniest thing in the world,",best across and centre our films and Mer,2.612724,2.459077
1,"""A death at","sea of 2008"". To own this movie is simply a s...","""A death at increases increases in light<|endo...",-2.593113,-0.22293
2,Tim (Gary Daniels),buys him back the money he pays him to give him,Tim (Gary Daniels) The witty<|endoftext|>,0.32214,0.619284
3,This movie,is really scary and deadly on it's own.<,is witty & and enjoyable. A well written<|end...,2.484784,2.838642
4,This movie,I have saved 25 finales. I really wanted to,This movie is witty and compelling as its<|end...,0.098681,2.596799
5,This film is the freshman,in the class and he comes up,the freshman to Michael Summersberg<|endoftext|>,0.852844,0.475493
6,Tea Leon,", someone throws in a potion and tells Leon, """,Tea Leon is witty<|endoftext|>,0.390674,1.38845
7,Man on Fire was hot.,Christine Simeone and Amy Poehler did,. It is accompanied by his wonderful performan...,1.409156,2.648448
8,The movie has one,decent working R-rated title,that is witty as a romantic,0.319576,2.515166
9,"""The seventh sign"" borrows","heavily from the proverbs/memorabilia track ""","The seventh sign"" borrows elements from The Ch...",0.131347,0.039257


In [105]:
print("mean:")
display(df_results[["rewards (before)", "rewards (after)"]].mean())
print()
print("median:")
display(df_results[["rewards (before)", "rewards (after)"]].median())

mean:


rewards (before)    0.658153
rewards (after)     1.628321
dtype: float64


median:


rewards (before)    0.621759
rewards (after)     2.136567
dtype: float64