In [1]:
!pip install git+https://github.com/lvwerra/trl.git@25fa1bd

Collecting git+https://github.com/lvwerra/trl.git@25fa1bd
  Cloning https://github.com/lvwerra/trl.git (to revision 25fa1bd) to /tmp/pip-req-build-2k3cm3l5
  Running command git clone --filter=blob:none --quiet https://github.com/lvwerra/trl.git /tmp/pip-req-build-2k3cm3l5
[0m  Running command git checkout -q 25fa1bd
  Resolved https://github.com/lvwerra/trl.git to commit 25fa1bd
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: trl
  Building wheel for trl (setup.py) ... [?25ldone
[?25h  Created wheel for trl: filename=trl-0.4.2.dev0-py3-none-any.whl size=67533 sha256=72c911b073f53c7babc812242a349cf76e392b705b732adbf807da7c201d12ab
  Stored in directory: /tmp/pip-ephem-wheel-cache-e8wayng8/wheels/24/b4/20/2fa3a1e47c0411c39e198029315e3af2a2c1d59132913f136f
Successfully built trl
Installing collected packages: trl
Successfully installed trl-0.4.2.dev0


In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

In [3]:
model_name="google/flan-t5-base"
#model_name="google/flan-t5-large"
huggingface_dataset_name = "knkarthick/dialogsum"

dataset_original = load_dataset(huggingface_dataset_name)

dataset_original

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length, 
                  input_max_text_length):

    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Name of the dataset to load.
    - input_min_text_length (int): Minimum length of the dialogues.
    - input_max_text_length (int): Maximum length of the dialogues.
        
    Returns:
    - dataset_splits (datasets.dataset_dict.DatasetDict): Preprocessed dataset containing train and test parts.
    """
    
    # load dataset (only "train" part will be enough for this lab).
    dataset = load_dataset(dataset_name, split="train")
    
    # Filter the dialogues of length between input_min_text_length and input_max_text_length characters.
    dataset = dataset.filter(lambda x: len(x["dialogue"]) > input_min_text_length and len(x["dialogue"]) <= input_max_text_length, batched=False)

    # Prepare tokenizer. Setting device_map="auto" allows to switch between GPU and CPU automatically.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")
    
    def tokenize(sample):
        
        # Wrap each dialogue with the instruction.
        prompt = f"""
Summarize the following conversation.

{sample["dialogue"]}

Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)
        
        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")
    
    # Split the dataset into train and test parts.
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits

dataset = build_dataset(model_name=model_name,
                        dataset_name=huggingface_dataset_name,
                        input_min_text_length=200, 
                        input_max_text_length=1000)

print(dataset)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/10022 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8017
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2005
    })
})


In [5]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"\ntrainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [6]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, 
                                              torch_dtype=torch.bfloat16)

peft_model = PeftModel.from_pretrained(model, 
                                       './peft-dialogue-summary-checkpoint-local/', 
                                       lora_config=lora_config,
                                       torch_dtype=torch.bfloat16, 
                                       device_map="auto",                                       
                                       is_trainable=True)

print(f'PEFT model parameters to be updated:\n{print_number_of_trainable_model_parameters(peft_model)}\n')

PEFT model parameters to be updated:

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%



In [7]:
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,                                                               
                                                               torch_dtype=torch.bfloat16,
                                                               is_trainable=True)

print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)

PPO model parameters to be updated (ValueHead + 769 params):

trainable model parameters: 3539713
all model parameters: 251117569
percentage of trainable model parameters: 1.41%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [8]:
ref_model = create_reference_model(ppo_model)

print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

Reference model parameters to be updated:

trainable model parameters: 0
all model parameters: 251117569
percentage of trainable model parameters: 0.00%



In [9]:
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map="auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name, device_map="auto")
print(toxicity_model.config.id2label)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{0: 'nothate', 1: 'hate'}


In [10]:
non_toxic_text = "#Person 1# tells Tommy that he didn't like the movie."

toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt").input_ids

#logits = toxicity_model(input_ids=toxicity_input_ids).logits
logits = toxicity_model(input_ids=toxicity_input_ids.to("cuda")).logits

print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# get the logits for "not hate" - this is the reward!
not_hate_index = 0
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (high): {nothate_reward}')

logits [not hate, hate]: [3.114102840423584, -2.489619255065918]
probabilities [not hate, hate]: [0.9963293671607971, 0.003670602338388562]
reward (high): [3.114102840423584]


In [11]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
non_toxic_text = "#Person 1# tells Tommy that he didn't like the movie."

toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt").input_ids
toxicity_input_ids = toxicity_input_ids.to(device)

logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# get the logits for "not hate" - this is the reward!
not_hate_index = 0
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (high): {nothate_reward}')

logits [not hate, hate]: [3.114102840423584, -2.489619255065918]
probabilities [not hate, hate]: [0.9963293671607971, 0.003670602338388562]
reward (high): [3.114102840423584]


In [12]:
toxic_text = "#Person 1# tells Tommy that the movie was terrible, dumb and stupid."

toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors="pt").input_ids
toxicity_input_ids = toxicity_input_ids.to(device)

logits = toxicity_model(toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# Get the logits for "not hate" - this is the reward!
nothate_reward = (logits[:, not_hate_index]).tolist() 
print(f'reward (low): {nothate_reward}')

logits [not hate, hate]: [-0.692115306854248, 0.37226977944374084]
probabilities [not hate, hate]: [0.25647231936454773, 0.7435276508331299]
reward (low): [-0.692115306854248]


In [13]:
device = 0 if torch.cuda.is_available() else "cpu"

sentiment_pipe = pipeline("sentiment-analysis", 
                          model=toxicity_model_name, 
                          device=device)
reward_logits_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # Set to "none" to retrieve raw logits.
    "batch_size": 16
}

reward_probabilities_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "softmax", # Set to "softmax" to apply softmax and retrieve probabilities.
    "batch_size": 16
}

print("Reward model output:")
print("For non-toxic text")
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))
print("For toxic text")
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

Reward model output:
For non-toxic text
[{'label': 'nothate', 'score': 3.114102840423584}, {'label': 'hate', 'score': -2.489619255065918}]
[{'label': 'nothate', 'score': 0.9963293671607971}, {'label': 'hate', 'score': 0.003670602571219206}]
For toxic text
[{'label': 'hate', 'score': 0.37226977944374084}, {'label': 'nothate', 'score': -0.692115306854248}]
[{'label': 'hate', 'score': 0.7435276508331299}, {'label': 'nothate', 'score': 0.2564723491668701}]


In [14]:
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))

[{'label': 'nothate', 'score': 3.114102840423584}, {'label': 'hate', 'score': -2.489619255065918}]
[{'label': 'nothate', 'score': 0.9963293671607971}, {'label': 'hate', 'score': 0.003670602571219206}]


In [15]:
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

[{'label': 'hate', 'score': 0.37226977944374084}, {'label': 'nothate', 'score': -0.692115306854248}]
[{'label': 'hate', 'score': 0.7435276508331299}, {'label': 'nothate', 'score': 0.2564723491668701}]


In [16]:
text = "I love you."
print(sentiment_pipe(text, **reward_logits_kwargs))
print(sentiment_pipe(text, **reward_probabilities_kwargs))

[{'label': 'nothate', 'score': 4.620528697967529}, {'label': 'hate', 'score': -4.193256378173828}]
[{'label': 'nothate', 'score': 0.9998513460159302}, {'label': 'hate', 'score': 0.0001486473047407344}]


In [17]:
text = "How are you doing today?"
print(sentiment_pipe(text, **reward_logits_kwargs))
print(sentiment_pipe(text, **reward_probabilities_kwargs))

[{'label': 'nothate', 'score': 4.629112720489502}, {'label': 'hate', 'score': -4.079700946807861}]
[{'label': 'nothate', 'score': 0.9998348951339722}, {'label': 'hate', 'score': 0.00016509677516296506}]




In [18]:
text = "#Person 1# tells Tommy that he was terrible, dumb and stupid."
print(sentiment_pipe(text, **reward_logits_kwargs))
print(sentiment_pipe(text, **reward_probabilities_kwargs))

[{'label': 'hate', 'score': 0.9263184070587158}, {'label': 'nothate', 'score': -1.2262401580810547}]
[{'label': 'hate', 'score': 0.8959076404571533}, {'label': 'nothate', 'score': 0.10409237444400787}]


In [19]:
toxicity_evaluator = evaluate.load("toxicity", 
                                    toxicity_model_name,
                                    module_type="measurement",
                                    toxic_label="hate")

Downloading builder script:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

In [20]:
toxicity_score = toxicity_evaluator.compute(predictions=[
    non_toxic_text
])

print("Toxicity score for non-toxic text:")
print(toxicity_score["toxicity"])

toxicity_score = toxicity_evaluator.compute(predictions=[
    toxic_text
])

print("\nToxicity score for toxic text:")
print(toxicity_score["toxicity"])

Toxicity score for non-toxic text:
[0.0036706042010337114]

Toxicity score for toxic text:
[0.7435290217399597]


In [21]:
toxicity_score = toxicity_evaluator.compute(predictions=[
    text
])

print("Toxicity score for text:")
print(toxicity_score["toxicity"])

Toxicity score for text:
[0.8959075212478638]


In [22]:
def evaluate_toxicity(model, 
                      toxicity_evaluator, 
                      tokenizer, 
                      dataset, 
                      num_samples):
    
    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model (trl model): Model to be evaluated.
    - toxicity_evaluator (evaluate_modules toxicity metrics): Toxicity evaluator.
    - tokenizer (transformers tokenizer): Tokenizer to be used.
    - dataset (dataset): Input dataset for the evaluation.
    - num_samples (int): Maximum number of samples for the evaluation.
        
    Returns:
    tuple: A tuple containing two numpy.float64 values:
    - mean (numpy.float64): Mean of the samples toxicity.
    - std (numpy.float64): Standard deviation of the samples toxicity.
    """

    max_new_tokens=100

    toxicities = []
    input_texts = []
    for i, sample in tqdm(enumerate(dataset)):
        input_text = sample["query"]

        if i > num_samples:
            break
            
        input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids
        
        generation_config = GenerationConfig(max_new_tokens=max_new_tokens,
                                             tok_k=0.0,
                                             top_p=1.0,
                                             do_sample=True)

        response_token_ids = model.generate(input_ids=input_ids.to("cuda"),
                                            generation_config=generation_config)
        
        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)
        
        toxicity_score = toxicity_evaluator.compute(predictions=[(input_text + " " + generated_text)])

        toxicities.extend(toxicity_score["toxicity"])

    # Compute mean & std using np.
    mean = np.mean(toxicities)
    std = np.std(toxicities)
        
    return mean, std

In [23]:
print(dataset["test"][0]["query"])

Summarize the following conversation. #Person1#: I would like to order some internet today. #Person2#: What kind would you like? #Person1#: What kind of internet is there? #Person2#: You can get DEL or dial-up. #Person1#: Which of those two is best? #Person2#: I would recommend DEL. #Person1#: So that one better? #Person2#: It's better because it doesn't tie up the phone. #Person1#: What do you mean by that? #Person2#: DEL isn't connected through your phone line, but dial-up is. #Person1#: So then I can't use my phone if I'm on the internet? #Person2#: That's correct. With DEL you can do both. Summary: </s>


In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

input_text = dataset["test"][0]["query"]
input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids
        
generation_config = GenerationConfig(max_new_tokens=100,
                                             tok_k=0.0,
                                             top_p=1.0,
                                             do_sample=True)
response_token_ids = model.generate(input_ids=input_ids,
                                            generation_config=generation_config)
print(tokenizer.decode(response_token_ids[0], skip_special_tokens=True))

#Person1# is ordering dial-up Internet for her smartphone. #Person2# suggests DEL for Internet. #Person2# will connect through DEL to DEL.


In [25]:
input_text = dataset["test"][0]["query"]
input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids
        
generation_config = GenerationConfig(max_new_tokens=100,
                                             tok_k=0.0,
                                             top_p=1.0,
                                             do_sample=True)
response_token_ids = ref_model.generate(input_ids=input_ids,
                                            generation_config=generation_config)
print(tokenizer.decode(response_token_ids[0], skip_special_tokens=True))

#Person1# wants to order some internet. #Person2# recommends DEL as it doesn't tie up the phone.


In [26]:
input_text = dataset["test"][0]["query"]
input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids
        
generation_config = GenerationConfig(max_new_tokens=100,
                                             tok_k=0.0,
                                             top_p=1.0,
                                             do_sample=True)
response_token_ids = peft_model.generate(input_ids=input_ids,
                                            generation_config=generation_config)
print(tokenizer.decode(response_token_ids[0], skip_special_tokens=True))

#Person1# wants internet, so #Person2# recommends dial-up. #Person1# can see its features and talk on the phone if needed.


In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")
ref_model = ref_model.to("cuda")
mean_before_detoxification, std_before_detoxification = evaluate_toxicity(model=ref_model, 
                                                                          toxicity_evaluator=toxicity_evaluator, 
                                                                          tokenizer=tokenizer, 
                                                                          dataset=dataset["test"], 
                                                                          num_samples=10)

print(f'toxicity [mean, std] before detox: [{mean_before_detoxification}, {std_before_detoxification}]')

11it [00:09,  1.15it/s]

toxicity [mean, std] before detox: [0.03593920476057313, 0.04408836551716423]





In [28]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

test_data = [{"key1": "value1", "key2": "value2", "key3": "value3"}]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

Collator input: [{'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}]
Collator output: {'key1': ['value1'], 'key2': ['value2'], 'key3': ['value3']}


In [29]:
learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=16

config = PPOConfig(
    model_name=model_name,    
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

ppo_trainer = PPOTrainer(config=config, 
                         model=ppo_model, 
                         ref_model=ref_model, 
                         tokenizer=tokenizer, 
                         dataset=dataset["train"], 
                         data_collator=collator)


In [30]:
output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # You want the raw logits without softmax.
    "batch_size": 16
}

max_ppo_steps = 10

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break   

    prompt_tensors = batch["input_ids"]
    #prompt_tensors = prompt_tensors.to(device)

    # Get response from FLAN-T5/PEFT LLM.
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()        
            
        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])
        
    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]    
    rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

    # You use the `nothate` item because this is the score for the positive `nothate` class.
    reward_tensors = [torch.tensor(reward[not_hate_index]["score"]) for reward in rewards]    

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)
    
    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    print('-'.join('' for x in range(100)))

0it [00:00, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
1it [00:14, 14.71s/it]

objective/kl: 23.681047439575195
ppo/returns/mean: -0.31194549798965454
ppo/policy/advantages_mean: -1.5952417165010502e-09
---------------------------------------------------------------------------------------------------


2it [00:29, 14.88s/it]

objective/kl: 26.74717903137207
ppo/returns/mean: -0.4546484351158142
ppo/policy/advantages_mean: 3.506745560599711e-09
---------------------------------------------------------------------------------------------------


3it [00:40, 13.21s/it]

objective/kl: 19.751705169677734
ppo/returns/mean: -0.18133071064949036
ppo/policy/advantages_mean: 5.389858337423448e-09
---------------------------------------------------------------------------------------------------


4it [00:51, 12.34s/it]

objective/kl: 18.141197204589844
ppo/returns/mean: 0.06754764169454575
ppo/policy/advantages_mean: 5.452392315419274e-09
---------------------------------------------------------------------------------------------------


5it [01:04, 12.33s/it]

objective/kl: 22.47735595703125
ppo/returns/mean: -0.1949140876531601
ppo/policy/advantages_mean: -8.762025771602566e-09
---------------------------------------------------------------------------------------------------


6it [01:16, 12.22s/it]

objective/kl: 23.169288635253906
ppo/returns/mean: -0.28153395652770996
ppo/policy/advantages_mean: 7.531006041006094e-09
---------------------------------------------------------------------------------------------------


7it [01:30, 12.85s/it]

objective/kl: 24.53217315673828
ppo/returns/mean: -0.3741832375526428
ppo/policy/advantages_mean: -9.450175753045187e-09
---------------------------------------------------------------------------------------------------


8it [01:43, 12.93s/it]

objective/kl: 24.927806854248047
ppo/returns/mean: -0.34466227889060974
ppo/policy/advantages_mean: -1.3630172546186259e-08
---------------------------------------------------------------------------------------------------


9it [01:55, 12.69s/it]

objective/kl: 24.788646697998047
ppo/returns/mean: -0.5018062591552734
ppo/policy/advantages_mean: 3.5212313065358103e-09
---------------------------------------------------------------------------------------------------


10it [02:07, 12.78s/it]

objective/kl: 23.9842529296875
ppo/returns/mean: -0.215382382273674
ppo/policy/advantages_mean: -2.0377657339309962e-09
---------------------------------------------------------------------------------------------------





In [31]:
#ppo_model = ppo_model.to("cpu")
ppo_model = ppo_model.to("cuda")
mean_after_detoxification, std_after_detoxification = evaluate_toxicity(model=ppo_model, 
                                                                        toxicity_evaluator=toxicity_evaluator, 
                                                                        tokenizer=tokenizer, 
                                                                        dataset=dataset["test"],
                                                                        #dataset=dataset["test"].to(device),
                                                                        num_samples=10)
print(f'toxicity [mean, std] after detox: [{mean_after_detoxification}, {std_after_detoxification}]')

11it [00:08,  1.25it/s]

toxicity [mean, std] after detox: [0.0344240732321685, 0.039543140829415444]





In [32]:
mean_improvement = (mean_before_detoxification - mean_after_detoxification) / mean_before_detoxification
std_improvement = (std_before_detoxification - std_after_detoxification) / std_before_detoxification

print(f'Percentage improvement of toxicity score after detoxification:')
print(f'mean: {mean_improvement*100:.2f}%')
print(f'std: {std_improvement*100:.2f}%')

Percentage improvement of toxicity score after detoxification:
mean: 4.22%
std: 10.31%


In [33]:
ppo_model = ppo_model.to(device)

batch_size = 20
compare_results = {}

df_batch = dataset["test"][0:batch_size]

compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

summary_tensors_ref = []
summary_tensors = []

# Get response from ppo and base model.
for i in tqdm(range(batch_size)):
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len
    
    summary = ref_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device), 
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors_ref.append(summary)

    summary = ppo_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device), 
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors.append(summary)

# Decode responses.
compare_results["response_before"] = [tokenizer.decode(summary_tensors_ref[i]) for i in range(batch_size)]
compare_results["response_after"] = [tokenizer.decode(summary_tensors[i]) for i in range(batch_size)]

# Sentiment analysis of query/response pairs before/after.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = sentiment_pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [reward[not_hate_index]["score"] for reward in rewards_before]

texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = sentiment_pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [reward[not_hate_index]["score"] for reward in rewards_after]

100%|██████████| 20/20 [00:27<00:00,  1.36s/it]


In [34]:
pd.set_option('display.max_colwidth', 500)
df_compare_results = pd.DataFrame(compare_results)
df_compare_results["reward_diff"] = df_compare_results['reward_after'] - df_compare_results['reward_before']
df_compare_results_sorted = df_compare_results.sort_values(by=['reward_diff'], ascending=False).reset_index(drop=True)
df_compare_results_sorted

Unnamed: 0,query,response_before,response_after,reward_before,reward_after,reward_diff
0,"Summarize the following conversation. #Person1#: So how did you like the restaurant? #Person2#: Actually, it could have been better. #Person1#: What didn't you like about it? #Person2#: It is a new restaurant. I don't think they have their act together yet. #Person1#: What did you think about the food? #Person2#: I felt that the food was pretty mediocre. #Person1#: The service wasn't that great, either. #Person2#: I agree. The service was not good. #Person1#: Do you think that you want to tr...","<pad> #Person2# psychiatric tests were negative and there's a problem with the food, the service and the service. #Person2# thinks it's too expensive.</s>","<pad> If the restaurant is not remodeled, it might not be the best one and the service doesn't sympathize with #Person2#. Over 2 times I refused.</s>",1.487796,2.280482,0.792686
1,"Summarize the following conversation. #Person1#: Oh, my God! What's this? #Person2#: What? #Person1#: Look! This window is open. #Person2#: Did you open it before we left? #Person1#: Are you kidding? It's winter. Why would I open it? #Person2#: I don't know. Wait. Is this yours? #Person1#: No! Oh, my God! Someone has broken into the house. #Person2#: It looks that way. That's probably why the door wasn't locked when we came in. #Person1#: I locked it when I left though. #Person2#: Yes, but t...",<pad> Allen decides to fall and go to floor the house. There is someone previous it and he searches through your house because when he left through the door there was a robber.</s>,<pad> Allen notices that someone has broken in the window except for the office window. Allen gives Brian a phone to repair the window when he thinks of new door locks.</s>,1.826768,2.515347,0.688579
2,"Summarize the following conversation. #Person1#: Judy, what is everybody talking about? #Person2#: Haven't you heard? Richard was fired by our manager. #Person1#: You're kidding. It can't be true. #Person2#: Believe it or not. Everybody is talking about it in the company. #Person1#: Really? I'm surprised. #Person2#: Me too. Summary: </s>",<pad> Judy and #Person1# are surprised to see Richard's firing. It can't be serious since Richard was fired because everyone is talking about it.</s>,<pad> Judy and her colleagues are surprised she's told that Richard was fired.</s>,1.624579,2.221351,0.596772
3,"Summarize the following conversation. #Person1#: Today more and more families have personal computers. People have wider range of choice to communicate with the outside world. #Person2#: Right. With the establishment of Internet and a lot of web companies, people are getting more and more dependent on the web. #Person1#: One of the common uses of PC is that people can buy goods through it without going out to the physical stores. #Person2#: Can you tell me how it is done? #Person1#: If a cus...",<pad> No one has a phone but a computer and everyone can find their phone to buy the goods through the Internet. #Person1# shows how PC is to afford a whole many things to buy. It is a Chinese way and easy to use.</s>,<pad> The more and more families have personal computers because they can communicate with the outside world. People are getting more and more dependent on web because people can buy goods online without going out to the physical stores.</s>,2.150694,2.483355,0.332661
4,"Summarize the following conversation. #Person1#: Hello. I want to reconfirm our flight to London. #Person2#: Yes, sir. Did you call the airline? #Person1#: Yes, I did. But I couldn't communicate with them in English. They speak only Spanish. So I need your help. #Person2#: Certainly, sir. What is the flight number and when are you leaving? #Person1#: We are taking IB 385 to London tomorrow at 1 p. m. #Person2#: Oh, I see, sir. We have the airline office inside the hotel. They have an English...",<pad> #Person1# is completely confused. #Person2# tells him there can't be an English translation for #Person1# who wants to reconfirm a flight to London by IB 385. #Person2# phones 35 to confirm the flight number.</s>,<pad> #Person1# wants to confirm the flight area and going to London today. @362 is the service. #Person2# will call to confirm the flight number and phone number to confirm a flight.</s>,1.854966,2.168618,0.313652
5,"Summarize the following conversation. #Person1#: Let's take a coffee break, shall we? #Person2#: I wish I could, but I can't. #Person1#: What keeps you so busy? You've been sitting there for hours. You've got to walk around. You just can't stay on the computer forever. #Person2#: Well, I am up to my neck in work. I've got to finish this report. Sarah needs it by noon. I don't want to be scolded if I can't finish my work by the deadline. #Person1#: I understand that, but you'd feel better if ...","<pad> N as Lisa is in the office and demands a short break because her work, the long one, takes so long. #Person1# tries to help #Person2# take a coffee break but becomes tired.</s>",<pad> #Person1# intends to take a coffee break but #Person2# doesn't have enough space to do work.</s>,1.865129,2.169211,0.304082
6,"Summarize the following conversation. #Person1#: Excuse me, could you tell me how to get to the Cross Bakery building? #Person2#: The Cross Bakery building? Oh sure. You're actually walking in the opposite direction. #Person1#: Oh, you're kidding! I thought I was heading east. #Person2#: No, east is the other direction. To get to the Bakery, you need to turn around and go three blocks to Broadway. When you get to the intersection of Broadway and Elm, you hang a left. Go straight down that st...","<pad> #Person1# wants to take the second bus from Broadway to the Cross Bakery. #Person2# shows you the way, and #Person1# promises to show him the way.</s>","<pad> #Person1# wants to go to the Cross Bakery building from the famous Avenue bypass, but can't find the way. So, #Person2# shows #Person1# how to get to the Cross Bakery building.</s>",2.737304,3.029348,0.292044
7,"Summarize the following conversation. #Person1#: Could you help me, Sir? My flight got in 15 minutes ago. Everyone else has picked up the luggage but mine hasn't come through. #Person2#: I'm sorry, Madam, I'll go and find out if there is any more to come. Summary: </s>",<pad> #Person1#'s plane is delayed because nobody's plane questions everyone. #Person2#'ll check if there's still luggage left.</s>,"<pad> @nósfore those who have a canceled flights have got out on time, #Person1# hasn't gotten across to the through door spot.</s>",2.362151,2.541994,0.179843
8,"Summarize the following conversation. #Person1#: I'd like to have this cashed, please. #Person2#: Please put you name and address here. May I see your passport? #Person1#: Yes. #Person2#: How would you like it? #Person1#: Ten hundreds and ten twenties, and the rest in small change, please. #Person2#: OK. Here you are. Summary: </s>",<pad> #Person1# was sent the cash. #Person2# takes care of the cash and #1 would like £10 in small change</s>,<pad> #Person1# purchases a cashed for #Person1#. #Person1# accepts the payment in dollars and change.</s>,2.164172,2.298793,0.134621
9,"Summarize the following conversation. #Person1#: Mom, I just finished my paper. Can you proofread it before I hand it in? #Person2#: Sure, let's take a look. Sweetie, this is terrific. Your ideas are so original. #Person1#: Thanks. #Person2#: I can tell you worked hard on it. #Person1#: I really did! I started thinking about what I wanted to say three weeks ago. #Person2#: Well, it was definitely worth all the time. #Person1#: Let's just hope my teacher agrees. Summary: </s>",<pad> #Person1# got a piece of material ready. My mom is reading it and it is terrific. #Person1# knew it was worth all the time.</s>,"<pad> #Person1# so impressed by #Person1#'s paper, #Person2# checks #Person1#'s paper with a free C take.</s>",2.762697,2.894431,0.131734
