In [1]:

!git clone https://github.com/nlp-with-transformers/notebooks.git
%cd notebooks
from install import *
install_requirements()

fatal: destination path 'notebooks' already exists and is not an empty directory.
/content/notebooks
⏳ Installing base requirements ...
✅ Base requirements installed!
⏳ Installing Git LFS ...
✅ Git LFS installed!


In [4]:

from utils import *
setup_chapter()

Using transformers v4.11.3
Using datasets v1.16.1


In [5]:

from transformers import pipeline, set_seed

# Summarization

In [6]:
!pip install -q transformers datasets
!pip install -q pytorch-lightning wandb
!pip install -U huggingface_hub

from datasets import load_dataset, Dataset
import pandas as pd
from transformers import pipeline, set_seed



## amazon reviews dataset

In [7]:

device = "cuda:0" if torch.cuda.is_available() else "cpu"

from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

df= pd.read_csv('My Drive/amazon_summary_data.csv')
df = df.sample(frac=0.005)
df = df[['review_headline','review_body']]
df = df.dropna()


dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)

print(f"Features: {dataset['train'].column_names}")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive
Features: ['review_headline', 'review_body', '__index_level_0__']


In [8]:

torch.cuda.get_device_name()

'Tesla P100-PCIE-16GB'

In [9]:
sample = dataset["train"][111]
print(f"""
Review (excerpt of 500 characters, total length: {len(sample["review_body"])}):
""")
print(sample["review_body"][:500])
print(f'\nSummary (length: {len(sample["review_body"])}):')
print(sample["review_body"])


Review (excerpt of 500 characters, total length: 52):

Works well on my fine hair - has held up well so far

Summary (length: 52):
Works well on my fine hair - has held up well so far


## Text Summarization Pipelines

In [10]:
sample_text = dataset["train"][1]["review_body"][:2000]

summaries = {}

In [11]:

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
string = "The U.S. are a country. The U.N. is an organization."
sent_tokenize(string)

['The U.S. are a country.', 'The U.N. is an organization.']

### Summarization Baseline


In [13]:
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

In [14]:
summaries["baseline"] = three_sentence_summary(sample_text)
print(summaries)

{'baseline': "I was really excited to try this product, but it made me break
out.\nMy skin is fairly sensitive and I thought since it was all natural that
wouldn't happen."}


### PEGASUS

In [15]:
pipe = pipeline("summarization", model="sshleifer/distill-pegasus-cnn-16-4")
pipe_out = pipe(sample_text)
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

## Comparing Different Summaries

In [16]:
print("GROUND TRUTH")
print(dataset["train"][1]["review_body"])
print("")

for model_name,value in summaries.items():
    if (type(model_name)) == 'str':
      print(model_name.upper())
    print(summaries[model_name])
    print("")

GROUND TRUTH
I was really excited to try this product, but it made me break out. My skin is
fairly sensitive and I thought since it was all natural that wouldn't happen.

I was really excited to try this product, but it made me break out.
My skin is fairly sensitive and I thought since it was all natural that wouldn't
happen.

I was really excited to try this product, but it made me break out .



## Measuring the Quality of Generated Text

### ROUGE

In [17]:

!pip install rouge_score
from datasets import load_metric
rouge_metric = load_metric("rouge")



In [18]:
reference = dataset["train"][1]["review_body"]
records = []
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

for model_name in summaries:
    rouge_metric.add(prediction=summaries[model_name], reference=reference)
    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
    records.append(rouge_dict)
pd.DataFrame.from_records(records, index=summaries.keys())

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,1.0,1.0,1.0,1.0
pegasus,0.622222,0.604651,0.622222,0.622222


## Evaluating PEGASUS on the Amazon reviews dataset

In [19]:

import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

rouge_metric = load_metric("rouge", cache_dir=None)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

In [20]:
def evaluate_summaries_baseline(dataset, metric,
                                column_text="review_body", 
                                column_summary="review_headline"):
    summaries = [three_sentence_summary(text) for text in dataset[column_text]]
    metric.add_batch(predictions=summaries,
                     references=dataset[column_summary])    
    score = metric.compute()
    return score

In [21]:
test_sampled = dataset['train'].shuffle(seed=42).select(range(1000))

score = evaluate_summaries_baseline(test_sampled, rouge_metric)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.149107,0.109273,0.144468,0.145842


In [22]:
from tqdm import tqdm
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"

def chunks(list_of_elements, batch_size):
    """Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="review_body", 
                               column_summary="review_headline"):
    review_body_batches = list(chunks(dataset[column_text], batch_size))
    review_headline_batches = list(chunks(dataset[column_summary], batch_size))

    for review_body_batch, review_headline_batch in tqdm(
        zip(review_body_batches, review_headline_batches), total=len(review_body_batches)):
        
        inputs = tokenizer(review_body_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=review_headline_batch)
        
    score = metric.compute()
    return score

In [23]:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "sshleifer/distill-pegasus-cnn-16-4"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to('cuda:0')
score = evaluate_summaries_pegasus(test_sampled, rouge_metric, 
                                   model, tokenizer, batch_size=8)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

100%|██████████| 125/125 [03:58<00:00,  1.91s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.118573,0.071239,0.113156,0.11259


In [24]:

pd.DataFrame(rouge_dict, index=["pegasus"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.118573,0.071239,0.113156,0.11259


## Training a Summarization Model

In [25]:

split_lengths = [len(dataset[split])for split in dataset]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset['train'].column_names}")
print("\nReview:")
print(dataset["test"][0]["review_body"])
print("\nSummary:")
print(dataset["test"][0]["review_headline"])

Split lengths: [2301, 256]
Features: ['review_headline', 'review_body', '__index_level_0__']

Review:
Given as a gift. Brought back many nice memories of times in the 60's.

Summary:
Brought back many nice memories of times in the 60's


In [26]:

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset['train'].column_names}")
print("\nReview:")
print(dataset["test"][0]["review_body"])
print("\nSummary:")
print(dataset["test"][0]["review_headline"])

Split lengths: [2301, 256]
Features: ['review_headline', 'review_body', '__index_level_0__']

Review:
Given as a gift. Brought back many nice memories of times in the 60's.

Summary:
Brought back many nice memories of times in the 60's


### Evaluating PEGASUS on Amazon Reviews dataset

In [27]:
pipe_out = pipe(dataset["test"][0]["review_body"])
print("Summary:")
print(pipe_out[0]["summary_text"].replace(" .<n>", ".\n"))

Summary:
Brought back many nice memories of times in the 60's .


In [28]:

score = evaluate_summaries_pegasus(dataset["test"], rouge_metric, model,
                                   tokenizer, column_text="review_body",
                                   column_summary="review_headline", batch_size=8)

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

100%|██████████| 32/32 [01:05<00:00,  2.06s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.136605,0.086825,0.131605,0.132558


In [29]:

pd.DataFrame(rouge_dict, index=["pegasus"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.136605,0.086825,0.131605,0.132558


### Fine-Tuning PEGASUS

In [31]:

def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["review_body"], max_length=1024,
                                truncation=True)
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["review_headline"], max_length=128,
                                     truncation=True)
    
    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

dataset = dataset.map(convert_examples_to_features, 
                                       batched=True)
columns = ["input_ids", "labels", "attention_mask"]
dataset.set_format(type="torch", columns=columns)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [32]:

#id teacher-forcing
#alt Decoder input and label alignemt for text generation.
#caption Decoder input and label alignemt for text generation.
text = ['PAD','Transformers', 'are', 'awesome', 'for', 'text', 'summarization']
rows = []
for i in range(len(text)-1):
    rows.append({'step': i+1, 'decoder_input': text[:i+1], 'label': text[i+1]})
pd.DataFrame(rows).set_index('step')

Unnamed: 0_level_0,decoder_input,label
step,Unnamed: 1_level_1,Unnamed: 2_level_1
1,[PAD],Transformers
2,"[PAD, Transformers]",are
3,"[PAD, Transformers, are]",awesome
4,"[PAD, Transformers, are, awesome]",for
5,"[PAD, Transformers, are, awesome, for]",text
6,"[PAD, Transformers, are, awesome, for, text]",summarization


In [33]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model.to('cuda:0'))

In [34]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='/gdrive/MyDrive/pegasus', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10, push_to_hub=True,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16, report_to="wandb")

In [35]:

from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [39]:

trainer = Trainer(model=model.to(device), args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset["train"], 
                  eval_dataset=dataset["test"])

/gdrive/MyDrive/pegasus is already a clone of https://huggingface.co/sumedh/pegasus. Make sure you pull the latest changes with `repo.git_pull()`.


In [40]:
import torch
torch.cuda.is_available()

True

In [42]:

%pip install wandb
import wandb
%cd /gdrive
wandb.init(dir='/gdrive/MyDrive/wandb/',settings=wandb.Settings(start_method="thread"))

/gdrive


[34m[1mwandb[0m: Currently logged in as: [33msumedhkhodke[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
trainer.train()
score = evaluate_summaries_pegasus(
    dataset["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="review_body", column_summary="review_headline")

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=[f"pegasus"])

In [None]:
# hide_input
pd.DataFrame(rouge_dict, index=[f"pegasus"])

In [None]:
# hide_output
trainer.push_to_hub("Training complete!")

### Generating summaries of reviews

In [None]:
# hide
import transformers
transformers.logging.set_verbosity_error()

In [None]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
sample_text = dataset["test"][0]["review_body"]
reference = dataset["test"][0]["review_headline"]
pipe = pipeline("summarization", model="sshleifer/distill-pegasus-cnn-16-4")

print("Review:")
print(sample_text)
print("\nReference Summary:")
print(reference)
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])