# Summarization

### Loading Libraries

In [1]:
# Numerical 
import numpy as np
#
import pandas as pd

# Natural Language Toolkit
import nltk
from nltk.tokenize import sent_tokenize

# Datasets
from datasets import load_dataset
from datasets import load_metric

# Transformers
import transformers
from transformers import pipeline, set_seed
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Model Metric's Evaluation 
import evaluate

# Data Visualization
import matplotlib.pyplot as plt

# Progress bar & Iterations Management
from tqdm import tqdm

# PyTorch
import torch

# HuggingFace Hub
from huggingface_hub import notebook_login

### Retrieving Data

In [2]:
# Uncomment and run this cell if you're on Colab or Kaggle
!git clone https://github.com/nlp-with-transformers/notebooks.git
%cd notebooks

from install import *
install_requirements(is_chapter6=True)

Cloning into 'notebooks'...
remote: Enumerating objects: 526, done.[K
remote: Counting objects: 100% (173/173), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 526 (delta 143), reused 135 (delta 126), pack-reused 353[K
Receiving objects: 100% (526/526), 28.62 MiB | 15.25 MiB/s, done.
Resolving deltas: 100% (250/250), done.
/Users/isisromero/Desktop/NLP_transformer/chap_06/notebooks
⏳ Installing base requirements ...


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


✅ Base requirements installed!
⏳ Installing Git LFS ...
✅ Git LFS installed!


In [3]:
from utils import *
setup_chapter()

No GPU was detected! This notebook can be *very* slow without a GPU 🐢
Using transformers v4.40.1
Using datasets v2.19.0


### Retrieiving Dataset

In [5]:
# CNN/DailyTime dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")
print(f"Features: {dataset['train'].column_names}")

Features: ['article', 'highlights', 'id']


In [9]:
sample = dataset["train"][1]

print(f"""
Article (excerpt of 500 characters, total length: {len(sample["article"])}):
""")

print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])


Article (excerpt of 500 characters, total length: 4051):

Editor's note: In our Behind the Scenes series, CNN correspondents share their
experiences in covering news and analyze the stories behind the events. Here,
Soledad O'Brien takes users inside a jail where many of the inmates are mentally
ill. An inmate housed on the "forgotten floor," where many mentally ill inmates
are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the
Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here,
inmates with the most s

Summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


### Text Summarization Pipelines

In [10]:
sample_text = dataset["train"][1]["article"][:2000]

summaries = {}

In [11]:
# Abbreviation & Punctuation Tool
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/isisromero/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
string = "The U.S. are a country. The U.N. is an organization."
sent_tokenize(string)

['The U.S. are a country.', 'The U.N. is an organization.']

#### Summarization Baseline

In [13]:
# Baseline for Summarizing as follow:
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

In [14]:
summaries["baseline"] = three_sentence_summary(sample_text)

#### GPT-2

In [15]:
# Recreating the summarization procedure with 'pipeline() function'
set_seed(42)

pipe = pipeline("text-generation", model="gpt2-xl")
gpt2_query = sample_text + "\nTL;DR:\n"
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)

summaries["gpt2"] = "\n".join(
    sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

#### T5

In [16]:
# Directly loading with T5 'pipeline() function':
pipe = pipeline("summarization", model="t5-large")
pipe_out = pipe(sample_text)

summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

#### BART

In [17]:
# Bart procedure as follow:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)

summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

#### PEGASUS

In [19]:
import google.protobuf
print(google.protobuf.__version__)

5.26.1


In [20]:
# PEGASUS procedure as follow:
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)

summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .", ".\n")

#### Comparing Different Summaries

In [21]:
print("GROUND TRUTH")
print(dataset["train"][1]["highlights"])
print("")

for model_name in summaries:
    print(model_name.upper())
    print(summaries[model_name])
    print("")

GROUND TRUTH
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .

BASELINE
Editor's note: In our Behind the Scenes series, CNN correspondents share their
experiences in covering news and analyze the stories behind the events.
Here, Soledad O'Brien takes users inside a jail where many of the inmates are
mentally ill. An inmate housed on the "forgotten floor," where many mentally ill
inmates are housed in Miami before trial.
MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention
facility is dubbed the "forgotten floor."

GPT2
1)      The mentally ill inmates will not commit heinous crimes if they're out
of jail
2)      There are mentally ill inmates on the ninth floor
4)      Miami-Dade corrections workers will throw a mentally ill person i

### Measuring the Quality of Generated Text

#### BLEU

In [23]:
# Warning: load_metric is deprecated, thus, I've decided to use a current approach to perform the
# model evaluation with 'evaluate'

# bleu_metric = load_metric("sacrebleu")

In [24]:
bleu_metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [25]:
bleu_metric.add(
    prediction="the the the the the the", reference=["the cat is on the mat"])

results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
results["precisions"] = [np.round(p, 2) for p in results["precisions"]]
pd.DataFrame.from_dict(results, orient="index", columns=["Value"])

Unnamed: 0,Value
score,0.0
counts,"[2, 0, 0, 0]"
totals,"[6, 5, 4, 3]"
precisions,"[33.33, 0.0, 0.0, 0.0]"
bp,1.0
sys_len,6
ref_len,6


In [26]:
bleu_metric.add(
    prediction="the cat is on mat", reference=["the cat is on the mat"])

results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
results["precisions"] = [np.round(p, 2) for p in results["precisions"]]
pd.DataFrame.from_dict(results, orient="index", columns=["Value"])

Unnamed: 0,Value
score,57.893007
counts,"[5, 3, 2, 1]"
totals,"[5, 4, 3, 2]"
precisions,"[100.0, 75.0, 66.67, 50.0]"
bp,0.818731
sys_len,5
ref_len,6


### ROUGE

In [28]:
rouge_metric = evaluate.load("rouge")

In [31]:
reference = dataset["train"][1]["highlights"]
records = []
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

for model_name in summaries:
    rouge_metric.add(prediction=summaries[model_name], reference=reference)
    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index=summaries.keys())

##### AttributeError with mid:

1) Direct Access to Values: Since the ROUGE scores are returned as float values, I adjusted the code to access these values directly. This eliminates the need for extracting values using keys like "fmeasure", simplifying the data handling.

2) Dictionary Comprehension: I used dictionary comprehension to create a dictionary (rouge_dict) for each summary, where each entry corresponds to a ROUGE metric. The keys are the names of the ROUGE metrics, and the values are the respective scores directly obtained from the score dictionary.

3) DataFrame Creation: We create a pandas DataFrame from the dictionaries (records) list. Each dictionary represents the ROUGE scores for a different model's summary. The DataFrame is indexed by the keys of the summaries dictionary, which are the model names. This structure facilitates an easy comparison of ROUGE scores across different summarization models.

In [37]:
reference = dataset["train"][1]["highlights"]
# summaries = {"model1": "the cat is on the mat", "model2": "a cat sat on a mat"}
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
records = []

for model_name in summaries:
    rouge_metric.add(prediction=summaries[model_name], reference=reference)
    score = rouge_metric.compute()
    rouge_dict = {rn: score[rn] for rn in rouge_names if rn in score}
    records.append(rouge_dict)

# Crear un DataFrame de pandas con los registros, usando los nombres de los modelos como índices
pd.DataFrame.from_records(records, index=summaries.keys())

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.365079,0.145161,0.206349,0.285714
gpt2,0.301887,0.057692,0.188679,0.245283
t5,0.382979,0.130435,0.255319,0.382979
bart,0.475248,0.222222,0.316832,0.415842
pegasus,0.316832,0.20202,0.277228,0.316832


### Evaluating PEGASUS on the CNN/DailyMail Dataset

In [41]:
dataset = load_dataset("cnn_dailymail", "3.0.0")

rouge_metric = evaluate.load("rouge", cache_dir=None)

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

##### AdjustHere’sfor Evaluating Summaries with ROUGE Metrics

The following adjustments have been made to ensure the proper handling and evaluation of summarization models using the ROUGE metrics from the 🤗 Evaluate library:

1. **Direct Score Access:** The ROUGE metrics returned by `evaluate.load(“rouge”)` are direct float values representing the F-measure. I’ve adjusted the code to use these float values directly without accessing nested attributes like `.mid. measure`, which is not applicable to “e her.”

2. Simplified DataFrame Creation: The scores obtained from the ROUGE evaluation are directly used to create a pandas DataFrame. This DataFrame organizes the ROUGE-1, ROUGE-2, ROUGE-L, and ROUGE-Lsum scores in a structured format, making it a breeze to view and analyze the performance of the summarization models. This straightforward process ensures ease of use and understanding for all users.

3. **Summary Function Integration:** The evaluation function, `evaluate_summaries_baseline,` integrates a summarization function (assumed to be `three_sentence_summary`) that generates summaries from the provided text. This function is crucial for creating the input predictions for the ROUGE metric evaluation.

4. **Data Handling:** I ensure that the dataset is adequately shuffled and sampled before evaluation to provide a randomized subset for analysis, enhancing the reliability of the evaluation metrics across different runs.

In [44]:
# def evaluate_summaries_baseline(dataset, metric,
#                                 column_text="article", 
#                                 column_summary="highlights"):
#     summaries = [three_sentence_summary(text) for text in dataset[column_text]]
#     metric.add_batch(predictions=summaries,
#                      references=dataset[column_summary])    
#     score = metric.compute()
#     return score

def evaluate_summaries_baseline(dataset,
                                metric,
                                column_text="article",
                                column_summary="highlights"):
    summaries = [three_sentence_summary(text) for text in dataset[column_text]]
    metric.add_batch(predictions=summaries, references=dataset[column_summary])
    score = metric.compute()
    return score

In [45]:
# test_sampled = dataset["test"].shuffle(seed=42).select(range(1000))

# score = evaluate_summaries_baseline(test_sampled, rouge_metric)
# rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
# pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T


test_sampled = dataset["test"].shuffle(seed=42).select(range(1000))
score = evaluate_summaries_baseline(test_sampled, rouge_metric)

# Creating a dictionary from scores without using '.mid.fmeasure'
rouge_dict = {rn: score[rn] for rn in rouge_names}
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.389276,0.171296,0.245061,0.354239


In [46]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [47]:
def chunks(list_of_elements, batch_size):
    """Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    score = metric.compute()
    return score

In [50]:
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
score = evaluate_summaries_pegasus(test_sampled, rouge_metric, 
                                   model, tokenizer, batch_size=8)

rouge_dict = {rn: score[rn] for rn in rouge_names}

pd.DataFrame(rouge_dict, index=["pegasus"])

### Training a Summarization Model

In [52]:
dataset_samsum = load_dataset("samsum")
split_lengths = [len(dataset_samsum[split])for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue:")
print(dataset_samsum["test"][0]["dialogue"])
print("\nSummary:")
print(dataset_samsum["test"][0]["summary"])

In [53]:
print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue:")
print(dataset_samsum["test"][0]["dialogue"])
print("\nSummary:")
print(dataset_samsum["test"][0]["summary"])

### Evaluating PEGASUS on SAMSum

In [54]:
pipe_out = pipe(dataset_samsum["test"][0]["dialogue"])

print("Summary:")
print(pipe_out[0]["summary_text"].replace(" .", ".\n"))

In [55]:
score = evaluate_summaries_pegasus(dataset_samsum["test"], rouge_metric, model,
                                   tokenizer, column_text="dialogue",
                                   column_summary="summary", batch_size=8)

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

pd.DataFrame(rouge_dict, index=["pegasus"])

In [56]:
pd.DataFrame(rouge_dict, index=["pegasus"])

### Fine-Tuning PEGASUS

In [58]:
d_len = [len(tokenizer.encode(s)) for s in dataset_samsum["train"]["dialogue"]]
s_len = [len(tokenizer.encode(s)) for s in dataset_samsum["train"]["summary"]]

fig, axes = plt.subplots(1, 2, figsize=(10, 3.5), sharey=True)
axes[0].hist(d_len, bins=20, color="C0", edgecolor="C0")
axes[0].set_title("Dialogue Token Length")
axes[0].set_xlabel("Length")
axes[0].set_ylabel("Count")
axes[1].hist(s_len, bins=20, color="C0", edgecolor="C0")
axes[1].set_title("Summary Token Length")
axes[1].set_xlabel("Length")
plt.tight_layout()
plt.show()

In [60]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["dialogue"], max_length=1024,
                                truncation=True)
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["summary"], max_length=128,
                                     truncation=True)
    
    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, 
                                       batched=True)
columns = ["input_ids", "labels", "attention_mask"]
dataset_samsum_pt.set_format(type="torch", columns=columns)

In [61]:
text = ['PAD','Transformers', 'are', 'awesome', 'for', 'text', 'summarization']
rows = []
for i in range(len(text)-1):
    rows.append({'step': i+1, 'decoder_input': text[:i+1], 'label': text[i+1]})

    pd.DataFrame(rows).set_index('step')

In [62]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [63]:
training_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10, push_to_hub=True,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16)

In [64]:
notebook_login()

In [65]:
trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"], 
                  eval_dataset=dataset_samsum_pt["validation"])

In [66]:
trainer.train()
score = evaluate_summaries_pegasus(
    dataset_samsum["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="dialogue", column_summary="summary")

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=[f"pegasus"])

In [67]:
pd.DataFrame(rouge_dict, index=[f"pegasus"])

In [68]:
trainer.push_to_hub("Training complete!")

### Generating Dialogue Summaries

In [70]:
transformers.logging.set_verbosity_error()

In [69]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
sample_text = dataset_samsum["test"][0]["dialogue"]
reference = dataset_samsum["test"][0]["summary"]
pipe = pipeline("summarization", model="transformersbook/pegasus-samsum")

print("Dialogue:")
print(sample_text)
print("\nReference Summary:")
print(reference)
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

In [71]:
custom_dialogue = """\
Thom: Hi guys, have you heard of transformers?
Lewis: Yes, I used them recently!
Leandro: Indeed, there is a great library by Hugging Face.
Thom: I know, I helped build it ;)
Lewis: Cool, maybe we should write a book about it. What do you think?
Leandro: Great idea, how hard can it be?!
Thom: I am in!
Lewis: Awesome, let's do it together!
"""
print(pipe(custom_dialogue, **gen_kwargs)[0]["summary_text"])