<a href="https://colab.research.google.com/github/kowal789/b/blob/main/Kowal_Summarization_Xsum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Summarization

We try out several summarization approaches on the [XSum](https://huggingface.co/datasets/knkarthick/xsum) dataset.

In [1]:
!pip install sentencepiece
!pip install 'transformers[torch]'
!pip install datasets
!pip install zstandard
!pip install sacrebleu
!pip install rouge_score

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
[31mERROR: Operation cancelled by user[0m[31m
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m629.4 kB/s[0m eta [36m0:00:00[0m
[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
from datasets import load_dataset, DatasetDict
#!pip install tensorflow_probability==0.12.2
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from datasets import load_metric
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Fetch every split of the XSum dataset from the Hugging Face Hub.
ds = load_dataset("knkarthick/xsum", split=None)

# Keep a direct handle on each individual split.
train_set, valid_set, test_set = ds['train'], ds['validation'], ds['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/480M [00:00<?, ?B/s]

In [None]:
# Display the loaded dataset object to inspect its splits.
ds

In [None]:
# Look at one raw training example (index 1).
ds["train"][1]

In [None]:
# Running example: the first 2000 characters of training example 1's 'dialogue' field.
sample_text = ds["train"][1]["dialogue"][:2000]
sample_text


### Generating a baseline summary by taking the first 3 sentences

In [None]:
def get_baseline_summary(text, num_sentences=3):
    """Return the leading sentences of `text` as a naive summary.

    Args:
        text: The document to summarize.
        num_sentences: How many sentences to keep from the start of the text.

    Returns:
        The first `num_sentences` sentences (as split by `sent_tokenize`),
        joined with single spaces.
    """
    leading_sentences = sent_tokenize(text)[:num_sentences]
    return ' '.join(leading_sentences)

In [None]:
# Collects one generated summary per approach, keyed by the approach's name.
summaries = {}

In [None]:
# Baseline entry: the leading sentences of the sample text.
summaries["baseline"] = get_baseline_summary(sample_text)

### GPT2
GPT-2 can be used for summarization through the text-generation pipeline by appending "TL;DR" to the end of the prompt.

In [None]:
# Prompt GPT-2 with the article followed by a "TL;DR" cue, then keep only the
# text generated after the prompt.
gpt2_query = sample_text + "\nTL;DR:\n"
pipe = pipeline("text-generation", model="gpt2")  # small gpt2 checkpoint, for when gpt2-xl is too large
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
gpt2_continuation = pipe_out[0]["generated_text"][len(gpt2_query) :]
summaries["gpt2"] = "\n".join(sent_tokenize(gpt2_continuation))
summaries["gpt2"]

### T5

In [None]:
# Summarize the sample text with T5 through the summarization pipeline and
# store the result with one sentence per line.
pipe = pipeline("summarization", model="t5-small")  # t5-small, for when t5-large is too large
pipe_out = pipe(sample_text)
t5_sentences = sent_tokenize(pipe_out[0]["summary_text"])
summaries["t5"] = "\n".join(t5_sentences)
summaries["t5"]

### BART

In [None]:
# Summarize the sample text with the facebook/bart-large-cnn checkpoint and
# store the result with one sentence per line.
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)
bart_sentences = sent_tokenize(pipe_out[0]["summary_text"])
summaries["bart"] = "\n".join(bart_sentences)
summaries["bart"]

In [None]:
# Print the reference summary first, then every generated summary under its
# upper-cased approach name.
print("GROUND TRUTH")
print(ds["train"][1]["summary"])
print("")

for model_name, generated_summary in summaries.items():
    print(model_name.upper())
    print(generated_summary)
    print("")

## Evaluation

How can we evaluate the performance of a summarization model?

**BLEU (Bilingual Evaluation Understudy) score** is a precision-based metric that measures how many words or n-grams present in the machine-generated summaries also appear in the human reference summaries, divided by the length of the generation. It penalises summaries that are too short with a brevity penalty. We use the [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) implementation.

**ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score** is a recall-based metric that measures how many words/n-grams present in the human reference summaries were found in the machine-generated summaries. Some sub-forms:

1. ROUGE-N: Overlap of n-grams
2. ROUGE-1: Overlap of unigrams.
3. ROUGE-2: Overlap of bigrams.
4. ROUGE-L: Overlap of the Longest Common Subsequence

To Do:
1. How do you interpret a BLEU/ROUGE score of 0?

In [None]:
# Load the SacreBLEU implementation of the BLEU metric.
bleu_metric = load_metric("sacrebleu")

In [None]:
# Load the ROUGE metric.
rouge_metric = load_metric("rouge")

In [None]:
# Ground-truth summary of the sample document; every generated summary is scored against it.
reference = ds["train"][1]["summary"]
reference

In [None]:
# Score each generated summary against the single reference with BLEU.
bleu_scores = []
for model_name, prediction in summaries.items():
    # The metric takes a list of references per prediction.
    bleu_metric.add(prediction=prediction, reference=[reference])
    results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
    bleu_scores.append(results["score"])

In [None]:

# Build one row of ROUGE and BLEU scores per summarization approach.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
records = []

for model_name, prediction in summaries.items():
    # ROUGE: keep the mid F-measure of every variant.
    rouge_metric.add(prediction=prediction, reference=reference)
    rouge_score = rouge_metric.compute()
    result_dict = {rn: rouge_score[rn].mid.fmeasure for rn in rouge_names}

    # BLEU for the same prediction (the metric takes a list of references).
    bleu_metric.add(prediction=prediction, reference=[reference])
    bleu_results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
    result_dict["bleu"] = bleu_results["score"]

    records.append(result_dict)

pd.DataFrame.from_records(records, index=summaries.keys())

BASELINE: some overlap of individual words, but bad performance on word pairs and no overlap (BLEU)

GPT2: captured some bigrams (rouge2) but the bleu score indicates still no overlaps

T5: best capturing in unigrams, but still no real performance in higher overlaps

## Finetuning a model for Summarization

### Discussion
How would you fine tune a model for summarization?

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint to fine-tune; tokenizer and model are both loaded from this name.
model_ckpt = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Load the seq2seq model and move it to the device selected earlier in the notebook.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Inputs are truncated to 1024 tokens and target summaries to 161 tokens (`max_length` counts tokens, not characters). The target limit of 161 was chosen because 95% of the training summaries are shorter than 161 characters.



In [None]:
def convert_data_to_features(batch, tokenizer, max_input_length=1024, max_target_length=161):
    """Tokenize a batch of documents and summaries into seq2seq training features.

    Args:
        batch: Mapping with a 'dialogue' entry (list of source texts) and a
            'summary' entry (list of reference summaries).
        tokenizer: Callable tokenizer whose result exposes `input_ids` and
            `attention_mask`; its `pad_token_id` identifies padding tokens.
        max_input_length: Maximum number of tokens kept per source text.
        max_target_length: Maximum number of tokens kept per summary.

    Returns:
        A dict with `input_ids`, `attention_mask` and `labels`. Padding
        positions in `labels` are replaced by -100 so that the training loss
        ignores them.
    """
    input_encodings = tokenizer(batch['dialogue'], padding="max_length", truncation=True, max_length=max_input_length)
    target_encodings = tokenizer(batch['summary'], padding="max_length", truncation=True, max_length=max_target_length)

    # The targets are padded to a fixed length. Without masking, the loss would
    # also be computed on the padding positions; -100 is the label value that
    # the loss skips.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_encodings.input_ids
    ]

    return {
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "labels": labels,
    }

In [None]:
# Filter out rows where either 'dialogue' or 'summary' is None
filtered_ds = ds.filter(lambda x: x['dialogue'] is not None and x['summary'] is not None)

# Applying the convert_data_to_features function to the filtered dataset
dataset_xsum = filtered_ds.map(lambda batch: convert_data_to_features(batch, tokenizer), batched=True)
columns = ["input_ids", "labels", "attention_mask"]
dataset_xsum.set_format(type="torch", columns=columns)


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into batches for the seq2seq model.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for a single fine-tuning epoch.
training_args = TrainingArguments(
    output_dir='output_folder',
    num_train_epochs=1,
    warmup_steps=500,
    # One example per device per step, with gradients accumulated over 16 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False,
    # Evaluate every 500 steps.
    evaluation_strategy='steps',
    eval_steps=500,
    # Very large save interval; presumably chosen so that no intermediate
    # checkpoint is written during this short run.
    save_steps=1e6,
)

In [None]:
# Wire the model, training arguments, collator and tokenized train/validation
# splits into a Trainer.
trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_xsum["train"],
                  eval_dataset=dataset_xsum["validation"])

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

In [None]:
def chunks(list_of_elements, batch_size):
    """Lazily split a sequence into consecutive slices of at most `batch_size` items.

    The final slice is shorter when the length is not a multiple of
    `batch_size`.
    """
    batch_starts = range(0, len(list_of_elements), batch_size)
    for begin in batch_starts:
        end = begin + batch_size
        yield list_of_elements[begin:end]

def evaluate_summaries(dataset, metric, model, tokenizer,
                       batch_size=16, device=device,
                       column_text="article",
                       column_summary="highlights"):
    """Generate summaries for `dataset` in batches and score them with `metric`.

    Args:
        dataset: Dataset indexable by column name; `dataset[column_text]` and
            `dataset[column_summary]` must return sequences of strings.
        metric: Metric object providing `add_batch(predictions=..., references=...)`
            and `compute()`.
        model: Model providing `generate(...)`; it is switched to eval mode for
            the duration of the call and restored afterwards.
        tokenizer: Tokenizer used to encode the texts and decode the generations.
        batch_size: Number of texts generated per step.
        device: Device the input tensors are moved to (defaults to the
            notebook-level `device` at definition time).
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The result of `metric.compute()` over all generated summaries.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    # A model coming straight out of training may still be in training mode,
    # which keeps dropout active during generation. Evaluate in eval mode and
    # restore the previous mode afterwards so the caller's model state is
    # unchanged.
    was_training = model.training
    model.eval()
    try:
        for article_batch, target_batch in tqdm(
                zip(article_batches, target_batches), total=len(article_batches)):

            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")

            generated_ids = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                length_penalty=0.8, num_beams=8, max_length=128)

            decoded_summaries = [
                tokenizer.decode(ids, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=True)
                for ids in generated_ids
            ]
            # Replace any literal "<n>" markers left in the decoded text with spaces.
            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
            metric.add_batch(predictions=decoded_summaries, references=target_batch)
    finally:
        model.train(was_training)

    score = metric.compute()
    return score

In [None]:
# This dataset stores the source text under 'dialogue' and the reference under
# 'summary'; the function's default column names ('article'/'highlights') are
# not columns of this dataset, so they must be passed explicitly.
# The filtered test split is used so that rows with a missing text or summary
# (removed before training as well) cannot reach the tokenizer.
rouge_score = evaluate_summaries(
    filtered_ds["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="dialogue", column_summary="summary")

In [None]:
# Mid F-measure of each ROUGE variant for the fine-tuned model.
rouge_dict = {rn: rouge_score[rn].mid.fmeasure for rn in rouge_names}
# Row label names the evaluated model and dataset; the previous "cnn" label
# did not match the XSum data used in this notebook.
pd.DataFrame(rouge_dict, index=["bart-base-xsum"])

## Todo
How would you calculate the BLEU score?