In [None]:
!pip install -q transformers rouge-score accelerate evaluate datasets


## Import the dependencies

In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW, get_linear_schedule_with_warmup, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm import tqdm

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
import torch.nn.functional as F
from accelerate import Accelerator
import evaluate
import warnings
warnings.filterwarnings('ignore')


In [None]:
from datasets import load_dataset

In [None]:
import nltk
from nltk import sent_tokenize

# `sent_tokenize` relies on the Punkt sentence-splitting resources. Older NLTK
# releases look them up as "punkt", recent releases as "punkt_tab", so fetch
# both. `quiet=True` keeps the download log out of the notebook output.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## Loading the data

In [None]:
# Load configuration "3.0.0" of the CNN/DailyMail corpus; later cells read its
# "train", "validation" and "test" splits.
data = load_dataset(path="cnn_dailymail", name="3.0.0")

In [None]:
def show_samples(data, num_samples=3, seed=42):
    """Print a handful of shuffled article/summary pairs from the training split.

    Args:
        data: Mapping of split name to dataset; only ``data["train"]`` is read.
        num_samples: How many examples to print.
        seed: Seed forwarded to ``shuffle`` so the selection is repeatable.
    """
    shuffled = data["train"].shuffle(seed=seed)
    picked = shuffled.select(range(num_samples))
    for row in picked:
        article, summary = row["article"], row["highlights"]
        print(f"\n'>> Article: {article}'")
        print(f"'>> Summary: {summary}'")

# Preview three shuffled training examples (article followed by its reference summary).
show_samples(data, num_samples=3, seed=42)


'>> Article: By . Anthony Bond . PUBLISHED: . 07:03 EST, 2 March 2013 . | . UPDATED: . 08:07 EST, 2 March 2013 . Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide poisoning from a cooker. Tragic: The inquests have opened into the deaths of three members of the same family who were found in their static caravan last weekend. John and Audrey Cook are pictured . Awful: The family died following carbon monoxide poisoning at this caravan at the Tremarle Home Park in Camborne, Cornwall . It is also believed there was no working carbon monoxide detec

## Loading the pre-trained model and preprocessing the data

In [None]:

# Tokenizer for the "t5-small" checkpoint; the same checkpoint name is used to load the model below.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [None]:
# Examples per batch, shared by the DataLoaders and the Trainer arguments.
batch_size = 8

# Truncation limits handed to the tokenizer as `max_length`:
# articles (model input) first, highlights (labels) second.
max_input_length, max_target_length = 512, 128

In [None]:
def preprocess(examples, prefix="summarize: "):
  """Tokenize articles and their highlights for seq2seq fine-tuning.

  T5 checkpoints are driven by a textual task prefix, so every article is
  prepended with ``prefix`` before tokenization. This keeps the training inputs
  in the same format the summarization pipeline is expected to feed the model
  at inference time.

  Args:
      examples: A batch (or single example) with "article" and "highlights"
          entries, as passed by ``datasets.map``.
      prefix: Task prefix prepended to every article. Pass "" to disable it.

  Returns:
      The tokenizer output for the articles with an extra "labels" entry
      holding the token ids of the highlights.
  """
  articles = examples["article"]
  # `map(..., batched=True)` passes lists; a non-batched call passes a single string.
  if isinstance(articles, str):
    sources = prefix + articles
  else:
    sources = [prefix + article for article in articles]

  inputs = tokenizer(sources, max_length = max_input_length, truncation = True)
  # `text_target` tells the tokenizer that this text is the decoder target.
  labels = tokenizer(text_target = examples["highlights"], max_length = max_target_length, truncation = True)
  inputs["labels"] = labels["input_ids"]
  return inputs

# Run `preprocess` over the loaded data in batches; the result gains the
# tokenizer outputs plus the `labels` column that `preprocess` adds.
dataset = data.map(preprocess, batched = True)

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

In [None]:
# Drop the original raw columns so only the tokenized fields remain. Only
# columns that are still present are removed, so re-running this cell is a
# no-op instead of an error.
stale_columns = [
    name for name in data["train"].column_names
    if name in dataset["train"].column_names
]
if stale_columns:
    dataset = dataset.remove_columns(stale_columns)

In [None]:
# Switch the dataset's output format to "torch" (modifies `dataset` in place).
dataset.set_format("torch")

In [None]:
# Load the pre-trained "t5-small" sequence-to-sequence model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")


In [None]:
# Batch collator built from the tokenizer and model; it is passed to both the
# DataLoaders and the Trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)


In [None]:
# Stand-alone PyTorch loaders over the tokenized splits.
# NOTE(review): the Trainer cells below receive the dataset splits directly and
# do not reference these loaders.

# Training batches are reshuffled on every pass.
train_dataloader = DataLoader(
    dataset["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches keep the dataset order (no shuffling requested).
val_dataloader = DataLoader(
    dataset["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:

# ROUGE metric object used by `compute_metrics` to score generated summaries.
rouge_score = evaluate.load("rouge")

## Function for computing the evaluation metrics (ROUGE)

In [None]:

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays as
            provided by the trainer during evaluation.

    Returns:
        dict mapping each ROUGE score name, plus ``"gen_len"`` (mean number of
        non-padding tokens per prediction), to a value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some model outputs arrive as a tuple whose first element holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a valid token id, so swap it for
    # the pad token before decoding. It is applied to predictions as well because
    # batches of generated ids can be padded with the same sentinel.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE-Lsum is computed per sentence and expects sentences to be separated
    # by newlines; without this it degenerates to plain ROUGE-L.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_score.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Training

In [None]:
gcs_path = "gs://t5m-s/Summarizer/T5_Model" # Destination for the model weights; passed to the training arguments as `output_dir`.

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, score with
# generated text (needed for ROUGE), and train in mixed precision.
# NOTE(review): `output_dir` is a "gs://" URI - confirm the trainer can write
# there directly rather than treating it as a local relative path.
training_args = Seq2SeqTrainingArguments(
    output_dir = gcs_path,
    overwrite_output_dir = True,
    num_train_epochs = 8,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    warmup_steps = 0,
    weight_decay = 0.01,
    learning_rate = 3e-5,
    predict_with_generate = True,
    # Without an explicit limit, evaluation-time generation stops at the default
    # maximum length (20 tokens unless the model config overrides it), so ROUGE
    # would be computed on truncated summaries. Allow as many tokens as the
    # reference labels are truncated to.
    generation_max_length = max_target_length,
    fp16 = True,
    push_to_hub = False,
)



In [None]:
# Wire the model, data, and metric function into the trainer.
# Per-epoch evaluation uses the validation split so the test split stays unseen
# until a final, explicit evaluation (e.g. `trainer.evaluate(dataset["test"])`).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration held by `training_args`.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.0058,1.82093,0.247,0.1174,0.2039,0.2039,18.9992
2,1.9949,1.80037,0.2469,0.117,0.2036,0.2036,18.9995
3,1.948,1.793804,0.2477,0.1176,0.2047,0.2047,18.9999
4,1.9459,1.788438,0.2478,0.1182,0.2049,0.2049,18.9999
5,1.924,1.784401,0.2477,0.1179,0.2045,0.2046,18.9996
6,1.9301,1.782364,0.2477,0.1179,0.2044,0.2044,18.9999
7,1.9284,1.780812,0.2474,0.1177,0.2044,0.2045,18.9999
8,1.9217,1.779498,0.2473,0.1174,0.2041,0.2042,18.9999


TrainOutput(global_step=287120, training_loss=1.9475382236492405, metrics={'train_runtime': 29355.572, 'train_samples_per_second': 78.244, 'train_steps_per_second': 9.781, 'total_flos': 3.1086704005093786e+17, 'train_loss': 1.9475382236492405, 'epoch': 8.0})

In [None]:
# Called without arguments, so this scores the `eval_dataset` given to the trainer.
trainer.evaluate()

{'eval_loss': 1.779497742652893,
 'eval_rouge1': 0.2473,
 'eval_rouge2': 0.1174,
 'eval_rougeL': 0.2041,
 'eval_rougeLsum': 0.2042,
 'eval_gen_len': 18.9999,
 'eval_runtime': 434.4614,
 'eval_samples_per_second': 26.447,
 'eval_steps_per_second': 3.308,
 'epoch': 8.0}

## Uploading the model to the Hugging Face Hub

In [None]:
# Log in to the Hugging Face Hub from inside the notebook before pushing the model.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the trained model through the trainer; the call returns the repository URL.
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

'https://huggingface.co/dheeraj-kj/T5_Model/tree/main/'

## Inference

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the Hub repository "dheeraj-kj/T5_Model".
summarizer = pipeline("summarization", model="dheeraj-kj/T5_Model")


In [None]:
# Sample passage to summarize. Adjacent string literals are concatenated, and
# each paragraph ends with a trailing space so sentences at the paragraph
# boundaries do not run together (e.g. "entertainment.One").
text = (
    "Artificial intelligence (AI) is a branch of computer science that aims to create machines capable of intelligent behavior. It involves the development of algorithms and models that enable computers to perform tasks that typically require human intelligence. AI applications can be found in various fields, including healthcare, finance, education, and entertainment. "
    "One prominent area of AI is machine learning, where systems learn from data to improve their performance on a specific task. Deep learning, a subset of machine learning, involves neural networks with many layers, allowing the model to automatically learn hierarchical representations. This has led to significant advancements in image and speech recognition, natural language processing, and other domains. "
    "While AI has shown remarkable progress, ethical considerations and responsible development are crucial. Issues related to bias in algorithms, privacy concerns, and the impact on employment are topics that researchers and policymakers are actively addressing. As AI continues to evolve, the need for ethical guidelines and regulations becomes increasingly important to ensure its positive impact on society."
)

In [None]:
# Summarize the sample passage with the loaded pipeline.
summarizer(text)

[{'summary_text': 'Artificial intelligence is a branch of computer science that aims to create machines capable of intelligent behavior . It involves the development of algorithms that enable computers to perform tasks that typically require human intelligence .'}]