# Finetuning Text Summarization Notebook:

# Hugging Face T5 transformer & XSum dataset

In [1]:
from os.path import join, isfile
from os import listdir
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from rouge_score import rouge_scorer
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import  DataLoader, RandomSampler, SequentialSampler #Dataset,
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import T5Tokenizer, T5ForConditionalGeneration

**FYI:** pd.__version__ '1.3.4' gets this error AttributeError: 'functools.partial' object has no attribute '__name__'
https://github.com/pandas-dev/pandas/issues/42748 . pip install pandas==1.2.5 removes error. 

## Loading data from Hugging Face 

In [2]:
from datasets import load_dataset
datasets = load_dataset('xsum')
# datasets = load_dataset('samsum')

df = datasets['train'].to_pandas()
# shortening data down for faster tutorial training 
df = df.iloc[:3000, :2].copy()
df.columns = ['ctext', 'text']
df.head()

Using custom data configuration default
Reusing dataset xsum (C:\Users\megra\.cache\huggingface\datasets\xsum\default\1.2.0\32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,ctext,text
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...


## Model building 

In [3]:
class T5Finetuner(pl.LightningModule):
    '''
    Documentation-In-Progress
    '''

    def __init__(self, df = pd.DataFrame):
        super().__init__()
        self.save_hyperparameters()
        self.source_len = 512
        self.summ_len = 200
        self.lr = .0001
        self.bs = 8
        self.num_workers = 8
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
        self.tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
        self.data = df
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.output = 'temp/'
        self.name = 'test'

    def encode_text(self, context, text):
        ctext = str(context) # context text 
        ctext = ' '.join(ctext.split())
        text = str(text) # summarized text
        text = ' '.join(text.split())
        source = self.tokenizer.batch_encode_plus([ctext], 
                                                max_length= self.source_len, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], 
                                                max_length= self.summ_len,
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
        y = target['input_ids']
        target_id = y[:, :-1].contiguous()
        target_label = y[:, 1:].clone().detach()
        target_label[y[:, 1:] == self.tokenizer.pad_token_id] = -100 #edge-case  handling when no labels are there
        return source['input_ids'], source['attention_mask'], target_id, target_label
    
    def prepare_data(self):
        source_ids, source_masks, target_ids, target_labels = [], [], [], [] 
        for _, row in self.data.iterrows():
            source_id, source_mask, target_id, target_label = self.encode_text(row.ctext, row.text)
            source_ids.append(source_id)
            source_masks.append(source_mask)
            target_ids.append(target_id)
            target_labels.append(target_label)

        # Transforming lists into tensors
        source_ids = torch.cat(source_ids, dim=0)
        source_masks = torch.cat(source_masks, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        target_labels = torch.cat(target_labels, dim=0)
        # Splitting data into standard train, val, and test sets 
        data = TensorDataset(source_ids, source_masks, target_ids, target_labels)
        train_size, val_size = int(0.8 * len(data)), int(0.1 * len(data))
        test_size = len(data) - (train_size + val_size)
        self.train_dat, self.val_dat, self.test_dat = \
            random_split(data, [train_size, val_size, test_size])
    
    def forward(self, batch, batch_idx):
        source_ids, source_mask, target_ids, target_labels = batch[:4]
        return self.model(
            input_ids = source_ids, 
            attention_mask = source_mask, 
            decoder_input_ids=target_ids, 
            labels=target_labels
        )
        
    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('train loss', loss, prog_bar = True, logger = True)
        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('valid loss', loss, prog_bar = True, logger = True)
        return {'loss': loss}

    def validation_epoch_end(self, outputs): 
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        out = {'val_loss': loss}
        return {**out, 'log': out}

    def test_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('test loss', loss, prog_bar = True, logger = True)
        return {'loss': loss}

    def test_epoch_end(self, outputs):
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        out = {'test_loss': loss}
        return {**out, 'log': out}
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dat,
            batch_size=self.bs,
            num_workers=self.num_workers, 
            sampler=RandomSampler(self.train_dat))

    def val_dataloader(self):
        return DataLoader(
            self.val_dat, 
            batch_size=self.bs, 
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.val_dat)
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dat, 
            batch_size=self.bs, 
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.test_dat)
        )    

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=1e-4)
        return {'optimizer': optimizer}
    
    def summarize(text): 
        text_encoding = tokenizer(
            text,
            max_length = 512, 
            padding = 'max_length', 
            truncation = True, 
            return_attention_mask = True, 
            add_special_characters = True, 
            return_tensors = 'pt'
        )
        generated_ids = trained_model.model.generate(
            input_ids=text_encoding['input_ids'], 
            attention_mask = text_encoding['attention_mask'], 
            max_length = 80,
            num_beams = 2,
            repetition_penalty = 2.5,
            length_penalty = 1.0,
            early_stopping = True
        )
        preds = [
            tokenizer.decode(gen_id, 
                skip_special_tokens = True, 
                clean_up_tokenization_spaces = True)
            for gen_id in generated_ids
        ]
        return "".join(preds)
        
    def save_core_model(self):
        store_path = join(self.output, self.name, 'core')
        self.model.save_pretrained(store_path)
        self.tokenizer.save_pretrained(store_path)

In [4]:
class MetricsCallback(pl.Callback):
    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        self.metrics.append(trainer.callback_metrics)

## Selecting model name 

In [5]:
#######################
MODEL_NAME = 't5-base'
#######################
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5Finetuner(df = df)
num_epochs = 5

## Loading tensorboard for logging 

In [6]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs --host localhost --port 4000

In [7]:
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints', 
    filename = 'best-checkpoint', 
    save_top_k = 1, 
    verbose = True, 
    monitor = 'val_loss', 
    mode= 'min', 
)

logger = TensorBoardLogger('lightning_logs', name  = 'custom_summary_from_xsum_data') 

trainer = pl.Trainer(
    logger = logger, 
    checkpoint_callback = checkpoint_callback, 
    max_epochs = num_epochs, 
    gpus = 1, 
)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [8]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_deprecation(


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

## Loading best model in training 

In [9]:
trained_model = T5Finetuner.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

In [10]:
trained_model.freeze()

## Inference 

In [11]:
def get_example(index = int):
    full_text = df['ctext'].values[index]
    summary = df['text'].values[index]
    return full_text, summary


example_index = 55

example_text, example_summary = get_example(example_index)
print('~~~ Original text: \n\n', example_text, '\n\n\n ~~~ Summary: \n\n', example_summary)

~~~ Original text: 

 North and his fellow Wales wing Alex Cuthbert scored the Lions's tries in the 23-21 win in Brisbane.
Australia would have won had replacement Kurtley Beale not failed with two late penalties.
"It was a remarkable win and a remarkable result," North said.
"It was an unbelievable feeling. We won and I got my first [Lions] Test try. It doesn't get much better than that.
"My heart was in my mouth at the end. I think everyone was feeling the same.
"It's always good to get a win, and while it was tough towards the end we are delighted to have got that first win in the series.
"We've spoken a lot about momentum these past few weeks, and that could be huge for us now heading into the second Test next week."
The Wallabies were leading 7-3 through Israel Folau's converted before the Lions hit back with a fine individual try from North.
The 21-year-old, making his Lions Test debut, effortlessly beat three players during a 60-metre run before crossing for his third try of the

In [12]:
# generating prediction from T5Finetuner method summarize 
prediction = T5Finetuner.summarize(example_text)
prediction

Keyword arguments {'add_special_characters': True} not recognized.


'denied his first Test try as Wales beat the Wallabies 23-21 in their first Test win of the season.'

In [13]:
#using RougeScorer to assess example 
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scorer.score(example_summary, prediction)

{'rouge1': Score(precision=0.25, recall=0.23809523809523808, fmeasure=0.24390243902439024),
 'rouge2': Score(precision=0.05263157894736842, recall=0.05, fmeasure=0.05128205128205128),
 'rougeL': Score(precision=0.2, recall=0.19047619047619047, fmeasure=0.1951219512195122)}