# Finetuning Text Summarization Notebook:

# Hugging Face T5 transformer & XSum dataset

In [1]:
from os.path import join, isfile
from os import listdir
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from rouge_score import rouge_scorer
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import  DataLoader, RandomSampler, SequentialSampler #Dataset,
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import T5Tokenizer, T5ForConditionalGeneration

**FYI:** pd.__version__ '1.3.4' gets this error AttributeError: 'functools.partial' object has no attribute '__name__'
https://github.com/pandas-dev/pandas/issues/42748 . pip install pandas==1.2.5 removes error. 

## Loading data from Hugging Face 

In [2]:
from datasets import load_dataset
datasets = load_dataset('xsum')
# datasets = load_dataset('samsum')

df = datasets['train'].to_pandas()

Using custom data configuration default
Reusing dataset xsum (C:\Users\megra\.cache\huggingface\datasets\xsum\default\1.2.0\32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
df.head()

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984


In [4]:
# shortening data down for faster tutorial training 
df = df.iloc[:10000, :2].copy()
df.columns = ['ctext', 'text']
df = df[['text', 'ctext']].copy()
print(df.shape)
df.head()

(10000, 2)


Unnamed: 0,text,ctext
0,Clean-up operations are continuing across the ...,"The full cost of damage in Newton Stewart, one..."
1,Two tourist buses have been destroyed by fire ...,A fire alarm went off at the Holiday Inn in Ho...
2,Lewis Hamilton stormed to pole position at the...,Ferrari appeared in a position to challenge un...
3,A former Lincolnshire Police officer carried o...,"John Edward Bates, formerly of Spalding, Linco..."
4,An armed man who locked himself into a room at...,Patients and staff were evacuated from Cerahpa...


In [5]:
df['ctext'].values[3]

'John Edward Bates, formerly of Spalding, Lincolnshire, but now living in London, faces a total of 22 charges, including two counts of indecency with a child.\nThe 67-year-old is accused of committing the offences between March 1972 and October 1989.\nMr Bates denies all the charges.\nGrace Hale, prosecuting, told the jury that the allegations of sexual abuse were made by made by four male complainants and related to when Mr Bates was a scout leader in South Lincolnshire and Cambridgeshire.\n"The defendant says nothing of that sort happened between himself and all these individuals. He says they are all fabricating their accounts and telling lies," said Mrs Hale.\nThe prosecutor claimed Mr Bates invited one 15 year old to his home offering him the chance to look at cine films made at scout camps but then showed him pornographic films.\nShe told the jury that the boy was then sexually abused leaving him confused and frightened.\nMrs Hale said: "The complainant\'s recollection is that on

In [6]:
df['text'].values[3]

'A former Lincolnshire Police officer carried out a series of sex attacks on boys, a jury at Lincoln Crown Court was told.'

## Model building 

In [7]:
class T5Finetuner(pl.LightningModule):
    '''
    Documentation-In-Progress
    '''

    def __init__(self, df = pd.DataFrame):
        super().__init__()
        self.save_hyperparameters()
        self.source_len = 512
        self.summ_len = 200
        self.lr = .0001
        self.bs = 8
        self.num_workers = 8
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
        self.tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
        self.data = df
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.output = 'temp/'
        self.name = 'test'

    def encode_text(self, context, text):
        ctext = str(context) # context text 
        ctext = ' '.join(ctext.split())
        text = str(text) # summarized text
        text = ' '.join(text.split())
        source = self.tokenizer.batch_encode_plus([ctext], 
                                                max_length= self.source_len, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], 
                                                max_length= self.summ_len,
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
        y = target['input_ids']
        target_id = y[:, :-1].contiguous()
        target_label = y[:, 1:].clone().detach()
        target_label[y[:, 1:] == self.tokenizer.pad_token_id] = -100 #edge-case  handling when no labels are there
        return source['input_ids'], source['attention_mask'], target_id, target_label
    
    def prepare_data(self):
        source_ids, source_masks, target_ids, target_labels = [], [], [], [] 
        for _, row in self.data.iterrows():
            source_id, source_mask, target_id, target_label = self.encode_text(row.ctext, row.text)
            source_ids.append(source_id)
            source_masks.append(source_mask)
            target_ids.append(target_id)
            target_labels.append(target_label)

        # Transforming lists into tensors
        source_ids = torch.cat(source_ids, dim=0)
        source_masks = torch.cat(source_masks, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        target_labels = torch.cat(target_labels, dim=0)
        # Splitting data into standard train, val, and test sets 
        data = TensorDataset(source_ids, source_masks, target_ids, target_labels)
        train_size, val_size = int(0.8 * len(data)), int(0.1 * len(data))
        test_size = len(data) - (train_size + val_size)
        self.train_dat, self.val_dat, self.test_dat = \
            random_split(data, [train_size, val_size, test_size])
    
    def forward(self, batch, batch_idx):
        source_ids, source_mask, target_ids, target_labels = batch[:4]
        return self.model(
            input_ids = source_ids, 
            attention_mask = source_mask, 
            decoder_input_ids=target_ids, 
            labels=target_labels
        )
        
    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('train loss', loss, prog_bar = True, logger = True)
        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('valid loss', loss, prog_bar = True, logger = True)
        return {'loss': loss}

    def validation_epoch_end(self, outputs): 
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        out = {'val_loss': loss}
        return {**out, 'log': out}

    def test_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('test loss', loss, prog_bar = True, logger = True)
        return {'loss': loss}

    def test_epoch_end(self, outputs):
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        out = {'test_loss': loss}
        return {**out, 'log': out}
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dat,
            batch_size=self.bs,
            num_workers=self.num_workers, 
            sampler=RandomSampler(self.train_dat)
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dat, 
            batch_size=self.bs, 
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.val_dat)
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dat, 
            batch_size=self.bs, 
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.test_dat)
        )    

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=1e-4)
        return {'optimizer': optimizer}
    
    def save_core_model(self):
        store_path = join(self.output, self.name, 'core')
        self.model.save_pretrained(store_path)
        self.tokenizer.save_pretrained(store_path)

In [8]:
class MetricsCallback(pl.Callback):
    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        self.metrics.append(trainer.callback_metrics)

## Selecting model name 

In [9]:
#######################
MODEL_NAME = 't5-base'
#######################
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5Finetuner(df = df)
num_epochs = 5

## Loading tensorboard for logging 

In [10]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs --host localhost --port 7000

Reusing TensorBoard on port 7000 (pid 42984), started 1:28:47 ago. (Use '!kill 42984' to kill it.)

In [11]:
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints', 
    filename = 'best-checkpoint', 
    save_top_k = 1, 
    verbose = True, 
    monitor = 'val_loss', 
    mode= 'min', 
)

logger = TensorBoardLogger('lightning_logs', name  = 'fine_tuning_text_summarizer_xsumdata_v_0_1') 

trainer = pl.Trainer(
    logger = logger, 
    checkpoint_callback = checkpoint_callback, 
    max_epochs = num_epochs, 
    gpus = 1, 
)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [12]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_deprecation(


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

## Loading best model in training 

In [13]:
trained_model = T5Finetuner.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

In [15]:
trained_model.freeze()

## Inference 

In [22]:
class Inference():
    def get_example(index = int):
        full_text = df['ctext'].values[index]
        summary = df['text'].values[index]
        return full_text, summary

    def summarize(trained_model, tokenizer, text): 
        text_encoding = tokenizer(
            text,
            max_length = 512, 
            padding = 'max_length', 
            truncation = True, 
            return_attention_mask = True, 
            return_tensors = 'pt'
        )
    #     generated_ids = trained_model.model.generate(
        generated_ids = trained_model.model.generate(
            input_ids=text_encoding['input_ids'], 
            attention_mask = text_encoding['attention_mask'], 
            max_length = 200,
            num_beams = 2,
            repetition_penalty = 2.5,
            length_penalty = 1.0,
    #         early_stopping = True
        )
        preds = [
            tokenizer.decode(gen_id, 
                skip_special_tokens = True, 
                clean_up_tokenization_spaces = True)
            for gen_id in generated_ids
        ]
        return "".join(preds)
        
example_index = 50

example_text, example_summary = Inference.get_example(example_index)
print('~~~ Original text: \n\n', example_text, '\n\n\n ~~~ Summary: \n\n', example_summary)

~~~ Original text: 

 On-loan striker Holman opened his account on his home debut with a fine half-volley before doubling his tally with a 12-yard finish.
Danny Wright then matched Holman's feat, heading home from James Rowe's corner before striking from six yards for a second-half double of his own.
Alex Wall snatched a consolation goal but the Robins secured their seventh win in eight in the National League.
Cheltenham remain second in the table, one point behind Forest Green Rovers, while Bromley slip to 14th having won just once in their last 11 games.
Cheltenham Town boss Gary Johnson told BBC Radio Gloucestershire:
Media playback is not supported on this device
"It's the best we've played for a little while. We've still been getting the results, but I enjoyed the way we played today - we created lots of chances.
"I was really pleased with our performance and hopefully our supporters can go home nice and happy.
"Our passing had a bit of an end product to it without going long. All

In [23]:
# generating prediction from T5Finetuner method summarize 
prediction = Inference.summarize(trained_model, tokenizer, example_text)
prediction

'Cheltenham beat Bromley 1-0 to move up to 14th in the National League table.'

In [24]:
#using RougeScorer to assess example 
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scorer.score(example_summary, prediction)

{'rouge1': Score(precision=0.2, recall=0.2, fmeasure=0.20000000000000004),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.13333333333333333, recall=0.13333333333333333, fmeasure=0.13333333333333333)}