In [1]:
# !pip install --quiet pytorch-lightning
# !pip install --quiet transformers
# !pip install --quiet datasets
!pip install --quiet -r requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m720.6/720.6 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [1]:
from datasets import load_dataset, load_from_disk
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5TokenizerFast as T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
from tqdm.auto import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

  from .autonotebook import tqdm as notebook_tqdm


## Data

In [4]:
# xsum_data = load_dataset("xsum")
# xsum_data.save_to_disk('data/xsum')

# cnn_data = load_dataset("cnn_dailymail", "3.0.0")
# cnn_data.save_to_disk('data/cnn_dailymail')

# tldr_data = load_dataset("webis/tldr-17")
# tldr_data.save_to_disk('data/tldr')

In [7]:
data = load_from_disk('data/xsum')
# data = load_from_disk('data/cnn_dailymail')
# data = load_from_disk('data/tldr')

df_train = pd.DataFrame(data=data['train'])
df_val = pd.DataFrame(data=data['validation'])
df_test = pd.DataFrame(data=data['test'])

df_train.head()

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984


In [5]:
df_train.columns = ['text', 'summary', 'id']
df_val.columns = ['text', 'summary', 'id']
df_test.columns = ['text', 'summary', 'id']

In [7]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, text_max_len = 512, summary_max_len = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.text_max_len = text_max_len
        self.summary_max_len = summary_max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        text = self.data.iloc[idx]['text']
        summary = self.data.iloc[idx]['summary']

        text_encoding = self.tokenizer(
            text,
            max_length=self.text_max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        summary_encoding = self.tokenizer(
            summary,
            max_length=self.summary_max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        labels = summary_encoding['input_ids']
        labels[labels == 0] = -100

        return dict(
            text=text,
            summary=summary,
            text_input_ids=text_encoding['input_ids'],
            text_attention_mask=text_encoding['attention_mask'],
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding['attention_mask']
        )

In [8]:
class CustomDataModule(pl.LightningDataModule):
    def __init__(self, df_train, df_val, df_test, tokenizer, batch = 8, text_max_len = 512, summary_max_len = 128):
        super().__init__()
        self.df_train = df_train
        self.df_val = df_val
        self.df_test = df_test
        self.tokenizer = tokenizer
        self.batch = batch
        self.text_max_len = text_max_len
        self.summary_max_len = summary_max_len

    def setup(self, stage=None):
      self.train_dataset = CustomDataset(self.df_train, self.tokenizer, self.text_max_len, self.summary_max_len)
      self.val_dataset = CustomDataset(self.df_val, self.tokenizer, self.text_max_len, self.summary_max_len)
      self.test_dataset = CustomDataset(self.df_test, self.tokenizer, self.text_max_len, self.summary_max_len)

    def train_dataloader(self):
      return DataLoader(self.train_dataset, batch_size=self.batch, shuffle=True, num_workers=2)

    def val_dataloader(self):
      return DataLoader(self.val_dataset, batch_size=self.batch, shuffle=True, num_workers=2)

    def test_dataloader(self):
      return DataLoader(self.test_dataset, batch_size=self.batch, shuffle=True, num_workers=2)

## T5

In [37]:
class T5SummaryModel(pl.LightningModule):
  
  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
    
    output = self.model(input_ids, 
                        attention_mask=attention_mask, 
                        labels=labels, 
                        decoder_attention_mask=decoder_attention_mask
    )
    return output
  
  def training_step(self, batch, batch_idx):
    input_ids = batch['text_input_ids']
    attention_mask = batch['text_attention_mask']
    labels = batch['labels']
    labels_attention_mask = batch['labels_attention_mask']

    output = self(input_ids=input_ids, 
                         attention_mask=attention_mask, 
                         decoder_attention_mask=labels_attention_mask,
                         labels=labels
                         )
    self.log('train_loss', output.loss, prog_bar=True, logger=True)
    return output.loss

  def validation_step(self, batch, batch_idx):
    input_ids = batch['text_input_ids']
    attention_mask = batch['text_attention_mask']
    labels = batch['labels']
    labels_attention_mask = batch['labels_attention_mask']

    output = self(input_ids=input_ids, 
                         attention_mask=attention_mask, 
                         decoder_attention_mask=labels_attention_mask,
                         labels=labels
                         )
    self.log('val_loss', output.loss, prog_bar=True, logger=True)
    return output.loss
  
  def test_step(self, batch, batch_idx):
    input_ids = batch['text_input_ids']
    attention_mask = batch['text_attention_mask']
    labels = batch['labels']
    labels_attention_mask = batch['labels_attention_mask']

    output = self(input_ids=input_ids, 
                         attention_mask=attention_mask, 
                         decoder_attention_mask=labels_attention_mask,
                         labels=labels
                         )
    self.log('test_loss', output.loss, prog_bar=True, logger=True)
    return output.loss
  
  def configure_optimizers(self):
    return AdamW(self.parameters(), lr=0.0001)

In [42]:
N_EPOCHS = 3
BATCH_SIZE = 8

tokenizer = T5Tokenizer.from_pretrained('t5-base')
data_module = CustomDataModule(df_train, df_val, df_test, tokenizer, batch=BATCH_SIZE)
model = T5SummaryModel()

In [43]:
checkpoint_callback = ModelCheckpoint(dirpath="checkpoints", filename="best_checkpoints", save_top_k=1,
                                         verbose=True, monitor="val_loss", mode="min")

trainer = pl.Trainer(callbacks=checkpoint_callback, max_epochs=N_EPOCHS, accelerator="gpu", enable_progress_bar=True)

trainer.fit(model, data_module)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

ValueError: ignored

In [None]:
trained_model = T5SummaryModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
trained_model.freeze()

def summarize(text):
  text_encoding = tokenizer(text, max_lenght=512, padding="max_length", 
    truncation=True, return_attention_mask=True, add_special_tokens=True, return_tensors="pt"
  )

  generated_ids = trained_model.model.generate(
      input_ids = text_encoding['input_ids'],
      attention_mask=text_encoding['attention_mask'],
      max_length=150, 
      num_beams=2,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True
  )

  predictions = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) 
                 for gen_id in generated_ids]

  return "".join(predictions)


## BART

## GPT2

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer=AutoTokenizer.from_pretrained('gpt2-medium')
model=AutoModelWithLMHead.from_pretrained('gpt2-medium', return_dict=True)

sequence = df_test['text'][0]
inputs=tokenizer.encode("summarize: " + sequence,return_tensors='pt', max_length=512, truncation=True)
output = model.generate(inputs, min_length=80, max_length=100)
summary=tokenizer.decode(output[0])



Downloading (…)lve/main/config.json: 100%|██████████| 718/718 [00:00<?, ?B/s] 
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 26.1MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 24.0MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.28MB/s]
Downloading model.safetensors: 100%|██████████| 1.52G/1.52G [01:05<00:00, 23.2MB/s]
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<?, ?B/s] 
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 512, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [16]:
print(summary, "  \n\n ", df_test['summary'][0])

summarize: Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.
Andrew Stevens, who works in Welsh prisons trying to secure housing for prison leavers, said the need for