# Train summarization model

**In this notebook..**
*   Create **dataset**-class
*   Train **summarization** model



In [1]:
!pip install transformers datasets

Installing collected packages: xxhash, dill, multiprocess, huggingface-hub, datasets
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
Successfully installed datasets-2.19.1 dill-0.3.8 huggingface-hub-0.23.0 multiprocess-0.70.16 xxhash-3.4.1


All imports

In [2]:
from transformers import BartForConditionalGeneration, BartTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from datasets import load_dataset
from torch.optim import AdamW
from tqdm import tqdm
from torch.nn import DataParallel

Load model and tokenizer

In [3]:
model_checkpoint = "facebook/bart-base"

In [4]:
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [5]:
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Let's create Dataset class

In [6]:
class CNNDailymailDataset(Dataset):
  def __init__(self, dataset, tokenizer):
    self.dataset = dataset
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    example = self.dataset[idx] # Take from dataset
    article = example["article"] # Split into article
    highlights = example["highlights"] # And summary

    input_encoding = self.tokenizer.encode_plus( # Tokenize Inputs
        article,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    label_encoding = self.tokenizer.encode_plus( # Tokenize summary
        highlights,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    input_ids = input_encoding.input_ids.squeeze() # Squeeze all
    attention_mask = input_encoding.attention_mask.squeeze()
    labels = label_encoding.input_ids.squeeze()

    return { # And return
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [8]:
train_dataset = load_dataset("cnn_dailymail", "3.0.0") # Object

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [9]:
train_dataset = train_dataset["train"]

In [10]:
train_cnnds = CNNDailymailDataset(train_dataset, tokenizer)

In [11]:
train_dl = DataLoader(train_cnnds, batch_size=4, num_workers=8, shuffle=True)



In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
optimizer = AdamW(model.parameters(), lr=1e-6)
num_epochs = 1

Wrap the model with DataParallel, to parallelize training process

In [14]:
model.train()
model = DataParallel(model)

Let's Train model

In [15]:
for epoch in range(num_epochs): # For epoch..
    total_loss = 0
    progress_bar = tqdm(train_dl, desc=f"Epoch: {epoch+1}/{num_epochs}")

    for batch in progress_bar: # Take each batch from dataset
        optimizer.zero_grad() # Clear gradients of optimizer

        input_ids = batch["input_ids"].to(device) # 'Unzip' our batch
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Get model predictions
        loss = outputs.loss # Get losses

        loss.backward() # Step
        optimizer.step() # Back propogation

        total_loss += loss.item()
        progress_bar.set_postfix({"Loss": loss.item(), "Avg Loss": total_loss / (progress_bar.n + 1)})

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}") # Inform us about ended epoch

  self.pid = os.fork()
  self.pid = os.fork()
Epoch: 1/1: 100%|██████████| 71779/71779 [4:19:27<00:00,  4.61it/s, Loss=0.302, Avg Loss=0.311]

Epoch 1/1, Loss: 0.3016640841960907





In [16]:
model.module.save_pretrained("summarization_model")
tokenizer.save_pretrained("summarization_tokenizer")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('summarization_tokenizer/tokenizer_config.json',
 'summarization_tokenizer/special_tokens_map.json',
 'summarization_tokenizer/vocab.json',
 'summarization_tokenizer/merges.txt',
 'summarization_tokenizer/added_tokens.json')

In [7]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Path to ONNX-model
onnx_model_path = '/content/summarization_model'

# Destination path Google Drive
destination_path = '/content/drive/MyDrive/Summarization/summarization_model'

# Copy ONNX-model to Google Drive
shutil.copytree(onnx_model_path, destination_path)

'/content/drive/MyDrive/Summarization/summarization_model'

In [19]:
# Path to ONNX-model
onnx_model_path = '/content/summarization_tokenizer'

# Destination path Google Drive
destination_path = '/content/drive/MyDrive/Summarization/summarization_tokenizer'

# Copy ONNX-model to Google Drive
shutil.copytree(onnx_model_path, destination_path)

'/content/drive/MyDrive/Summarization/summarization_tokenizer'

# In conclusion: in this notebook i have trained model for summarization.