In [1]:
%pip install transformers
# T5Tokenizer requires the SentencePiece library
%pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [4]:
# define logging
import logging
LOG_FILE = f"./outputs/t5_finetuning_{np.datetime64('now')}"
logging.basicConfig(filename=LOG_FILE, filemode="w", encoding='utf-8', level=logging.DEBUG)

In [6]:
# import modules from huggingface
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

### Load data

In [46]:
INPUT_DIR = "./data/"
train_df = pd.read_csv(INPUT_DIR+"qg_train.csv", index_col=0)[:10000]
val_df = pd.read_csv(INPUT_DIR+"qg_dev.csv", index_col=0)[:500]

In [48]:
# for question generation task, prepend answer to sentence
train_df["input"] = "answer: "+train_df["answer"]+" context: "+train_df["context"]
val_df["input"] = "answer: "+val_df["answer"]+" context: "+val_df["context"]

In [49]:
train_df.head()

Unnamed: 0,question,sentence,answer,context,exact,input
0,When did Beyonce start becoming popular?,"Born and raised in Houston, Texas, she perform...",in the late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,True,answer: in the late 1990s context: Beyoncé Gis...
1,What areas did Beyonce compete in when she was...,"Born and raised in Houston, Texas, she perform...",singing and dancing,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,True,answer: singing and dancing context: Beyoncé G...
2,When did Beyonce leave Destiny's Child and bec...,Their hiatus saw the release of Beyoncé's debu...,2003,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,True,answer: 2003 context: Beyoncé Giselle Knowles-...
3,In what city and state did Beyonce grow up?,"Born and raised in Houston, Texas, she perform...","Houston, Texas",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,True,"answer: Houston, Texas context: Beyoncé Gisell..."
4,In which decade did Beyonce become famous?,"Born and raised in Houston, Texas, she perform...",late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,True,answer: late 1990s context: Beyoncé Giselle Kn...


In [50]:
logging.info(f"[Data]: Reading data...\n")

source_text = "input"
target_text = "question"

# reset index because CSV index is a mess
train_dataset = train_df.reset_index()[[source_text,target_text]]
val_dataset = val_df.reset_index()[[source_text,target_text]]

print(f"Training {train_dataset.shape} samples")
print(f"Test {val_dataset.shape} samples")

Training (11976, 2) samples
Test (500, 2) samples


### Define dataset class and functions

In [51]:
class QuestionSentenceDataset(Dataset):
  """
  Creating a custom dataset 
  """

  def __init__(self, data, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = data
    self.source_len = source_len
    self.out_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    return len(self.target_text)

  def __getitem__(self, index):
    source_text = str(self.source_text[index])
    target_text = str(self.target_text[index])
    
    source_text = ' '.join(source_text.split())
    target_text = ' '.join(target_text.split())

    # from text to ids
    source = self.tokenizer.batch_encode_plus([source_text], max_length= self.source_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
    target = self.tokenizer.batch_encode_plus([target_text], max_length= self.out_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')

    source_ids = source['input_ids'].squeeze()
    source_mask = source['attention_mask'].squeeze()
    target_ids = target['input_ids'].squeeze()
    target_mask = target['attention_mask'].squeeze()

    return {
        'source_ids': source_ids.to(dtype=torch.long), 
        'source_mask': source_mask.to(dtype=torch.long), 
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

In [52]:
def train(epoch, tokenizer, model, device, loader, optimizer):

  """
  Function for training

  """

  model.train()
  for _, data in enumerate(loader):
    y = data['target_ids'].to(device, dtype = torch.long)
    y_ids = y[:, :-1].contiguous()
    lm_labels = y[:, 1:].clone().detach()
    # padding of labels is done with a token with id -100
    # which is a special token automatically ignored by PyTorch loss functions
    lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100 
    ids = data['source_ids'].to(device, dtype = torch.long)
    mask = data['source_mask'].to(device, dtype = torch.long)

    outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
    loss = outputs[0]

    if _%10==0:
      print(f"Epoch {epoch}, Step {_}, Loss = {loss}")

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [53]:
def validate(epoch, tokenizer, model, device, loader):

  """
  Function to get predictions from model

  """
  model.eval()
  predictions = []
  actuals = []
  with torch.no_grad():
      for _, data in enumerate(loader):
          y = data['target_ids'].to(device, dtype = torch.long)
          ids = data['source_ids'].to(device, dtype = torch.long)
          mask = data['source_mask'].to(device, dtype = torch.long)

          generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=150, 
              num_beams=2,
              repetition_penalty=2.5, 
              length_penalty=1.0, 
              early_stopping=True
              )
          
          # get words from ids
          preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
          target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
          if _%10==0:
              print(f'Completed {_}')

          predictions.extend(preds)
          actuals.extend(target)

  return predictions, actuals

In [54]:
def T5Trainer(train_dataset, val_dataset, source_text, target_text, model_params, output_dir="./outputs/" ):
  
  """
  T5 train and validate

  """

  # to be able to reproduce
  torch.manual_seed(model_params["SEED"])
  np.random.seed(model_params["SEED"])
  torch.backends.cudnn.deterministic = True

  logging.info(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # encode text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # using T5 with language model layer
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)
  
  # create dataloaders
  train_qsd = QuestionSentenceDataset(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
  val_qsd = QuestionSentenceDataset(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)

  train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }


  val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }

  train_loader = DataLoader(train_qsd, **train_params)
  val_loader = DataLoader(val_qsd, **val_params)

  # training loop
  optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])

  logging.info(f'[Initiating Fine Tuning]...\n')

  for epoch in range(model_params["TRAIN_EPOCHS"]):
      train(epoch, tokenizer, model, device, train_loader, optimizer)
      
      # checkpoint
      if epoch%10:
        cp_path = os.path.join(output_dir, f"checkpoint{epoch}")
        model.save_pretrained(cp_path)
      
  logging.info(f"[Saving Model]...\n")
  # save model, tokenizer and configs
  path = os.path.join(output_dir, "final")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)


  # evaluating test dataset
  logging.info(f"[Initiating Validation]...\n")
  for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'generated':predictions,'actual':actuals})
    final_df.to_csv(os.path.join(output_dir,'predictions.csv'))
  
  logging.info(f"[Validation Completed.]\n")
  print(f"""[Model] Model saved @ {os.path.join(output_dir, "checkpoints")}\n""")
  print(f"""[Validation] Generated questions saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
  print(f"""[Logs] Logs saved @ {LOG_FILE}\n""")

### Run

In [55]:
model_params={
    "MODEL":"t5-small",            # pretrained model
    "TRAIN_BATCH_SIZE":8,          # training batch size
    "VALID_BATCH_SIZE":8,          # validation batch size
    "TRAIN_EPOCHS":3,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":1e-3,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":64,   # max length of source text
    "MAX_TARGET_TEXT_LENGTH":15,   # max length of target text
    "SEED": 42                     # set seed for reproducibility 

}

In [56]:
T5Trainer(train_dataset, val_dataset, source_text="input", target_text="question", model_params=model_params, output_dir="./outputs/")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Epoch 0, Step 0, Loss = 5.642310619354248
Epoch 0, Step 10, Loss = 3.9199302196502686
Epoch 0, Step 20, Loss = 3.993957996368408
Epoch 0, Step 30, Loss = 4.337299823760986
Epoch 0, Step 40, Loss = 3.613666534423828
Epoch 0, Step 50, Loss = 3.673809289932251
Epoch 0, Step 60, Loss = 3.2799713611602783
Epoch 0, Step 70, Loss = 3.6796414852142334
Epoch 0, Step 80, Loss = 2.8729088306427
Epoch 0, Step 90, Loss = 3.1814584732055664
Epoch 0, Step 100, Loss = 3.1381988525390625
Epoch 0, Step 110, Loss = 3.5035743713378906
Epoch 0, Step 120, Loss = 2.7879228591918945
Epoch 0, Step 130, Loss = 3.412121057510376
Epoch 0, Step 140, Loss = 3.274005889892578
Epoch 0, Step 150, Loss = 2.9916586875915527
Epoch 0, Step 160, Loss = 2.828334093093872
Epoch 0, Step 170, Loss = 3.5482983589172363
Epoch 0, Step 180, Loss = 3.1731836795806885
Epoch 0, Step 190, Loss = 2.6266489028930664
Epoch 0, Step 200, Loss = 3.124974250793457
Epoch 0, Step 210, Loss = 3.5272293090820312
Epoch 0, Step 220, Loss = 2.50712

In [None]:
# TODO: use ROUGE or some metric to evaluate predictions
# TODO: check loss function
# TODO: do we preprocess/clean input?
# TODO: validation being used as test, write validation
# TODO: plot loss graphs