## Installing the required packages

In [None]:
!pip install transformers --quiet
!pip install sentencepiece --quiet
!pip install datasets --quiet
!pip install sacrebleu --quiet

## Importing required libraries

In [1]:
import pandas as pd
import os
import torch
import time
import warnings
warnings.filterwarnings('ignore')

from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import Adafactor, AdamW
from datasets import load_dataset, load_metric
from sacrebleu.metrics import BLEU
from IPython.display import HTML, display

In [None]:
# Use this when working on Google Colab
#from google.colab import drive
#drive.mount('/content/drive')

## Load the Pre-trained model T5 and the tokenizer

In [19]:
# Pick the compute device: the first CUDA GPU when one is visible, else the CPU
use_gpu = torch.cuda.is_available()
dev = torch.device("cuda:0" if use_gpu else "cpu")
print("Running on the GPU" if use_gpu else "Running on the CPU")

# Checkpoint shared by the tokenizer and the model
# (swap in 't5-base' here to instantiate the T5 base model instead)
checkpoint = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint, return_dict=True)

# Place the model on the selected device (GPU/CPU)
model.to(dev)

Running on the CPU


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

## Preprocessing

In [6]:
# Fetch the WebNLG 2017 challenge dataset through the `datasets` library
dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')

# Read the train and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/web_nlg/train/webNLG2020_train.csv',
        '../data/web_nlg/test/webNLG2020_test.csv',
    )
)

Reusing dataset web_nlg (/Users/furkansimsek/.cache/huggingface/datasets/web_nlg/webnlg_challenge_2017/0.0.0/28ffb892f7f42450dd9558684aa43bcaf44b1b3bf0d77cb8d73534646af88dda)


  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
# Show the original triple set of one training example, then the example count
train_triple_sets = dataset['train']['original_triple_sets']
print(train_triple_sets[6000]['otriple_set'])
print(len(train_triple_sets))

[['Adisham_Hall | location | Sri_Lanka', 'Adisham_Hall | architecturalStyle | Tudor_Revival_architecture', 'Adisham_Hall | buildingEndDate | "1931"', 'Adisham_Hall | buildingStartDate | "1927"', 'Adisham_Hall | address | "St. Benedict\'s Monastery, Adisham, Haputhale, Sri Lanka"@en']]
6940


In [8]:
# Keep only the leading rows so that each split divides into batches without
# a remainder (35200 and 1720 are both multiples of the batch size of 8).
train_data = train_data.head(35200)
test_data = test_data.head(1720)

In [9]:
# Shuffle the rows (frac=1 draws every row exactly once). The fixed
# random_state makes the shuffle, and therefore the composition of every
# batch, reproducible across kernel restarts.
train_data = train_data.sample(frac=1, random_state=42)
test_data = test_data.sample(frac=1, random_state=42)

In [18]:
# Batch size and number of training epochs
batch_size = 8
epochs = 1

# Floor division: only complete batches are counted
number_of_batches_train = len(train_data) // batch_size
number_of_batches_test = len(test_data) // batch_size

print(f'--- Number of train batches: --- {number_of_batches_train}')
print(f'--- Number of test batches: --- {number_of_batches_test}')

--- Number of train batches: --- 4400
--- Number of test batches: --- 215


In [22]:
# Create the lists of tokenized batches for the train inputs and labels
inputs_train = []
labels_train = []

for i in range(number_of_batches_train):
    # Rows belonging to the i-th batch (positional slice of the frame)
    batch_rows = train_data[i*batch_size:(i+1)*batch_size]

    # Prefix every input with the task name; terminate both sides with '</s>'.
    # Plain local names are used so the builtin `input` is not shadowed.
    input_texts = ['WebNLG: ' + text + '</s>' for text in batch_rows['input_text']]
    label_texts = [text + '</s>' for text in batch_rows['target_text']]

    # truncation=True is needed for max_length to take effect; without it the
    # tokenizer ignores max_length when padding to the longest sequence.
    input_batch = tokenizer.batch_encode_plus(
        input_texts, padding=True, truncation=True, max_length=400, return_tensors='pt'
    )["input_ids"]
    label_batch = tokenizer.batch_encode_plus(
        label_texts, padding=True, truncation=True, max_length=400, return_tensors='pt'
    )["input_ids"]

    # Padding positions in the labels must not contribute to the loss:
    # -100 is the index ignored by the model's cross-entropy loss.
    label_batch[label_batch == tokenizer.pad_token_id] = -100

    # Move the tensors to the selected device before storing them
    inputs_train.append(input_batch.to(dev))
    labels_train.append(label_batch.to(dev))

4400


In [23]:
# Create the lists of tokenized batches for the test inputs and labels
inputs_test = []
labels_test = []

for i in range(number_of_batches_test):
    # Rows belonging to the i-th batch (positional slice of the frame)
    batch_rows = test_data[i*batch_size:(i+1)*batch_size]

    # Prefix every input with the task name; terminate both sides with '</s>'.
    # Plain local names are used so the builtin `input` is not shadowed.
    input_texts = ['WebNLG: ' + text + '</s>' for text in batch_rows['input_text']]
    label_texts = [text + '</s>' for text in batch_rows['target_text']]

    # truncation=True is needed for max_length to take effect; without it the
    # tokenizer ignores max_length when padding to the longest sequence.
    input_batch = tokenizer.batch_encode_plus(
        input_texts, padding=True, truncation=True, max_length=400, return_tensors='pt'
    )["input_ids"]
    label_batch = tokenizer.batch_encode_plus(
        label_texts, padding=True, truncation=True, max_length=400, return_tensors='pt'
    )["input_ids"]

    # Move the tensors to the selected device before storing them
    inputs_test.append(input_batch.to(dev))
    labels_test.append(label_batch.to(dev))

215


## Set the Optimizer with Parameter values suggested for T5

In [25]:
# Adafactor settings: a fixed learning rate with relative stepping, parameter
# scaling and warm-up initialisation all switched off.
adafactor_kwargs = dict(
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)
optimizer = Adafactor(model.parameters(), **adafactor_kwargs)

## Training Routine

In [27]:
def progress(loss,value, max=100):
    """Build an HTML progress bar labelled with the current batch loss.

    Args:
        loss: Value shown after the "Batch loss :" label.
        value: Current position of the progress bar.
        max: Position at which the bar is full. The name shadows the builtin
            but is kept so existing keyword callers continue to work.

    Returns:
        An ``HTML`` object wrapping a ``<progress>`` element.
    """
    # HTML attributes are separated by whitespace only, so no comma may
    # follow the max attribute.
    return HTML(""" Batch loss :{loss}
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(loss=loss,value=value, max=max))

# Set the module in training mode
model.train()

# Loss of every 10th batch, kept for later inspection
loss_per_10_steps=[]

# Iterate over the configured number of epochs (not over the batch count)
for epoch in range(1,epochs+1):
  print('Running epoch: {}'.format(epoch))
  running_loss=0

  # Initial bar: placeholder loss of 1, position 0, full at number_of_batches_train+1
  out = display(progress(1, 0, number_of_batches_train+1), display_id=True)
  for i in range(number_of_batches_train):
    input_ids = inputs_train[i]
    # Mask the padded positions so the encoder does not attend to them
    attention_mask = (input_ids != tokenizer.pad_token_id).long()

    # Clear out the gradients accumulated in the previous step
    optimizer.zero_grad()

    # Forward propagation; the model computes the loss from the labels
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_train[i])
    loss = outputs.loss
    loss_num=loss.item()
    running_loss+=loss_num
    if i%10 == 0:
      loss_per_10_steps.append(loss_num)
    out.update(progress(loss_num,i, number_of_batches_train+1))

    # Calculate the gradients
    loss.backward()

    # Update the parameters
    optimizer.step()

  # Mean batch loss over the epoch
  running_loss=running_loss/int(number_of_batches_train)
  print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))

Running epoch: 1


KeyboardInterrupt: 