## Installing the required packages

In [None]:
!pip install transformers --quiet
!pip install sentencepiece --quiet
!pip install datasets --quiet
!pip install sacrebleu --quiet

## Importing required libraries

In [42]:
import pandas as pd
import os
import torch
import time
import warnings
warnings.filterwarnings('ignore')

from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import Adafactor, AdamW
from datasets import load_metric
from sacrebleu.metrics import BLEU
from IPython.display import HTML, display

In [None]:
# Use this when working on Google Colab
#from google.colab import drive
#drive.mount('/content/drive')

## Load the Pre-trained model T5 and the tokenizer

In [47]:
# Pick the compute device: the first CUDA GPU when one is available, else the CPU
use_gpu = torch.cuda.is_available()
dev = torch.device("cuda:0" if use_gpu else "cpu")
print("Running on the GPU" if use_gpu else "Running on the CPU")

# Load the pre-trained T5 small checkpoint and its tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model_t5_small = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)

# Alternative: the larger T5 base checkpoint
#tokenizer = T5Tokenizer.from_pretrained('t5-base')
#model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)

# Place the model on the selected device (the returned module is shown as the cell output)
model_t5_small.to(dev)

Running on the CPU


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

## Preprocessing

In [34]:
# Web NLG and E2E are already available as csv files. For Abstract Meaning Representation (AMR)
# the official web page only provides a text file, so this cell extracts the meaning
# representations and the target sentences from that file and saves them as csv.

with open('data/amr/amr-bank-struct-v3.0.txt') as file:
    lines = [line.rstrip() for line in file]

# Graph lines: every non-empty line that does not contain a '#'
meaning_representations_not_flattened = [line for line in lines if line and "#" not in line]
# Target sentences: everything after the first 8 characters of the lines containing "# ::snt"
# (sentences that end up empty are dropped)
target_sentences = [line[8:] for line in lines if "# ::snt" in line and line[8:]]

# Flatten every multi-line graph into a single string without spaces.
# A graph starts on a line whose first character is "(" and runs until the next such line.
meaning_representations = []
number_of_graph_lines = len(meaning_representations_not_flattened)
for i in range(number_of_graph_lines):
    if meaning_representations_not_flattened[i][0] != "(":
        continue
    j = i + 1
    # The bounds check comes first so that a graph starting on the very last line
    # (a single-line graph at the end of the file) does not raise an IndexError.
    while j < number_of_graph_lines and meaning_representations_not_flattened[j][0] != "(":
        j += 1
    meaning_representations.append(''.join(meaning_representations_not_flattened[i:j]).replace(' ', ''))

# For Web NLG and E2E the train/test split is roughly 90/10, so the same split is used for AMR:
# the first AMR_TRAIN_SIZE pairs are written to the train file, the rest to the test file.
AMR_TRAIN_SIZE = 1404
amr_pairs = list(zip(meaning_representations, target_sentences))
pd.DataFrame(amr_pairs[:AMR_TRAIN_SIZE], columns=['input_text','target_text']).to_csv('data/amr/abstract_meaning_representation_train.csv', index=False)
pd.DataFrame(amr_pairs[AMR_TRAIN_SIZE:], columns=['input_text','target_text']).to_csv('data/amr/abstract_meaning_representation_test.csv', index=False)

In [38]:
# Web NLG 2020 challenge: train and test split
train_data_web_nlg, test_data_web_nlg = (
    pd.read_csv(csv_path)
    for csv_path in ('data/web_nlg/train/webNLG2020_train.csv', 'data/web_nlg/test/webNLG2020_test.csv')
)

# Meaning Representation E2E challenge: train and test split
train_data_e2e, test_data_e2e = (
    pd.read_csv(csv_path)
    for csv_path in ('data/e2e/train/trainset.csv', 'data/e2e/test/testset_w_refs.csv')
)

# Abstract Meaning Representation (AMR) challenge: the csv files written by the preprocessing cell
train_data_amr, test_data_amr = (
    pd.read_csv(csv_path)
    for csv_path in ('data/amr/abstract_meaning_representation_train.csv', 'data/amr/abstract_meaning_representation_test.csv')
)

In [39]:
# Fixed seed so that the shuffles below produce the same row order on every run.
SHUFFLE_SEED = 42

# NOTE: every frame is overwritten with its trimmed version, so this cell is not idempotent.
# Re-running it without re-running the loading cell trims the E2E and AMR frames again.

# Trimming off the trailing datapoints from Web NLG so that a batch would not leave any remainder, then shuffling.
train_data_web_nlg = train_data_web_nlg.iloc[:35200,:].sample(frac=1, random_state=SHUFFLE_SEED)
test_data_web_nlg = test_data_web_nlg.iloc[:1720,:].sample(frac=1, random_state=SHUFFLE_SEED)

# Trimming off the last few datapoints from E2E so that a batch would not leave any remainder, then shuffling.
train_data_e2e = train_data_e2e.iloc[:len(train_data_e2e)-1,:].sample(frac=1, random_state=SHUFFLE_SEED)
test_data_e2e = test_data_e2e.iloc[:len(test_data_e2e)-5,:].sample(frac=1, random_state=SHUFFLE_SEED)

# Trimming off the last few datapoints from AMR so that a batch would not leave any remainder, then shuffling.
train_data_amr = train_data_amr.iloc[:len(train_data_amr)-4,:].sample(frac=1, random_state=SHUFFLE_SEED)
test_data_amr = test_data_amr.iloc[:len(test_data_amr)-6,:].sample(frac=1, random_state=SHUFFLE_SEED)

In [40]:
# Batch size shared by all datasets; the counts below are whole batches only
batch_size = 8

# Web NLG
number_of_batches_train_web_nlg = len(train_data_web_nlg) // batch_size
number_of_batches_test_web_nlg = len(test_data_web_nlg) // batch_size

# E2E
number_of_batches_train_e2e = len(train_data_e2e) // batch_size
number_of_batches_test_e2e = len(test_data_e2e) // batch_size

# AMR
number_of_batches_train_amr = len(train_data_amr) // batch_size
number_of_batches_test_amr = len(test_data_amr) // batch_size

# Number of training epochs
epochs = 1

# Report the number of batches per dataset and split
print('--- Number of train batches Web NLG: {} --- '.format(number_of_batches_train_web_nlg))
print('--- Number of test  batches Web NLG: {}  --- \n'.format(number_of_batches_test_web_nlg))

print('--- Number of train batches E2E : {} --- '.format(number_of_batches_train_e2e))
print('--- Number of test  batches E2E : {}  --- \n'.format(number_of_batches_test_e2e))

print('--- Number of train batches AMR : {} --- '.format(number_of_batches_train_amr))
print('--- Number of test  batches AMR : {}  --- '.format(number_of_batches_test_amr))

--- Number of train batches Web NLG: 4400 --- 
--- Number of test  batches Web NLG: 215  --- 

--- Number of train batches E2E : 5257 --- 
--- Number of test  batches E2E : 586  --- 

--- Number of train batches AMR : 175 --- 
--- Number of test  batches AMR : 19  --- 


In [45]:
def create_list_of_batches(batch_size, num_batches, data, challenge_name):
    """Tokenize `data` into `num_batches` consecutive batches of `batch_size` rows.

    Every input text gets a task prefix ('WebNLG: ', 'E2E: ' or 'AMR: ') and both
    inputs and targets get a trailing '</s>' before they are tokenized with the
    notebook-level `tokenizer`. The resulting id tensors are moved to the
    notebook-level device `dev`.

    Args:
        batch_size: Number of rows per batch.
        num_batches: Number of batches to build, taken from the start of `data`.
        data: DataFrame with the string columns 'input_text' and 'target_text'.
        challenge_name: 'WebNLG' or 'E2E' select the matching prefix; any other
            value falls back to the 'AMR: ' prefix.

    Returns:
        A tuple `(inputs, labels)` of two lists holding one token-id tensor per batch.
    """
    # Resolve the task prefix once instead of once per row
    prefix = {'WebNLG': 'WebNLG: ', 'E2E': 'E2E: '}.get(challenge_name, 'AMR: ')

    inputs = []
    labels = []
    for batch_index in range(num_batches):
        start = batch_index * batch_size
        rows = data[start:start + batch_size]

        input_texts = [prefix + text + '</s>' for text in rows['input_text']]
        label_texts = [text + '</s>' for text in rows['target_text']]

        # truncation=True is needed for max_length to take effect: with padding=True alone the
        # batch is only padded to its longest sequence and nothing is cut off at 400 tokens.
        # NOTE(review): only "input_ids" is kept, so the attention mask is discarded and the
        # padding ids stay in the labels - confirm whether the padding should be masked for the loss.
        input_ids = tokenizer.batch_encode_plus(input_texts, padding=True, truncation=True, max_length=400, return_tensors='pt')["input_ids"]
        label_ids = tokenizer.batch_encode_plus(label_texts, padding=True, truncation=True, max_length=400, return_tensors='pt')["input_ids"]

        inputs.append(input_ids.to(dev))
        labels.append(label_ids.to(dev))
    return inputs, labels

# Web NLG: train and test batches
inputs_train_web_nlg, labels_train_web_nlg = create_list_of_batches(
    batch_size, number_of_batches_train_web_nlg, train_data_web_nlg, 'WebNLG'
)
inputs_test_web_nlg, labels_test_web_nlg = create_list_of_batches(
    batch_size, number_of_batches_test_web_nlg, test_data_web_nlg, 'WebNLG'
)

# E2E: train and test batches
inputs_train_e2e, labels_train_e2e = create_list_of_batches(
    batch_size, number_of_batches_train_e2e, train_data_e2e, 'E2E'
)
inputs_test_e2e, labels_test_e2e = create_list_of_batches(
    batch_size, number_of_batches_test_e2e, test_data_e2e, 'E2E'
)

# AMR: train and test batches
inputs_train_amr, labels_train_amr = create_list_of_batches(
    batch_size, number_of_batches_train_amr, train_data_amr, 'AMR'
)
inputs_test_amr, labels_test_amr = create_list_of_batches(
    batch_size, number_of_batches_test_amr, test_data_amr, 'AMR'
)

## Set the Optimizer with Parameter values suggested for T5

In [48]:
# Adafactor hyper-parameters, gathered in one place: an explicit learning rate is
# passed and relative_step, scale_parameter and warmup_init are all switched off.
adafactor_settings = dict(
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)

# Optimizer over all parameters of the T5 small model
optimizer = Adafactor(model_t5_small.parameters(), **adafactor_settings)

## Training Routine

In [49]:
def progress(loss,value, max=100):
    """Build an HTML snippet showing the batch loss and a progress bar.

    Args:
        loss: Value rendered after the "Batch loss :" label.
        value: Current position of the progress bar.
        max: Value at which the progress bar is full (defaults to 100).

    Returns:
        An IPython `HTML` object that can be passed to `display` / `update`.
    """
    # Attributes are separated by whitespace only; the comma that used to follow
    # the max attribute made the markup malformed.
    return HTML(""" Batch loss :{loss}
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(loss=loss,value=value, max=max))

def trainer(model, num_batches, inputs, labels, num_epochs=None):
    """Fine-tune `model` on pre-built batches and return it.

    Uses the notebook-level `optimizer` for the parameter updates and shows a
    progress bar with the current batch loss for every epoch.

    Args:
        model: Model that is called as `model(input_ids=..., labels=...)` and whose
            output exposes a `loss`.
        num_batches: Number of batches to run per epoch.
        inputs: List of input id tensors, one per batch.
        labels: List of label id tensors, one per batch.
        num_epochs: Number of passes over the batches. When None, the notebook-level
            `epochs` setting is used.

    Returns:
        The trained model (the same object that was passed in).
    """
    # The epoch count comes from the configuration, not from the number of batches
    if num_epochs is None:
        num_epochs = epochs

    # Set the model in training mode
    model.train()

    for epoch in range(1, num_epochs + 1):
        print('Running epoch: {}'.format(epoch))
        running_loss = 0

        # Start with an empty bar; no loss is known before the first batch
        out = display(progress('n/a', 0, num_batches), display_id=True)
        for i in range(num_batches):

            # Clear out the gradients of the previous step
            optimizer.zero_grad()

            # Forward propagation
            outputs = model(input_ids=inputs[i], labels=labels[i])
            loss = outputs.loss
            loss_num = loss.item()
            running_loss += loss_num

            # i + 1 batches are done, so the bar is full after the last batch
            out.update(progress(loss_num, i + 1, num_batches))

            # Calculate the gradients
            loss.backward()

            # Update the parameters
            optimizer.step()

        # Average loss per batch; skipped for zero batches to avoid a division by zero
        if num_batches:
            running_loss = running_loss / int(num_batches)
        print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))
    return model

# Fine-tune the T5 small model on the Web NLG training batches
model_t5_small = trainer(
    model_t5_small,
    number_of_batches_train_web_nlg,
    inputs_train_web_nlg,
    labels_train_web_nlg,
)

Running epoch: 1


KeyboardInterrupt: 