This notebook fine-tunes a pretrained T5 model (`t5-base`) on the English WebNLG v3.0 training set so that it generates text from sets of triples.

In [1]:
# !conda install transformers
# !conda install sentencepiece

In [2]:
# Misc 
import urllib.request
import zipfile
import glob
import os
import shutil
import math

# Data Science
import pandas as pd
import xml.etree.ElementTree as ET
import torch
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration, Adafactor

# Visualization 
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import HTML, display


# admin
sns.set_style('darkgrid')


In [3]:
#################################
# Helper Functions 
#################################

def download_and_extract_dataset(url, save_zip_as, extract_to):
    """Fetch a zip archive and unpack it.

    Args:
        url: Location of the zip archive to retrieve.
        save_zip_as: Path the retrieved archive is written to.
        extract_to: Directory that receives every member of the archive.
    """
    # Step 1: store the remote archive on disk.
    urllib.request.urlretrieve(url, filename=save_zip_as)

    # Step 2: unpack everything; the context manager closes the archive.
    with zipfile.ZipFile(save_zip_as, mode='r') as archive:
        archive.extractall(path=extract_to)

def process_dataset(files):
    """Parse XML dataset files into a ``{triples: [texts]}`` mapping.

    Each file is walked three levels deep (root -> child -> grandchild) and
    every grandchild element is treated as one dataset entry. For an entry,
    the text of each direct child is collected as a candidate sentence and
    the text of each of that child's own children is collected as a triple.

    Args:
        files: Iterable of paths to XML files. Every path must contain a
            ``<N>triples`` component (e.g. ``.../3triples/Airport.xml``);
            ``N`` is used as the number of triples per entry in that file.

    Returns:
        dict mapping the last ``N`` collected triple strings of an entry,
        joined with ``' && '``, to the list of that entry's non-blank texts.
        If two entries produce the same key, the later one replaces the
        earlier one.

    Raises:
        ValueError: If a path contains no ``<N>triples`` component.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # '\d+' also reads multi-digit counts in full instead of one digit.
    triple_re = re.compile(r'(\d+)triples')
    data_dct = {}
    for file in files:
        match = triple_re.search(str(file))
        if match is None:
            raise ValueError(
                f"Cannot determine the triple count: no '<N>triples' component in path {file!r}"
            )
        triples_num = int(match.group(1))

        root = ET.parse(file).getroot()
        for sub_root in root:
            for ss_root in sub_root:
                structured_master = []
                unstructured = []
                for entry in ss_root:
                    # .text is None for elements without leading text
                    # (e.g. compact, non-pretty-printed XML); skip those and
                    # whitespace-only texts.
                    text = entry.text
                    if text is not None and text.strip() != '':
                        unstructured.append(text)
                    structured_master.extend(triple.text for triple in entry)
                # Keep only the last N triples collected for this entry.
                structured_master = structured_master[-triples_num:]
                structured_master_str = ' && '.join(structured_master)
                data_dct[structured_master_str] = unstructured
    return data_dct

def create_dataframe(data_dct):
    """Flatten a ``{input: [targets]}`` mapping into a training DataFrame.

    Every (input, target) pair becomes one row with the columns ``prefix``
    (always ``'webNLG'``), ``input_text`` and ``target_text``. Rows follow
    the mapping's iteration order, and within one key the order of its list.
    """
    # One (input, target) tuple per output row.
    pairs = [
        (source, target)
        for source, targets in data_dct.items()
        for target in targets
    ]
    columns = {
        "prefix": ['webNLG'] * len(pairs),
        "input_text": [source for source, _ in pairs],
        "target_text": [target for _, target in pairs],
    }
    return pd.DataFrame(columns)

def progress(loss, value, max=100):
    """Build the HTML snippet for a batch-loss label plus a progress bar.

    Args:
        loss: Value shown after the ``Batch loss :`` label.
        value: Current position of the ``<progress>`` element.
        max: Upper bound of the ``<progress>`` element. The name shadows the
            builtin inside this function; it is kept because callers may
            pass it by keyword.

    Returns:
        An ``IPython.display.HTML`` object wrapping the markup.
    """
    # The attributes are separated by whitespace only; a comma after the
    # ``max`` attribute would be parsed as a bogus extra attribute.
    return HTML(""" Batch loss :{loss}
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(loss=loss, value=value, max=max))

def train_model(train_df, model, tokenizer, optimizer, dev, num_of_epochs=8, batch_size=8):
    """Fine-tune ``model`` on the rows of ``train_df``.

    Each epoch walks ``train_df`` in its current row order in slices of
    ``batch_size`` rows (the last slice may be shorter), builds the inputs as
    ``'WebNLG: ' + input_text`` and the labels from ``target_text``, and takes
    one optimizer step per slice. A notebook progress bar is updated after
    every batch and the mean batch loss is printed after every epoch.

    Args:
        train_df: DataFrame with string columns ``input_text`` and
            ``target_text``.
        model: Called as ``model(input_ids=..., attention_mask=...,
            labels=...)``; the result must expose a scalar ``loss`` tensor.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors and exposing ``pad_token_id``. It is
            expected to append the end-of-sequence token itself, as the T5
            tokenizer does.
        optimizer: Optimizer already bound to the model's parameters.
        dev: Device the batch tensors are moved to.
        num_of_epochs: Number of passes over ``train_df``.
        batch_size: Maximum number of rows per optimizer step.

    Returns:
        list of float: the loss of every 10th batch (batch 0, 10, 20, ... of
        each epoch), in training order.

    Raises:
        ValueError: If ``train_df`` has no rows or ``batch_size`` is < 1.
    """
    num_rows = len(train_df)
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')
    if num_rows == 0:
        raise ValueError('train_df is empty; nothing to train on')

    # Ceiling division: the trailing partial batch is trained on as well,
    # and a frame smaller than batch_size still yields one batch.
    num_of_batches = (num_rows + batch_size - 1) // batch_size
    model.train()
    loss_per_10_steps = []

    for epoch in range(1, num_of_epochs + 1):
        print(f'Running epoch: {epoch}')
        running_loss = 0.0
        out = display(progress('n/a', 0, num_of_batches), display_id=True)

        for i in range(num_of_batches):
            batch_df = train_df.iloc[i * batch_size:(i + 1) * batch_size]
            # No manual '</s>': the tokenizer appends the EOS token itself.
            source_texts = ['WebNLG: ' + text for text in batch_df['input_text']]
            target_texts = list(batch_df['target_text'])

            # truncation=True is required for max_length to take effect.
            source_enc = tokenizer(source_texts, padding=True, truncation=True,
                                   max_length=400, return_tensors='pt')
            target_enc = tokenizer(target_texts, padding=True, truncation=True,
                                   max_length=400, return_tensors='pt')

            input_ids = source_enc['input_ids'].to(dev)
            # Lets the encoder ignore the padding added to shorter inputs.
            attention_mask = source_enc['attention_mask'].to(dev)
            labels = target_enc['input_ids'].to(dev)
            # -100 is ignored by the loss, so padding does not contribute.
            labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            loss_num = loss.item()
            running_loss += loss_num
            if i % 10 == 0:
                loss_per_10_steps.append(loss_num)
            # display() yields no handle outside an IPython session.
            if out is not None:
                out.update(progress(loss_num, i + 1, num_of_batches))

        running_loss = running_loss / num_of_batches
        print(f'Epoch: {epoch} , Running loss: {running_loss}')

    return loss_per_10_steps

def save_model(model, tokenizer, output_dir="trained_model"):
    """Write the model, then the tokenizer, to ``output_dir``.

    Both objects are persisted through their own ``save_pretrained`` method.
    """
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)


In [4]:
#################################
# Data Prep
#################################

# Only the first MAX_TRAIN_ROWS rows (in file-processing order) are kept. #hack
MAX_TRAIN_ROWS = 35000
# Fixed seed so "Restart Kernel -> Run All" shuffles the rows identically.
SHUFFLE_SEED = 42

url = 'https://gitlab.com/shimorina/webnlg-dataset/-/archive/master/webnlg-dataset-master.zip?path=release_v3.0/en/train'
download_and_extract_dataset(url, 'web.zip', 'web')

# glob returns paths in filesystem-dependent order; sorting makes both the
# processing order and the MAX_TRAIN_ROWS cut-off below reproducible.
files = sorted(glob.glob("web/webnlg-dataset-master-release_v3.0-en-train/release_v3.0/en/train/**/*.xml", recursive=True))
if not files:
    # Fail here instead of silently building an empty training set.
    raise FileNotFoundError("No WebNLG XML files found under 'web/'; check the archive layout")

data_dct = process_dataset(files)
df = create_dataframe(data_dct)
df.to_csv('webNLG2020_train.csv')
train_df = pd.read_csv('webNLG2020_train.csv', index_col=[0])
train_df = train_df.iloc[:MAX_TRAIN_ROWS, :]
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED)


In [5]:
#################################
# Training
#################################

# Train on the first CUDA device when one is available, otherwise on the CPU.
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Tokenizer and model both start from the pretrained 't5-base' checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)
model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)
model.to(dev)

# Adafactor with an explicit, constant learning rate: relative-step sizing,
# parameter scaling and warm-up initialisation are all switched off.
adafactor_settings = dict(
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)
optimizer = Adafactor(model.parameters(), **adafactor_settings)

# Fine-tune, write the result to disk, then zip the output directory.
train_model(train_df, model, tokenizer, optimizer, dev)
save_model(model, tokenizer, "trained_model")
shutil.make_archive("trained_model", 'zip', root_dir="trained_model")



Running epoch: 1




KeyboardInterrupt: 