# Model Training
* This notebook will be used for conducting training of the model. Things may not mesh 100% here yet

In [1]:
import torch
from torchtext.data.utils import get_tokenizer
import sys
import time

from torch.utils.tensorboard import SummaryWriter

import pandas as pd
import numpy as np

from transformer import Transformer # our "custom" transformer implementation

sys.path.append("datasets/alpaca")
from alpaca_dataset import Alpaca_Dataset # custom dataset class

In [2]:
# use nvidia-smi to find open GPUs
device = torch.device(7)

### Get Data
* Vocab and datasets prepared in [data_preparation.ipynb](data_preparation.ipynb)

In [4]:
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
vocab = torch.load('vocabs/alpaca/vocab.pth')
vocab_token_dict = vocab.get_stoi()
pad_token = vocab_token_dict['<pad>']
unknown_token = vocab_token_dict['<unk>']
text_pipeline = lambda x: vocab(tokenizer(x))

# train_dataset = torch.load("datasets/alpaca/train_dataset.pth")
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)

# test_dataset = torch.load("datasets/alpaca/test_dataset.pth")
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=True)

dataset = torch.load("datasets/alpaca/train_dataset.pth")
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

### Masking Function
* This function ensures that every word in the decoder can only be influenced by tokens from the encoder and tokens that come before it in the expected target

In [5]:
def create_mask(source, target):
    source_mask = ((source != pad_token) & (source != unknown_token)).unsqueeze(1)
    
    target_mask = ((target != pad_token) & (target != unknown_token)).unsqueeze(1)
    size = target.size(1) # get seq_len for matrix
    nopeak_mask = np.triu(np.ones([1, size, size]), k=1).astype('uint8')
    nopeak_mask = torch.autograd.Variable(torch.from_numpy(nopeak_mask) == 0)
    target_mask = target_mask & nopeak_mask
    
    return source_mask, target_mask

### Model Parameters

In [6]:
d_model = 512
heads = 8
N = 6
src_vocab = len(vocab)
model = Transformer(len(vocab), len(vocab), d_model, N, heads).to(device)
offset = 440
model.load_state_dict(torch.load(f"models/alpaca_train_{offset}_epoch.pt"))
# for p in model.parameters():
#     if p.dim() > 1:
#         torch.nn.init.xavier_uniform_(p)

# this code is very important! It initialises the parameters with a
# range of values that stops the signal fading or getting too big.
# See https://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization for a mathematical explanation.
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

AssertionError: Torch not compiled with CUDA enabled

### Model Training
* This section is where the majority of current work lies. This training block was designed for a translation task, but we are looking to do next-word prediction. The source and target likely need to be something where the source is the text sequence so far and the target is the next word

In [7]:
def train_model(epochs, print_every=100):
    
    train_writer = SummaryWriter(log_dir="runs/exp3")
    
    model.train()
    
    start = time.time()
    temp = start
    
    total_loss = 0
    
    for epoch in range(epochs):
            
        for idx, (question, answer) in enumerate(loader): 
            src = question
            trg = answer

            # the answer we input has all words except
            # the last, as it is using each word to predict the next
            trg_input = trg[:, :-1]

            # the words we are trying to predict
            targets = trg[:, 1:].contiguous().view(-1).to(device)

            # create function to make masks using mask code above
            src_mask, trg_mask = create_mask(src, trg_input)

            src, trg_input, src_mask, trg_mask = src.to(device), trg_input.to(device), src_mask.to(device), trg_mask.to(device)

            preds = model(src, trg_input, src_mask, trg_mask)

            optim.zero_grad()

            loss = torch.nn.functional.cross_entropy(preds.view(-1, preds.size(-1)), targets, ignore_index=pad_token)
            loss.backward()
            optim.step()

            total_loss += loss.item()
            if (idx + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                train_writer.add_scalar('Loss/train', loss_avg, (epoch+(offset-400))*len(loader) + idx)
                print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters" % ((time.time() - start) // 60,
                epoch + 1, idx + 1, loss_avg, time.time() - temp, print_every))
                total_loss = 0
                temp = time.time()
train_model(20)

NameError: name 'model' is not defined

In [None]:
# torch.save(model.state_dict(), "models/alpaca_train_460_epoch.pt")