In [1]:
import os, torch, time, math, sys, re, csv
import numpy as np

sys.path.append('..' + os.sep )
from src import default

from src.data import download as dl, data_preprocessing as dpp, tokenization as tkn\
                        , custom_dataset as cd
from torch.utils.data import DataLoader
from src.model.transformer_hf import TransformerModel, PadCollate
#from src.model.transformer import make_gpt_model # imports don't work

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Parameters

In [2]:
maxLen     = 250 # maximum sentence length
bsz        = 3 # batch size
vocabSize  = None # None if you want to let tokenizer do its thing
emsize     = 200 # embedding dimension
nhid       = 200 # the dimension of the feedforward network model in torch.nn.TransformerEncoder
nlayers    = 2 # the number of torch.nn.TransformerEncoderLayer in torch.nn.TransformerEncoder
nhead      = 2 # the number of heads in the multiheadattention models
dropout    = 0.2 # the dropout value
tknzerType = 'BPE' # type of tokenizing algorithm
trainTokenizer = False # whether to train a new tokenizer or use one already trained
download   = False # haven't implemented yet, whether to download
nbrResults = 10 # number of data samples to download
epochs = 3 # The number of epochs

### Download Dataset

In [3]:
# download data
filename = dl.arxiv_api( default.RAW_DATA_DIR, max_results=nbrResults )
print(f'>> Using {filename} for training <<')
fnameStrip = filename[:-4] # remove .csv

>> Using arxiv_10.csv for training <<


### Format Dataset

Uses a custom dataset class, which is an iterable and callable structure that returns a sample from our dataset. Within this custom dataset, can determine all preprocessing.

In [4]:
# create dataset
dataset = cd.ArxivDataset(default.RAW_DATA_DIR + os.sep + filename, maxLen, device)

### Training Tokenizer

Training of a custom tokenizer. Many options possible here, check the tokenizer training functions to try out various strategies. If he tokenizer for the dataset has already been trained, no need to run this again.

In [5]:
_ = tkn.train_custom_tokenizer(tknzerType, dataset, fnameStrip
                                            , default.TOK_DIR
                                            , vocabSize
                                            , **default.special_token_lst)



### Loading Tokenizer and Splitting Datasets

For some reason, torch tokenizers are not callable as trained. This is confusing, but c'est la vie! Instead, need to load it from file it was saved in using the PreTrainedTokenizerFast class (__call__) implemented in here. Once that's done, you can add this tokenizer as a transform to your dataset! Useful.

We also split the dataset here into training, testing and validation datasets.

In [6]:
tknzrFile = default.TOK_DIR + os.sep + fnameStrip + '_' + tknzerType + '.json'

# load PreTrainedTokenizerFast, for __call__. __call__ not implemented in
# the base Tokenizer class... that sounds silly, but it is what it is
tknzr = tkn.load_tokenizer(tknzrFile, **default.special_token_lst)

if vocabSize is None: vocabSize = tknzr.vocab_size

# set tknzr as the transform
dataset.set_transform( tknzr )

# separate dataset into train, test valid TODO : make into a function
fracTrain, fracTest, fracVal = ( 0.7, 0.2, 0.1)
trainTestVal = [ np.floor(fracTrain*len(dataset))\
                    , np.floor(fracTest*len(dataset))\
                    , len(dataset) - ( np.floor( fracTrain*len(dataset) ) +
                    np.floor( fracTest*len(dataset) ) )
                    ]

trainDataset, testDataset, valDataset =\
        torch.utils.data.random_split(dataset, [int(x) for x in trainTestVal]
                                , generator=torch.Generator().manual_seed(42) )

AttributeError: 'Series' object has no attribute 'type'

### Creating DataLoaders

Training is done on batches, so we need a way to extract groupings of the data in the appropriate format for our transformer model.
Note that for transformers which we are training, dataloaders outputs both src (x[:-1] and tgt ([1:]).
The collation of batches for different transformer models we have vary. For HuggingFace it's ( maxLen x batch_size ) whereas I think that the Annotated Transformer has ( batch_size x maxLen ).

In [None]:
# create dataloaders
# uses collate function to transform batch to correct dimensions
trainDataLoader = DataLoader(trainDataset, batch_size=bsz, shuffle=True
                                        , collate_fn = PadCollate(dim=0,
                                            maxLen=maxLen,
                                            padValue=tknzr.get_vocab()["<pad>"])
                                        )
valDataLoader = DataLoader(valDataset, batch_size=bsz, shuffle=True
                                        , collate_fn = PadCollate(dim=0,
                                            maxLen=maxLen,
                                            padValue=tknzr.get_vocab()["<pad>"])
                                        )

### Training and Evaluation Functions

Training and evaluation are pretty straightforward.

***Note*** : I'm not too sure what ppl is... and why it is so large!

In [None]:
# training function - same as in hugging face
def train( model, maxLen, dataLoader, nbrSamples, optimizer_, scheduler_
                , criterion_, device_ ):

    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(maxLen).to(device_)
    for i, batch in enumerate(dataLoader):
        data = batch[0]; targets = batch[1]
        optimizer_.zero_grad()
        if data.size(0) != maxLen:
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)

        output = model(data, src_mask)
        loss = criterion_(output.view(-1, vocabSize), targets.reshape(-1))
        loss.backward()
        torch.torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer_.step()

        total_loss += loss.item()
        log_interval = 200
        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, i, nbrSamples // maxLen,
                            scheduler.get_last_lr()[0],
                            elapsed * 1000 / log_interval,
                            cur_loss, math.exp(cur_loss)))
            # 200 / 2 batches... wrong, why?
            total_loss = 0
            start_time = time.time()

# evaluation function outside of training - same as hugging face
def evaluate(eval_model, maxLen, dataLoader, nbrSamples):

    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    src_mask = model.generate_square_subsequent_mask(maxLen).to(device)
    with torch.no_grad():
        for batch in dataLoader:
            data = batch[0]; targets = batch[1]
            if data.size(0) != maxLen:
                src_mask = model.generate_square_subsequent_mask(
                                                    data.size(0)).to(device)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, vocabSize)
            total_loss += len(data) * criterion(output_flat
                                                , targets.reshape(-1)).item()
    return total_loss / (nbrSamples - 1)

### Selecting model

Here we choose which model we shall use for training. For now, I've selected the black box Transformer from HuggingFace because the collate_fn I've written gives the correct input size force it... however this can easily be changed! 

In [None]:
# transformer from huggingface
model = TransformerModel(vocabSize, emsize, nhead, nhid, nlayers, dropout).to(device)

# transformer from illustrated transformer
#model = make_gpt_model(vocabSize, vocabSize, nlayers, emsize, nhid, nhead, dropout)

criterion = torch.nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

### Training

Training loop!

In [None]:
best_val_loss = float("inf")
best_model = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train( model, maxLen, trainDataLoader, len(trainDataset), optimizer
                , scheduler, criterion, device)
    val_loss = evaluate(model, maxLen, valDataLoader, len(valDataset))
    print('-' * 89)
    print(val_loss)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
                                     # Why is math.exp so large????
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

    
# save best model (two methods)
# approach 1: save model (class) entirely (uses pickle)
torch.save(model, default.MODEL_DIR + os.sep + f'{fnameStrip}_epoch{epochs}.pth')
# approach 2: save model weights
torch.save(model.state_dict(), default.MODEL_DIR + os.sep + f'{fnameStrip}_weights_epoch{epochs}.pth')

| epoch   1 |   200/    2 batches | lr 3.68 | ms/batch 77.35 | loss  3.09 | ppl    22.02
-----------------------------------------------------------------------------------------
251.84395729411733
| end of epoch   1 | time: 18.72s | valid loss 251.84 | valid ppl 23683231137256318917393889330817365423104278032154741069164147614539982561908324822727113530917925435736064000.00
-----------------------------------------------------------------------------------------
| epoch   2 |   200/    2 batches | lr 3.49 | ms/batch 77.41 | loss  3.01 | ppl    20.20
-----------------------------------------------------------------------------------------
252.6922374734975
| end of epoch   2 | time: 18.79s | valid loss 252.69 | valid ppl 55315183129112076009966142380257140036137209539330504659969766362428610037529899011090335910894652491028234240.00
-----------------------------------------------------------------------------------------
| epoch   3 |   200/    2 batches | lr 3.32 | ms/batch 124.12 | l