In [2]:
# Colab-only setup: mount Google Drive at /content/gdrive.
# NOTE(review): force_remount=True presumably re-mounts even when a mount is
# already present - confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Work from the repository's notebooks folder on Drive so that the relative
# import path added later (sys.path.append('..' + os.sep)) can find `src`.
%cd /content/gdrive/MyDrive/Github/Abstract-generator/bumbleBERT/notebooks

Mounted at /content/gdrive
/content/gdrive/MyDrive/Github/Abstract-generator/bumbleBERT/notebooks


In [3]:
# Install the third-party packages this notebook imports (directly or through
# `src`). Versions are not pinned; the captured output of this run shows
# feedparser 6.0.8, tokenizers 0.10.3 and transformers 4.8.2 being downloaded.
!pip install feedparser tokenizers transformers

Collecting feedparser
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/15bf6781a861bbc5dd801d467f26448fb322bfedcd30f2e62b148d104dfb/feedparser-6.0.8-py3-none-any.whl (81kB)
[K     |████████████████████████████████| 81kB 5.8MB/s 
[?25hCollecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 44.6MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 50.1MB/s 
[?25hCollecting sgmllib3k
  Downloading https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3

In [4]:
import os, torch, time, math, sys, re, csv
import numpy as np

sys.path.append('..' + os.sep )
from src import default

from src.data import download as dl, data_preprocessing as dpp, tokenization as tkn\
                        , custom_dataset as cd
from torch.utils.data import DataLoader
from src.model.transformer_hf import TransformerModel
from src.model.batching import CustomBatch
#from src.model.transformer import make_gpt_model # imports don't work

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Parameters

In [8]:
# --- data and tokenizer ------------------------------------------------------
nbrResults     = 10000  # number of data samples to download
download       = False  # whether to download (not implemented yet)
maxLen         = 100    # maximum sentence length
tknzerType     = 'BPE'  # type of tokenizing algorithm
trainTokenizer = True   # train a new tokenizer (True) or use one already trained
vocabSize      = None   # None lets the tokenizer choose the vocabulary size

# --- model -------------------------------------------------------------------
emsize  = 200  # embedding dimension
nhid    = 200  # feedforward dimension in torch.nn.TransformerEncoder
nlayers = 6    # number of torch.nn.TransformerEncoderLayer blocks
nhead   = 8    # number of heads in the multi-head attention
dropout = 0.2  # dropout value

# --- training ----------------------------------------------------------------
bsz    = 3   # batch size
epochs = 10  # number of epochs

### Download Dataset

In [9]:
# Ask the arXiv API helper for `nbrResults` samples. It returns a file name
# (a .csv in the captured run) that is later looked up under
# default.RAW_DATA_DIR.
filename = dl.arxiv_api( default.RAW_DATA_DIR, max_results=nbrResults )
print(f'>> Using {filename} for training <<')
# Drop the extension with splitext rather than slicing off four characters,
# so the stem stays correct if the helper ever returns a different extension.
fnameStrip = os.path.splitext(filename)[0]

>> Using arxiv_10000.csv for training <<


### Format Dataset

This uses a custom dataset class, which is an iterable and callable structure that returns a sample from our dataset. All of the preprocessing can be defined within this custom dataset.

In [10]:
# Build the dataset from the raw file; maxLen is forwarded to ArxivDataset.
rawDataPath = f'{default.RAW_DATA_DIR}{os.sep}{filename}'
dataset = cd.ArxivDataset(rawDataPath, maxLen)

### Training Tokenizer

Training of a custom tokenizer. Many options are possible here; check the tokenizer training functions to try out various strategies. If the tokenizer for the dataset has already been trained, there is no need to run this again.

In [11]:
# Honour the `trainTokenizer` switch from the parameters cell: train only when
# it is True, otherwise reuse the tokenizer file loaded further down.
# The return value is bound to `_` to keep it out of the cell output.
if trainTokenizer:
    _ = tkn.train_custom_tokenizer(tknzerType, dataset, fnameStrip
                                                , default.TOK_DIR
                                                , vocabSize
                                                , **default.special_token_lst)



### Loading Tokenizer and Splitting Datasets

For some reason, tokenizers from the HuggingFace `tokenizers` library are not callable right after training. This is confusing, but c'est la vie! Instead, we need to load the tokenizer from the file it was saved in, using the `PreTrainedTokenizerFast` class, which implements `__call__`. Once that's done, you can add this tokenizer as a transform to your dataset! Useful.

We also split the dataset here into training, testing and validation datasets.

In [12]:
# Tokenizer file: <TOK_DIR>/<dataset stem>_<tokenizer type>.json
tknzrFile = f'{default.TOK_DIR}{os.sep}{fnameStrip}_{tknzerType}.json'

# Reload the tokenizer from disk through the project helper: the reloaded
# object is callable (__call__), which the base Tokenizer class is not.
tknzr = tkn.load_tokenizer(tknzrFile, **default.special_token_lst)

# Take the vocabulary size from the tokenizer when none was requested.
# NOTE(review): vocab_size may not count tokens added after training - confirm
# it covers every id the tokenizer can emit.
if vocabSize is None:
    vocabSize = tknzr.vocab_size

# Every sample fetched from the dataset now goes through the tokenizer.
dataset.set_transform( tknzr )

# Train / test / validation split. TODO : make into a function
fracTrain, fracTest, fracVal = ( 0.7, 0.2, 0.1)
nSamples = len(dataset)
nTrain = np.floor(fracTrain*nSamples)
nTest = np.floor(fracTest*nSamples)
# Validation takes the remainder so the three sizes always sum to nSamples.
trainTestVal = [ nTrain, nTest, nSamples - ( nTrain + nTest ) ]

splitSizes = [int(x) for x in trainTestVal]
# Fixed seed so the split is reproducible between runs.
splitGenerator = torch.Generator().manual_seed(42)
trainDataset, testDataset, valDataset = torch.utils.data.random_split(
        dataset, splitSizes, generator=splitGenerator )

### Creating DataLoaders

Training is done on batches, so we need a way to extract groupings of the data in the appropriate format for our transformer model.
Note that for the transformers we are training, the dataloaders output both src (x[:-1]) and tgt (x[1:]).
The collation of batches varies between the transformer models we have. For HuggingFace it's ( maxLen x batch_size ) whereas I think that the Annotated Transformer has ( batch_size x maxLen ).

NOTE : Do not use the tokenizer before the training if you use num_workers>0!
FastTokenizer does not play nicely with forking if you use it before the forking of your data:
https://stackoverflow.com/questions/62691279/how-to-disable-tokenizers-parallelism-true-false-warning

In [13]:
# create dataloaders
# uses collate function to transform batch to correct dimensions
def collate_wrapper(batch):
    """Collate a list of dataset samples into a CustomBatch.

    Reads `maxLen` and `tknzr` from the notebook namespace. The padding value
    is the id that the tokenizer vocabulary assigns to "<pad>".
    """
    padId = tknzr.get_vocab()["<pad>"]
    return CustomBatch(batch, dim=0, maxLenModel=maxLen, padValue=padId)

# The training and validation loaders share exactly the same settings, so they
# are written once and unpacked into both constructors.
loaderOptions = dict( batch_size=bsz
                    , shuffle=True
                    , num_workers=2
                    , collate_fn=collate_wrapper
                    , pin_memory=True
                    )
trainDataLoader = DataLoader(trainDataset, **loaderOptions)
valDataLoader = DataLoader(valDataset, **loaderOptions)

### Training and Evaluation Functions

Training and evaluation are pretty straightforward.

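***Note***: `ppl` is the perplexity, i.e. `exp` of the mean per-token cross-entropy loss. It is only meaningful when the loss passed to `exp` is averaged per token, and it grows very quickly because it is exponential in the loss (a loss of 7 already gives a ppl of about 1100).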

In [14]:
# training function - same as in hugging face
def train( model, maxLen, dataLoader, nbrSamples, optimizer_, scheduler_
                , criterion_, device_, epoch_=None, log_interval=200 ):
    """Run one training epoch over `dataLoader`, updating `model` in place.

    Args:
        model: network to train. It must provide
            generate_square_subsequent_mask(size) and be callable as
            model(src, src_mask).
        maxLen: sequence length used to build the initial attention mask.
        dataLoader: iterable of batches exposing `.src` and `.tgt` tensors;
            len(dataLoader) is used in the progress log.
        nbrSamples: unused; kept so existing calls keep working.
        optimizer_: optimizer stepped once per batch.
        scheduler_: learning-rate scheduler; only read (get_last_lr) for the
            progress log, never stepped here.
        criterion_: loss called as criterion_(logits_2d, targets_1d).
        device_: device the batches and the mask are moved to.
        epoch_: epoch number shown in the log. When omitted, falls back to a
            module-level `epoch` variable if one exists, else 0.
        log_interval: print a progress line every this many batches; a falsy
            value disables logging.

    Returns:
        None. Progress is printed.
    """
    model.train() # Turn on the train mode
    if epoch_ is None:
        # Backward compatibility: older calls rely on the loop variable
        # `epoch` living in the notebook namespace.
        epoch_ = globals().get('epoch', 0)

    running_loss = 0.
    batches_since_log = 0
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(maxLen).to(device_)
    for i, batch in enumerate(dataLoader):
        src = (batch.src).to(device_); tgt = (batch.tgt).to(device_)

        # Compare against the mask currently held, not against maxLen:
        # after a shorter batch the mask must be rebuilt again for the next
        # full-length batch, otherwise its size no longer matches src.
        if src_mask.size(0) != src.size(0):
            src_mask = model.generate_square_subsequent_mask(src.size(0)).to(device_)

        optimizer_.zero_grad()
        output = model(src, src_mask)
        # Flatten to (tokens, classes). The class count is read from the
        # output itself instead of a notebook global.
        # NOTE(review): assumes the model's last output dimension is the
        # vocabulary size - confirm against TransformerModel.
        loss = criterion_(output.reshape(-1, output.size(-1)), tgt.reshape(-1))
        loss.backward()
        # Clip the gradient norm to 0.5 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer_.step()

        running_loss += loss.item()
        batches_since_log += 1
        if log_interval and i % log_interval == 0 and i > 0:
            # Average over the batches actually accumulated since the last
            # log line (the first window holds log_interval + 1 batches).
            cur_loss = running_loss / batches_since_log
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch_, i, len(dataLoader),
                            scheduler_.get_last_lr()[0],
                            elapsed * 1000 / batches_since_log,
                            cur_loss, math.exp(cur_loss)))
            running_loss = 0.
            batches_since_log = 0
            start_time = time.time()


# evaluation function outside of training - same as hugging face
def evaluate(eval_model, maxLen, dataLoader, nbrSamples, criterion_=None
                , device_=None):
    """Return the mean per-token loss of `eval_model` over `dataLoader`.

    Each batch loss is weighted by its number of target tokens and the sum is
    divided by the total token count, so exp() of the result is a perplexity
    on the same scale as the training log.

    Args:
        eval_model: network to evaluate. It must provide
            generate_square_subsequent_mask(size) and be callable as
            eval_model(src, src_mask).
        maxLen: sequence length used to build the initial attention mask.
        dataLoader: iterable of batches exposing `.src` and `.tgt` tensors.
        nbrSamples: unused; kept so existing calls keep working.
        criterion_: loss called as criterion_(logits_2d, targets_1d). It is
            assumed to return the mean over the target tokens, as
            torch.nn.CrossEntropyLoss() does by default. When omitted, a
            module-level `criterion` is used if present, otherwise a fresh
            torch.nn.CrossEntropyLoss().
        device_: device the batches and mask are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters).

    Returns:
        float: mean loss per target token, or NaN when `dataLoader` yields no
        tokens.
    """
    if criterion_ is None:
        # Backward compatibility with calls that rely on the notebook global.
        criterion_ = globals().get('criterion')
        if criterion_ is None:
            criterion_ = torch.nn.CrossEntropyLoss()
    if device_ is None:
        first_param = next(eval_model.parameters(), None)
        device_ = first_param.device if first_param is not None \
                                     else torch.device('cpu')

    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    total_tokens = 0
    # Masks are built by the model being evaluated, not a notebook global.
    src_mask = eval_model.generate_square_subsequent_mask(maxLen).to(device_)
    with torch.no_grad():
        for batch in dataLoader:
            src = (batch.src).to(device_); tgt = (batch.tgt).to(device_)
            # Rebuild whenever the held mask does not fit this batch; this
            # also restores the full-size mask after a shorter batch.
            if src_mask.size(0) != src.size(0):
                src_mask = eval_model.generate_square_subsequent_mask(
                                                    src.size(0)).to(device_)
            output = eval_model(src, src_mask)
            # NOTE(review): assumes the model's last output dimension is the
            # vocabulary size - confirm against TransformerModel.
            output_flat = output.reshape(-1, output.size(-1))
            tgt_flat = tgt.reshape(-1)
            n_tokens = tgt_flat.numel()
            # mean loss * token count = summed loss of this batch
            total_loss += n_tokens * criterion_(output_flat, tgt_flat).item()
            total_tokens += n_tokens
    if total_tokens == 0:
        return float('nan')
    return total_loss / total_tokens

### Selecting model

Here we choose which model we shall use for training. For now, I've selected the black box Transformer from HuggingFace because the collate_fn I've written gives the correct input size for it... however this can easily be changed!

In [15]:
# transformer from huggingface
model = TransformerModel(vocabSize, emsize, nhead, nhid, nlayers, dropout)
model = model.to(device)
# variant tried with 10 heads:
#model = TransformerModel(vocabSize, emsize, 10, nhid, nlayers, dropout).to(device)

# transformer from illustrated transformer (its import is disabled above)
#model = make_gpt_model(vocabSize, vocabSize, nlayers, emsize, nhid, nhead, dropout)

# Optimisation: cross-entropy loss, plain SGD, and a scheduler that multiplies
# the learning rate by 0.95 on every scheduler step.
lr = 5.0 # learning rate
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.95)

### Training

Training loop!

In [None]:
best_val_loss = float("inf")
# Copy of the weights that achieved the lowest validation loss so far.
# Binding `best_model = model` would only alias the live model, which keeps
# training, so the "best" model would always equal the last one.
best_state = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train( model, maxLen, trainDataLoader, len(trainDataset), optimizer
                , scheduler, criterion, device)
    val_loss = evaluate(model, maxLen, valDataLoader, len(valDataset))
    # valid ppl is exp(valid loss): it is only a real perplexity when
    # evaluate() returns a mean per-token loss. exp() raises OverflowError for
    # very large arguments, so report inf instead of aborting the run.
    try:
        val_ppl = math.exp(val_loss)
    except OverflowError:
        val_ppl = float("inf")
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, val_ppl))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() tensors share storage with the live parameters, so
        # clone them to get a snapshot that later epochs cannot overwrite.
        best_state = { name: tensor.detach().clone()
                       for name, tensor in model.state_dict().items() }

    scheduler.step()

# Put the best weights back into the model before saving; when no epoch
# improved on +inf (e.g. epochs == 0 or NaN losses) the model is left as is.
if best_state is not None:
    model.load_state_dict(best_state)
best_model = model

# save best model (two methods)
# approach 1: save model (class) entirely (uses pickle)
torch.save(best_model, default.MODEL_DIR + os.sep + f'{fnameStrip}_epoch{epochs}.pth')
# approach 2: save model weights
torch.save(best_model.state_dict(), default.MODEL_DIR + os.sep + f'{fnameStrip}_weights_epoch{epochs}.pth')

| epoch   1 |   200/ 2334 batches | lr 5.00 | ms/batch 23.49 | loss  8.64 | ppl  5636.96
| epoch   1 |   400/ 2334 batches | lr 5.00 | ms/batch 22.80 | loss  7.58 | ppl  1948.89
| epoch   1 |   600/ 2334 batches | lr 5.00 | ms/batch 21.65 | loss  7.30 | ppl  1486.86
| epoch   1 |   800/ 2334 batches | lr 5.00 | ms/batch 21.42 | loss  7.12 | ppl  1240.09
| epoch   1 |  1000/ 2334 batches | lr 5.00 | ms/batch 21.51 | loss  7.10 | ppl  1209.74
| epoch   1 |  1200/ 2334 batches | lr 5.00 | ms/batch 21.55 | loss  7.05 | ppl  1150.03
| epoch   1 |  1400/ 2334 batches | lr 5.00 | ms/batch 21.99 | loss  6.96 | ppl  1055.34
| epoch   1 |  1600/ 2334 batches | lr 5.00 | ms/batch 22.92 | loss  6.98 | ppl  1079.15
| epoch   1 |  1800/ 2334 batches | lr 5.00 | ms/batch 22.68 | loss  6.97 | ppl  1066.80
| epoch   1 |  2000/ 2334 batches | lr 5.00 | ms/batch 23.34 | loss  6.97 | ppl  1067.60
| epoch   1 |  2200/ 2334 batches | lr 5.00 | ms/batch 22.71 | loss  6.89 | ppl   978.92
---------------------