In [69]:
IN_COLAB = 'google.colab' in str(get_ipython())
TRAIN = True
LOAD_MODEL = None
TRAIN_TOKENIZER = True

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    %cd /content/drive/MyDrive/Github/Abstract-generator/bumbleBERT/notebooks
    batch_size = 10 #32
    eval_batch_size = 10 #32
    epochs = 10  # The number of epochs
    !pip install feedparser tokenizers transformers
else:
    batch_size = 32       # 3
    val_batch_size = 32  # 3
    epochs = 10  # The number of epochs

print('Running in colab: ', IN_COLAB)
print('Training mode: ', TRAIN)
if LOAD_MODEL is not None:
    print('Using previous model: ', LOAD_MODEL)

Running in colab:  False
Training mode:  True


In [70]:
import os, torch, time, math, sys, re, csv
import numpy as np

sys.path.append('..' + os.sep )
from src import default

from src.data import download as dl, tokenization as tkn, custom_dataset as cd

from torch.utils.data import DataLoader
from src.model.transformer_hf import TransformerModel
from src.model.batching import CustomBatch
from src.model.generate_text import gen_some_text
#from src.model.train_evaluate import train, evaluate
#from src.model.transformer import make_gpt_model # imports don't work

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Parameters

In [80]:
# ARCHITECTURE
if LOAD_MODEL is None:
    params = {
                'maxLen'     : 10, # maximum sentence length
                'vocabSize'    : None, # None if you want to let tokenizer do its thing
                'emsize'     : 512, # embedding dimension
                'nhid'       : 2048, # the dimension of the feedforward network model in torch.nn.TransformerEncoder
                'nlayers'    : 12, # the number of torch.nn.TransformerEncoderLayer in torch.nn.TransformerEncoder
                'nhead'      : 8, # the number of heads in the multiheadattention models
                'dropout'    : 0.1 # the dropout value
            }

    # TOKENIZER
    tknzerType = 'BPE' # type of tokenizing algorithm
    
else:
    print("TODO")

### Download Dataset

In [81]:
# download data
nbrResults = 10**3 # number of data samples to download
filename = f'arxiv_{nbrResults}'
extension = '.csv'
filepath = default.RAW_DATA_DIR + os.sep + filename + extension

if not os.path.exists(filepath):
    dl.arxiv_api( default.RAW_DATA_DIR, filepath, max_results=nbrResults ) # TODO : CHANGE SO THAT NOT CONSTANTLY LOADING DATA
print(f'>> Using {filename} for training <<')

>> Using arxiv_1000 for training <<


### Format Dataset

Uses a custom dataset class, which is an iterable and callable structure that returns a sample from our dataset. Within this custom dataset, can determine all preprocessing.

In [82]:
# create dataset
dataset = cd.ArxivDataset(filepath)

### Training Tokenizer

Training of a custom tokenizer. Many options possible here, check the tokenizer training functions to try out various strategies. If he tokenizer for the dataset has already been trained, no need to run this again.

In [74]:
if ( (LOAD_MODEL is None) and TRAIN_TOKENIZER):
    trainTokenizer: _ = tkn.train_custom_tokenizer(tknzerType, 
                                                   dataset, 
                                                   filename,
                                                   default.TOK_DIR,
                                                   params['vocabSize'],
                                                   **default.special_token_lst
                                                  )



### Loading Tokenizer and Splitting Datasets

For some reason, torch tokenizers are not callable as trained. This is confusing, but c'est la vie! Instead, need to load it from file it was saved in using the PreTrainedTokenizerFast class (__call__) implemented in here. Once that's done, you can add this tokenizer as a transform to your dataset! Useful.

We also split the dataset here into training, testing and validation datasets.

In [83]:
tknzrFile = default.TOK_DIR + os.sep + filename + '_' + tknzerType + '.json'

# load PreTrainedTokenizerFast, for __call__. __call__ not implemented in
# the base Tokenizer class... that sounds silly, but it is what it is
tknzr = tkn.load_tokenizer(tknzrFile, **default.special_token_lst)

if params['vocabSize'] is None: params['vocabSize'] = tknzr.vocab_size

# set tknzr as the transform
dataset.set_transform( tknzr )

# separate dataset into train, test valid TODO : make into a function
fracTrain, fracTest, fracVal = ( 0.7, 0.2, 0.1)
trainTestVal = [ np.floor(fracTrain*len(dataset))\
                    , np.floor(fracTest*len(dataset))\
                    , len(dataset) - ( np.floor( fracTrain*len(dataset) ) +
                    np.floor( fracTest*len(dataset) ) )
                    ]

trainDataset, testDataset, valDataset =\
        torch.utils.data.random_split(dataset,
                                      [int(x) for x in trainTestVal],
                                      generator=torch.Generator().manual_seed(42) 
                                     )


### Creating DataLoaders

Training is done on batches, so we need a way to extract groupings of the data in the appropriate format for our transformer model.
Note that for transformers which we are training, dataloaders outputs both src (x[:-1] and tgt ([1:]).
The collation of batches for different transformer models we have vary. For HuggingFace it's ( maxLen x batch_size ) whereas I think that the Annotated Transformer has ( batch_size x maxLen ).

NOTE : Do not use the tokenizer before the training if you use num_workers>0!
FastTokenizer does not play nicely with forking if you use it before the forking of your data:
https://stackoverflow.com/questions/62691279/how-to-disable-tokenizers-parallelism-true-false-warning

In [84]:
import math
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class CustomBatch():
    """
    a variant of collate_fn that pads according to the longest sequence in
    a batch of sequences or maxLen. Unclear to me whether this is instantiated
    at every call to dataloader or if it's instantiated along with dataloader.
    For now, all the tensors exist on CPU and are later pushed to the GPU. Needs
    potentially to be changed.
    """

    def __init__(self, data, dim=0, maxLenModel=100, padValue=0, stackDim=1):
        """
        Input:
            data (dataset)      : a batch of dataset.
            dim (int)           : the dimension to be padded (dimension of time in sequences)
            maxLenModel (int)   : maixmum length of sentence
            padValue (int)      : the value for padding.
            stackDim (int)      : dimension along which to stack the data in tensor.
                                    1 for huggingface, 0 for annotated transformer
        """
        self.dim = dim; self.padValue = padValue

        max_len_seq = np.max( [ x.shape[self.dim] for x in data ] )
        self.maxLen = np.min( [max_len_seq, maxLenModel] )
        
        # pad according to max_len
        batch = [self.pad_tensor(x[:self.maxLen]) for x in data ]
        #pad_mask = [  ]
        # stack all, change to dim = 0 for annotated transformer?
        self.src = (torch.stack([x[:-1] for x in batch], dim=stackDim)).long()
        self.tgt = (torch.stack([x[1:] for x in batch], dim=stackDim)).long()
        #self.pad_mask = (torch.stack([x[1:] for x in batch], dim=stackDim)).long()
        #ys = torch.LongTensor(map(lambda x: x[1], batch))

    def pad_tensor(self, vec):
        """
        padding a tensor which represents a batch

        Input:
            vec : tensor to pad

        Output:
            a new tensor padded to 'pad' in dimension 'dim'
        """
        padSize = list(vec.shape)
        padSize[self.dim] = self.maxLen - vec.size(self.dim)
        return torch.cat([vec, self.padValue*torch.ones(*padSize)], dim=self.dim)

    def pin_memory(self):
        self.src = self.src.pin_memory()
        self.tgt = self.tgt.pin_memory()
        return self


In [85]:
# create dataloaders
# uses collate function to transform batch to correct dimensions
def collate_wrapper(batch):
    return CustomBatch(batch,
                       dim = 0,
                       maxLenModel = params['maxLen'],
                       padValue = tknzr.get_vocab()["<pad>"]
                      )

# dataloader for training
trainDataLoader = DataLoader(trainDataset,
                             batch_size = batch_size,
                             shuffle = True,
                             num_workers = 2,
                             collate_fn = collate_wrapper,
                             pin_memory = True
                             )
# dataloader for validation
valDataLoader = DataLoader(valDataset, 
                           batch_size = val_batch_size,
                           shuffle = True,
                           num_workers = 2,
                           collate_fn = collate_wrapper,
                           pin_memory = True,
                          )

### Selecting model

Here we choose which model we shall use for training. For now, I've selected the black box Transformer from HuggingFace because the collate_fn I've written gives the correct input size force it... however this can easily be changed! 

In [87]:
# transformer from huggingface
# TODO : Change to the Annotated Transformer if I want
model = TransformerModel(params['vocabSize'],
                         params['emsize'], 
                         params['nhead'], 
                         params['nhid'], 
                         params['nlayers'], 
                         params['dropout']
                        ).to(device)

# criterion
criterion = torch.nn.CrossEntropyLoss()

# optimizer
# learning rate Matt used with Adam is 0.5
paramsAdam  = [{'params' : model.parameters(),
                'lr'     : 0.5, 'betas' : (0.9, 0.999),
                'eps'    : 1e-08, 'weight_decay' : 0.0
               }
              ]
paramsAdamW = [{'params' : model.parameters(),
                'lr'     : 5e-5,
                'betas'  : (0.9, 0.999),
                'eps'    : 1e-08,
                'weight_decay' : 0.0
               }
              ]
paramsSGD   = [{'params'    : model.parameters(),
                'lr'        : 0.5,
                'momentum'  : 0.0,
                'dampening' : 0.0,
                'weight_decay' : 0.0
               }
              ]

optimizer = torch.optim.SGD( paramsSGD )
#optimizer = torch.optim.Adam( paramsAdam )
#optimizer = torch.optim.AdamW( paramsAdamW )

# scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

### Training

Training loop!

In [89]:
def train( model, maxLen, dataLoader, device, vocabSize, epoch, optimizer_, scheduler_, criterion_ ):
    """
    Training loop that takes batches from dataLoader and pushes them to device
    to train. Will check if they're the same size of maxLen: if shorter, will
    reduces to longest length in batch. then trains according to optimizer,
    criterion and schedule.

    Input
        model (instance)        : model that is being trained
        maxLen (int)            : maximum sentence length
        dataLoader (instance)   : dataloader that batches data into tensors
        optimizer (instance)    : Not sure what type optimizers are
        criterion               :
        device (str)            : gpu or cpu
    Output
        None
    """

    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(maxLen).to(device)
    nbr_batches = len(dataLoader)
    for i, batch in enumerate(dataLoader):
        src = (batch.src).to(device); tgt = (batch.tgt).to(device)

        optimizer_.zero_grad()
        if src.size(0) != maxLen:
            src_mask = model.generate_square_subsequent_mask(src.size(0)).to(device)
        
        output = model(src, src_mask)
        loss = criterion_(output.view(-1, vocabSize), tgt.reshape(-1))
        loss.backward()
        torch.torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer_.step()

        total_loss += loss.item()
        log_interval = 5
        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            elapsed = ( time.time() - start_time ) * 1000 / log_interval
            loss_exp = math.exp(cur_loss)
            last_lr = scheduler_.get_last_lr()[0]
            print(f'| epoch {epoch:3d} | {i:5d}/{nbr_batches:5d} batches | lr {last_lr:02.2f}'
            + f'| ms/batch {elapsed:5.2f} | loss {cur_loss:5.2f} | ppl {loss_exp:8.2f}'
                 )
            total_loss = 0
            start_time = time.time()


# evaluation function outside of training - same as hugging face
def evaluate(eval_model, maxLen, dataLoader, nbrSamples, device, vocabSize, criterion_):
    """
    Takes a trained model, puts it in evaluation mode to see how well it
    performs on another set of data.

    Input
        eval_model (instance)   : model to be evaluated
        maxLen (int)            : maximum length possible/trained on
        dataLoader (instance)   : dataloader of the dataset that is evaluate on
        nbrSamples (int)        : Supposed to be number of samples, not sure I need
    Output
        loss of evaluated set
    """
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    src_mask = eval_model.generate_square_subsequent_mask(maxLen).to(device)
    with torch.no_grad():
        for batch in dataLoader:
            src = (batch.src).to(device); tgt = (batch.tgt).to(device)
            if src.size(0) != maxLen:
                src_mask = eval_model.generate_square_subsequent_mask(
                                                    src.size(0)).to(device)
            output = eval_model(src, src_mask)
            output_flat = output.view(-1, vocabSize)
            #total_loss += len(src) * criterion_(output_flat
            total_loss += criterion_(output_flat, tgt.reshape(-1)).item()
    return total_loss / (nbrSamples - 1) # nbrSamples -x-> len(dataLoader)

def save_model(model, tknzrFile, modelParams, tknzrParams, full=True):
    return 0

In [88]:
for i, batch in enumerate(trainDataLoader):
    if i < 1:
        print(batch.src)
        print(batch.tgt)

tensor([[   1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [3731, 4652, 2412, 1717,  749,  289, 1691, 2054,  988, 4764, 2799, 4988,
          802,  611,  122, 5378, 2097,  623, 2533,  897, 1357,  122, 2109,  122,
         3352,  122, 3603, 1509,  122, 4897,  658, 2795],
        [ 134, 1060,  658,  844,  116,  988,  122,  557,  116,  134,  116, 4429,
         3958,  116,  107,  139,  116,  775,  122, 1435, 2886,  107,  116,  107,
          199,  107, 2712, 1619,  456, 1397,  471, 4170],
        [ 429, 1364,  139, 1546, 1047, 3952,  107,  678,  647,  122,  145,    2,
          139, 1706,  641, 4076,  646,  107, 1263,  107, 4492,  111,  145,  262,
          164,  262,  116,  116,  139,  134,  110, 4165],
        [ 107,  425,  315, 1500, 2927, 5415, 1996,  313,  407, 2354, 1936,    0,
         5213,  201, 1862, 5362,  139, 

In [None]:
if TRAIN:
    best_val_loss = float("inf")
    best_model = None
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model, 
              params['maxLen'],
              trainDataLoader,
              device,
              params['vocabSize'],
              epoch,
              optimizer,
              scheduler,
              criterion
             )
        val_loss = evaluate(model,
                            params['maxLen'],
                            valDataLoader,
                            len(valDataset),
                            device,
                            params['vocabSize'],
                            criterion
                           )
        print('-' * 89)
        timing = (time.time() - epoch_start_time)
        exp_val = math.exp(val_loss)
        print(f'| end of epoch {epoch:3d} | time: {timing:5.2f}s | valid loss {val_loss:5.2f} | valid ppl {exp_val:8.2f}')
                                         # Why is math.exp so large????
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model

        scheduler.step()


    # save best model (two methods)
    modelFull = default.MODEL_DIR + os.sep + f'{filename}_epoch{epochs}.pth'
    modelWeights = default.MODEL_DIR + os.sep + f'{filename}_weights_epoch{epochs}.pth'
    # approach 1: save model (class) entirely (uses pickle)
    torch.save(model, modelFull)
    # approach 2: save model weights
    torch.save(model.state_dict(), modelWeights)

9
9
9
9
9
9
| epoch   1 |     5/   22 batches | lr 0.50| ms/batch 480.48 | loss 10.06 | ppl 23431.06
9
9
9
9
9
| epoch   1 |    10/   22 batches | lr 0.50| ms/batch 341.34 | loss  7.70 | ppl  2207.77
9
9
9
9
9
| epoch   1 |    15/   22 batches | lr 0.50| ms/batch 347.03 | loss  7.29 | ppl  1459.78
9
9
9
9
9
| epoch   1 |    20/   22 batches | lr 0.50| ms/batch 344.69 | loss  7.14 | ppl  1260.26
9
-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  8.51s | valid loss  0.29 | valid ppl     1.33
-----------------------------------------------------------------------------------------
9
9
9
9
9
9
| epoch   2 |     5/   22 batches | lr 0.47| ms/batch 496.05 | loss  8.27 | ppl  3902.16
9
9
9
9
9
| epoch   2 |    10/   22 batches | lr 0.47| ms/batch 361.53 | loss  6.93 | ppl  1018.27
9
9
9
9
9
| epoch   2 |    15/   22 batches | lr 0.47| ms/batch 340.61 | loss  7.04 | ppl  1136.56
9
9
9
9
9
| epoch   2 |    20/   22 batches | l

### Text Generation

Here I've simply taken the code Matt uses to generate text.

In [46]:
customFilename = 'arxiv_1000'
customEpochs = 10
modelFullPath = default.MODEL_DIR + os.sep + f'{customFilename}_epoch{customEpochs}.pth'
modelWeightsPath = default.MODEL_DIR + os.sep + f'{customFilename}_weights_epoch{customEpochs}.pth'

tknzrFile = default.TOK_DIR + os.sep + customFilename + '_' + tknzerType + '.json'

# load PreTrainedTokenizerFast, for __call__. __call__ not implemented in
# the base Tokenizer class... that sounds silly, but it is what it is
tknzr = tkn.load_tokenizer(tknzrFile, **default.special_token_lst)

if params['vocabSize'] is None: params['vocabSize'] = tknzr.vocab_size

# approach 1: load model (class) entirely (uses pickle)
modelFullLoad = torch.load(modelFullPath, map_location=device)

# approach 2: load model weights, need to have some parameter or something 
#modelLoad = TransformerModel(vocabSize, emsize, nhead, nhid, nlayers, dropout).to(device)
#modelWeightsLoad = modelLoad.load_state_dict( torch.load(modelWeightsPath) )

model = modelFullLoad

In [54]:
# inspect both models
#print('model_A info...\n', modelFullLoad)
#print('\nmodel_B info...\n', modelWeightsLoad)

#print('model_A == model_B:', modelFullLoad == modelWeightsLoad)
#model = modelFullLoad
# Text generation example
prompt = 'Electron'
ngen = 35
decode_style = 'greedy'
model.to('cpu')
generated_text = gen_some_text(model, tknzr, 'cpu', params['maxLen'], text_prompt=prompt, tokens_to_gen=ngen, vis=False,
    decode_style=decode_style)
print("Text prompt:\n", prompt)
print("Number of tokens to generate:", ngen)
print("Generated_text:\n", generated_text)

# TODO: alternative generation
# currently 'greedy method'
# see: https://huggingface.co/blog/how-to-generate

<s>  Electron
tensor([ 146.7562,  -16.5392, -102.9582,  ...,  -18.1525,   25.8854,
          -7.8421], grad_fn=<SelectBackward>)
Electron <pad>
<s>  Electron <pad>
tensor([ 146.7562,  -16.5392, -102.9582,  ...,  -18.1525,   25.8854,
          -7.8421], grad_fn=<SelectBackward>)
Electron <pad> <pad>
<s>  Electron <pad> <pad>
tensor([ 146.7562,  -16.5392, -102.9582,  ...,  -18.1525,   25.8854,
          -7.8421], grad_fn=<SelectBackward>)
Electron <pad> <pad> <pad>
<s>  Electron <pad> <pad> <pad>
tensor([ 146.7562,  -16.5392, -102.9582,  ...,  -18.1525,   25.8854,
          -7.8421], grad_fn=<SelectBackward>)
Electron <pad> <pad> <pad> <pad>
<s>  Electron <pad> <pad> <pad> <pad>
tensor([ 146.7562,  -16.5392, -102.9582,  ...,  -18.1525,   25.8854,
          -7.8421], grad_fn=<SelectBackward>)
Electron <pad> <pad> <pad> <pad> <pad>
<s>  Electron <pad> <pad> <pad> <pad> <pad>
tensor([ 146.7562,  -16.5392, -102.9582,  ...,  -18.1525,   25.8854,
          -7.8421], grad_fn=<SelectBackward>)
E

tensor([ 146.7562,  -16.5392, -102.9582,  ...,  -18.1525,   25.8854,
          -7.8421], grad_fn=<SelectBackward>)
Electron <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Text prompt:
 Electron
Number of tokens to generate: 35
Generated_text:
 Electron <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
