In [6]:
IN_COLAB = 'google.colab' in str(get_ipython())

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    %cd /content/drive/MyDrive/Github/Abstract-generator/bumbleBERT/notebooks
    
    %%capture
    !pip install feedparser tokenizers transformers;

In [7]:
import copy
import csv
import math
import os
import re
import sys
import time

import numpy as np
import torch

# Parent of the current working directory; it is put on sys.path so that the
# project-local `src` package imported below can be found.
PACKAGE_ROOT = os.path.dirname(os.path.abspath(''))
print(PACKAGE_ROOT)
# Guarded so that re-running this cell does not keep appending duplicate entries.
if PACKAGE_ROOT not in sys.path:
    sys.path.append(PACKAGE_ROOT)


# NOTE(review): later cells build paths from `default.MODEL_DIR`, but no name `default`
# is imported in this notebook (only `settings` is) -- confirm which module is meant.
from src import settings
import src.data.dataset_class_alt as dsc
import src.data.dataloader_class_alt as dlc

from src.model.transformer_torch import TransformerModel
from src.model.generate_text import gen_some_text

from src.model.train_evaluate import train_version_jeremy as train
from src.model.train_evaluate import evaluate_version_jeremy as evaluate

#from src.model.transformer import make_gpt_model # imports don't work

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

/media/homes/msmart/Development/repos/Abstract-generator/local_ms


### Parameters

In [8]:
# --- Architecture -----------------------------------------------------------
# maximum sentence length
maxLen = 40
# None if you want to let tokenizer do its thing
vocabSize = None
# embedding dimension
emsize = 512
# the dimension of the feedforward network model in torch.nn.TransformerEncoder
nhid = 2048
# the number of torch.nn.TransformerEncoderLayer in torch.nn.TransformerEncoder
nlayers = 12
# the number of heads in the multiheadattention models
nhead = 8
# the dropout value
dropout = 0.2

# --- Batching / schedule ----------------------------------------------------
batchSize = 10     # 32
valBatchSize = 10  # 32, not used right now.
epochs = 10        # the number of epochs

# True: run the training loop below. False: load a previously saved model instead.
TRAIN = True

### Format Dataset

Uses a custom dataset class: an iterable, callable structure that returns a sample from our dataset. All preprocessing can be defined inside this custom dataset class.

In [9]:
# Tokenizer settings: either train a new tokenizer or reuse one already trained.
flag_tknzr_train = True
flag_tknzr_fast = True
tknzr_type = 'BPE'

# Dataset to work on (swap in the commented line to use the arXiv dataset instead).
#dataset = dsc.ArxivDataset()
dataset = dsc.WikiTextDataset()

# Set up the dataset's tokenizer with the settings above; the return value is not needed.
_ = dataset.tokenizer(
    flag_tknzr_train,
    tknzr_type,
    flag_tknzr_fast=flag_tknzr_fast,
)



### Creating DataLoaders

Training is done on batches, so we need a way to extract groupings of the data in the appropriate format for our transformer model.
Note that for the transformers we are training, the dataloaders output both src (`x[:-1]`) and tgt (`x[1:]`).
The collation of batches varies between the transformer models we have. For HuggingFace it's ( maxLen x batch_size ), whereas I think that the Annotated Transformer has ( batch_size x maxLen ).

I created a custom Dataloader class that wraps splitting the dataset and also outputs different dataloaders for each.

NOTE : Do not use the tokenizer before the training if you use num_workers>0!
FastTokenizer does not play nicely with forking if you use it before the forking of your data:
https://stackoverflow.com/questions/62691279/how-to-disable-tokenizers-parallelism-true-false-warning

In [10]:
# Wrap the dataset in the project's CustomDataloader. Later cells read
# `dataloader.train`, `dataloader.valid` and `dataloader.dataset_valid` from it.
dataloader = dlc.CustomDataloader(dataset, batchSize, maxLen)

### Selecting model

Here we choose which model we shall use for training. For now, I've selected the black box Transformer from HuggingFace because the collate_fn I've written gives the correct input size for it... however this can easily be changed! 

In [11]:
# Model: TransformerModel from src.model.transformer_torch.
# NOTE(review): this was labelled "transformer from huggingface", but the hyper-parameter
# comments refer to torch.nn.TransformerEncoder -- confirm which implementation it is.
# TODO : Change to the Annotated Transformer if I want
model = TransformerModel(dataset.vocab_size, emsize, nhead, nhid, nlayers, dropout).to(device)

# criterion
# NOTE(review): no ignore_index is set, so if batches are padded the pad tokens count
# towards the loss; the commented-out argument suggests this was considered -- confirm.
criterion = torch.nn.CrossEntropyLoss()#ignore_index=tknzr.get_vocab()["<pad>"])

# optimizer: one parameter-group definition per candidate optimizer; only the group that
# is handed to an optimizer below is actually used.
paramsAdam  = [{'params' : model.parameters(), 'lr' : 1e-3, 'betas' : (0.9, 0.999), 'eps' : 1e-08, 'weight_decay' : 0.0}]
paramsAdamW = [{'params' : model.parameters(), 'lr' : 5e-5, 'betas' : (0.9, 0.999), 'eps' : 1e-08, 'weight_decay' : 0.0}]
paramsSGD   = [{'params' : model.parameters(), 'lr' : 0.5, 'momentum' : 0.0, 'dampening' : 0.0, 'weight_decay' : 0.0}]

#optimizer = torch.optim.SGD( paramsSGD )
#optimizer = torch.optim.Adam( paramsAdam )
optimizer = torch.optim.AdamW( paramsAdamW )

# scheduler: multiplies the learning rate by gamma after every `step_size` calls to
# scheduler.step(). With step_size=1 and gamma=0.95 the rate shrinks by 5% per call;
# set gamma=1.0 to disable the decay.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

### Training

Training loop!

In [13]:
# fasttokenizer should not be used before forking. Something to figure out.
# Setting this suppresses some warning messages, see
# https://stackoverflow.com/questions/62691279/how-to-disable-tokenizers-parallelism-true-false-warning
# It doesn't seem to affect the timing though.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

if TRAIN:
    best_val_loss = float("inf")
    best_model = None
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model, dataloader.train, device, dataset.vocab_size, epoch, optimizer, scheduler, criterion, maxLen)
        val_loss = evaluate(model, dataloader.valid, device, dataset.vocab_size, criterion, maxLen, len(dataloader.dataset_valid))
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        # NOTE(review): the perplexity is huge because val_loss itself is large (about 24.9
        # in the recorded run versus a training loss of about 6.4). That points at how
        # evaluate() normalises the loss rather than at math.exp -- TODO confirm.
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Take an independent snapshot. `best_model = model` would only alias the live
            # model, so the "best" model would silently follow every later epoch. The
            # snapshot is kept on the CPU: it does not occupy accelerator memory and can be
            # used directly by the CPU-based text generation cell.
            best_model = copy.deepcopy(model).to('cpu')

        # NOTE(review): `scheduler` is also passed to train(); confirm it is not stepped
        # there as well, otherwise the learning rate decays twice per epoch.
        scheduler.step()

    # save final and best model (two methods)
    # NOTE(review): `default` is not imported anywhere in this notebook (only `settings`
    # is), so these lines raise NameError on a fresh kernel -- confirm the right module.
    modelFull = default.MODEL_DIR + os.sep + f'{dataset.name}_epoch{epochs}.pth'
    # modelWeights is defined for symmetry but is currently not written to disk.
    modelWeights = default.MODEL_DIR + os.sep + f'{dataset.name}_weights_epoch{epochs}.pth'
    modelFullBest = default.MODEL_DIR + os.sep + f'{dataset.name}_epoch{epochs}_best.pth'
    modelWeightsBest = default.MODEL_DIR + os.sep + f'{dataset.name}_weights_epoch{epochs}_best.pth'
    # approach 1: save model (class) entirely (uses pickle)
    torch.save(model, modelFull)
    # best_model stays None when no epoch improved on +inf (e.g. epochs == 0 or a NaN
    # validation loss); skip the "best" files instead of failing in that case.
    if best_model is not None:
        torch.save(best_model, modelFullBest)
        # approach 2: save model weights
        torch.save(best_model.state_dict(), modelWeightsBest)

| epoch   1 |   200/  486 batches | lr 0.00 | ms/batch 73.45 | loss  6.46 | ppl   636.57
| epoch   1 |   400/  486 batches | lr 0.00 | ms/batch 71.06 | loss  6.36 | ppl   579.88
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 36.44s | valid loss 24.93 | valid ppl 67422864729.52
-----------------------------------------------------------------------------------------
| epoch   2 |   200/  486 batches | lr 0.00 | ms/batch 74.08 | loss  6.21 | ppl   498.85
| epoch   2 |   400/  486 batches | lr 0.00 | ms/batch 72.82 | loss  6.17 | ppl   478.02
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 36.98s | valid loss 24.43 | valid ppl 40648889990.39
-----------------------------------------------------------------------------------------


KeyboardInterrupt: 

### Text Generation

Here I've simply taken the code Matt uses to generate text.

In [None]:
if not TRAIN:
    # Load a previously saved model instead of training one in this session.
    customFilename = 'arxiv_10000'
    customEpochs = 10
    # NOTE(review): `default` is not imported anywhere in this notebook (only `settings`
    # is), so these lines raise NameError on a fresh kernel -- confirm the right module.
    modelFull = default.MODEL_DIR + os.sep + f'{customFilename}_epoch{customEpochs}_best.pth'
    modelWeights = default.MODEL_DIR + os.sep + f'{customFilename}_weights_epoch{customEpochs}_best.pth'

    # approach 1: load model (class) entirely (uses pickle)
    # Unpickling executes code stored in the file: only load checkpoints you trust.
    modelFullLoad = torch.load(modelFull, map_location=device)

    # approach 2: rebuild the architecture, then load the saved weights into it.
    # The vocabulary size comes from the dataset (the `vocabSize` setting is None and
    # cannot size the model), exactly as when the training model is built.
    # NOTE(review): the checkpoint name says 'arxiv' while the dataset cell builds a
    # WikiTextDataset; the vocabulary sizes must match for the weights to load -- confirm.
    modelLoad = TransformerModel(dataset.vocab_size, emsize, nhead, nhid, nlayers, dropout).to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    # load_state_dict returns a report of missing/unexpected keys, not a model: the
    # usable model is `modelLoad`, while `modelWeightsLoad` only holds that report.
    modelWeightsLoad = modelLoad.load_state_dict( torch.load(modelWeights, map_location=device) )

In [None]:
# Text generation example

# Pick the model to sample from: the best model of this session's training run, or the
# fully pickled model loaded from disk when training was skipped. (`modelLoad`, rebuilt
# from the saved weights, could be used instead of `modelFullLoad`.)
gen_model = best_model if TRAIN else modelFullLoad

prompt = 'The dog ran'
ngen = 100
decode_style = 'sample_topp' #greedy, sample_topp

# Generation is run on the CPU, so move the model that is actually passed below.
gen_model.to('cpu')
# Switch off dropout for inference (harmless if gen_some_text already does this).
gen_model.eval()
generated_text = gen_some_text(
    gen_model, dataset.transform, 'cpu', maxLen, text_prompt=prompt, tokens_to_gen=ngen, vis=False,
    decode_style=decode_style)
print("Text prompt:\n", prompt)
print("Number of tokens to generate:", ngen)
print("Generated_text:\n", generated_text)

# TODO: alternative generation
# the decoding method is chosen by `decode_style` above ('greedy' or 'sample_topp')
# see: https://huggingface.co/blog/how-to-generate