In [1]:
#!pip install --user tensorboardX

In [2]:
# code based off of 
# https://github.com/mandubian/pytorch_math_dataset and
# https://github.com/lucidrains/reformer-pytorch

import math
import numpy as np
import torch
from torch.utils import data
import torch.optim as optim
import tqdm as tqdm
import random
from datetime import datetime
from apex import amp
import pickle


import mandubian.math_dataset
from mandubian.math_dataset import MathDatasetManager
from mandubian.transformer import Constants

# from transformer.Models import Transformer
from mandubian.math_dataset import (
    random_split_dataset,
    question_answer_to_mask_batch_collate_fn
)
from mandubian.math_dataset import np_encode_string, np_decode_string
import mandubian.model_process
import mandubian.utils
from mandubian.tensorboard_utils import Tensorboard
from mandubian.tensorboard_utils import tensorboard_event_accumulator

import mandubian.checkpoints

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

from datetime import datetime

print("Torch Version", torch.__version__)

%load_ext autoreload
%autoreload 2

Using backend: pytorch


Torch Version 1.5.0


# Check hardware

In [3]:
seed = 1
print(torch.cuda.device_count(), "detected CUDA devices")
cuda_device = torch.cuda.current_device()
print("Using CUDA device: ", cuda_device)
print(torch.cuda.get_device_name(cuda_device))

torch.manual_seed(seed)
device = torch.device("cuda")
print("device", device)

1 detected CUDA devices
Using CUDA device:  0
GeForce RTX 2080
device cuda


# Reformer library

In [4]:
from lucidrains_reformer.reformer_pytorch import ReformerLM, Autopadder, Recorder
from lucidrains_reformer.reformer_pytorch import ReformerEncDec
from lucidrains_reformer.reformer_pytorch.generative_tools import TrainingWrapper

# Initialize Math Dataset Manager

In [5]:
mdsmgr = MathDatasetManager(
  "/home/jonathan/Repos/final_year_at_ic/awesome_project/mathematics_dataset-v1.0/"
)
# Examine dataset structure
print("mdsmgr structure", dir(mdsmgr))

initialized MultiFilesMathDataset with categories ['algebra', 'numbers', 'polynomials', 'comparison', 'arithmetic', 'measurement', 'probability', 'calculus'] and types ['train-easy', 'train-medium', 'train-hard', 'interpolate', 'extrapolate']
mdsmgr structure ['__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_build_datasets_from_category', 'build_dataset_from_categories', 'build_dataset_from_category', 'build_dataset_from_module', 'build_dataset_from_modules', 'dfs', 'dirs', 'get_categories', 'get_modules_for_category', 'get_types', 'root_dir']


In [6]:
# print(MathDatasetManager.__dir__
mdsmgr._build_datasets_from_category

<bound method MathDatasetManager._build_datasets_from_category of <mandubian.math_dataset.MathDatasetManager object at 0x7efb71342c18>>

### Check availables types, problem categories and problem subcategories

In [7]:
print("types", list(mdsmgr.get_types()))
print("categories", list(mdsmgr.get_categories()))
print("modules of arithmetic", mdsmgr.get_modules_for_category('arithmetic'))


types ['train-easy', 'train-medium', 'train-hard', 'interpolate', 'extrapolate']
categories ['algebra', 'numbers', 'polynomials', 'comparison', 'arithmetic', 'measurement', 'probability', 'calculus']
modules of arithmetic dict_keys(['div', 'nearest_integer_root', 'mul_div_multiple', 'mul', 'add_or_sub', 'add_sub_multiple', 'mixed', 'add_or_sub_in_base', 'simplify_surd', 'add_or_sub_big', 'add_sub_multiple_longer', 'mixed_longer', 'div_big', 'mul_div_multiple_longer', 'mul_big'])


### Ways to manipulate dataset

In [8]:
# # Build Dataset from a single module in a category
ds = mdsmgr.build_dataset_from_module('arithmetic', 'add_or_sub', 'train-easy')
print("size", len(ds))

# # Build Dataset from a single module in a category with limited number of elements
# ds = mdsmgr.build_dataset_from_module('arithmetic', 'add_or_sub', 'train-easy', max_elements=1000)
# print("size", len(ds))

# # Build Dataset from several modules in a category
# ds = mdsmgr.build_dataset_from_modules('arithmetic', ['add_or_sub', 'add_sub_multiple'], 'train-easy')
# print("size", len(ds))

# # Build Dataset from all modules in a category
# ds = mdsmgr.build_dataset_from_category('arithmetic', 'train-easy')
# ds = mdsmgr.build_dataset_from_category('arithmetic', 'interpolate')
# print("size", len(ds))

# # Build Dataset from all modules in several categories
# ds = mdsmgr.build_dataset_from_categories(['arithmetic', 'polynomials'], 'train-easy')
# print("size", len(ds))

# # 

size 666666


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [9]:
# Pytorch initialization

# Start baseline

In [10]:
exp_name = "baseline_64"
now = datetime.now()
unique_id = now.strftime("%m-%d-%Y_%H-%M-%S")
base_dir = "/home/jonathan/Repos/final_year_at_ic/awesome_project/code/tests/"

## Constants

In [11]:
from mandubian.math_dataset import (
    VOCAB_SZ, MAX_QUESTION_SZ, MAX_ANSWER_SZ
)

NUM_CPU_THREADS = 12
BATCH_SIZE = 128
NUM_BATCHES = int(1e5)
BATCH_SIZE = 32
GRADIENT_ACCUMULATE_EVERY = 1
LEARNING_RATE = 1e-4
VALIDATE_EVERY  = 20
GENERATE_EVERY  = 60
GENERATE_LENGTH = 32

# hyperparameters need updates

Q_SEQ_LEN = 256
A_SEQ_LEN = 30 # unused due to requirements of axial_positon_shape
NUM_TOKENS = VOCAB_SZ + 1
D_MODEL = 512
EMB_DIM = D_MODEL
NUM_HEADS = 8
QKV_DIM = D_MODEL / NUM_HEADS
NUM_LAYERS = 6
D_FF = 2048


### Get training and test data

In [12]:
# training data
# training_data = mdsmgr.build_dataset_from_category('arithmetic','train-easy') # for now
training_data = mdsmgr.build_dataset_from_modules('arithmetic', ['add_or_sub', 'add_sub_multiple'], 'train-easy')

# testing data
# testing_data_interpolate = mdsmgr.build_dataset_from_category('arithmetic','interpolate')
# testing_data_extrapolate = mdsmgr.build_dataset_from_category('arithmetic','extrapolate')

testing_data_interpolate = mdsmgr.build_dataset_from_modules('arithmetic', ['add_or_sub', 'add_sub_multiple'], 'interpolate', max_elements = 1024)
# testing_data_extrapolate = mdsmgr.build_dataset_from_modules('arithmetic', ['add_or_sub', 'add_sub_multiple'], 'extrapolate')


In [13]:
# from lucidrains_reformer.examples.enwik8_simple.train
# helpers

def cycle(loader):
    while True:
        for data in loader:
            yield data

def decode_token(token):
    return str(chr(max(32, token)))

def decode_tokens(tokens):
    return ''.join(list(map(decode_token, tokens)))

def get_non_pad_mask(seq):
    # returns true when token is not PAD and false otherwise
    assert seq.dim() == 2
    return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1)

# get data splits
train_ds, val_ds = mandubian.math_dataset.random_split_dataset(training_data,split_rate=0.9)

# get pytorch dataloaders
# Questions are padded in question_answer_to_position_batch_collate_fn
train_loader = data.DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_CPU_THREADS,
    collate_fn=question_answer_to_mask_batch_collate_fn, pin_memory = True)
train_loader = cycle(train_loader)

val_loader = data.DataLoader(
    val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_CPU_THREADS,
    collate_fn=question_answer_to_mask_batch_collate_fn, pin_memory = True)
val_loader = cycle(val_loader)

# for viewing output sequences
gen_loader = data.DataLoader(
    val_ds, batch_size=1, shuffle=False, num_workers=NUM_CPU_THREADS,
    collate_fn=question_answer_to_mask_batch_collate_fn, pin_memory = True)
gen_loader = cycle(gen_loader)

interpolate_loader = data.DataLoader(
    testing_data_interpolate, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_CPU_THREADS,
    collate_fn=question_answer_to_mask_batch_collate_fn, pin_memory = True)
interpolate_loader = cycle(interpolate_loader)

# extrapolate_loader = data.DataLoader(
#     testing_data_extrapolate, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_CPU_THREADS,
#     collate_fn=question_answer_to_mask_batch_collate_fn)
# extrapolate_loader = cycle(extrapolate_loader)


### Model

In [14]:
# define model

enc_dec = ReformerEncDec(
    dim = D_MODEL,
    enc_num_tokens = NUM_TOKENS,
    enc_depth = NUM_LAYERS,
    enc_max_seq_len = Q_SEQ_LEN,
    dec_num_tokens = NUM_TOKENS,
    dec_depth = NUM_LAYERS,
    dec_max_seq_len = Q_SEQ_LEN,
    # heads = 8 by default
    axial_position_shape = (16, 16),  # the shape must multiply up to the max_seq_len (128 x 64 = 8192)
    axial_position_dims = (256,256),   # the dims must sum up to the model dimensions (512 + 512 = 1024)
    pad_value = Constants.PAD,
    ignore_index = Constants.PAD # see if this works. pad_value and ignore_index are probably different
).cuda()

# enc_dec = Recorder(enc_dec)
enc_dec.to(device)


ReformerEncDec(
  (enc): TrainingWrapper(
    (net): Autopadder(
      (net): ReformerLM(
        (token_emb): Embedding(96, 512, padding_idx=0)
        (to_model_dim): Identity()
        (pos_emb): AxialPositionalEncoding(
          (weights): ParameterList(
              (0): Parameter containing: [torch.cuda.FloatTensor of size 1x4x1x256 (GPU 0)]
              (1): Parameter containing: [torch.cuda.FloatTensor of size 1x1x64x256 (GPU 0)]
          )
        )
        (reformer): Reformer(
          (layers): ReversibleSequence(
            (blocks): ModuleList(
              (0): ReversibleBlock(
                (f): Deterministic(
                  (net): PreNorm(
                    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                    (fn): LSHSelfAttention(
                      (toqk): Linear(in_features=512, out_features=512, bias=False)
                      (tov): Linear(in_features=512, out_features=512, bias=False)
                      (to_out)

## Optimizer learning rate scheduler, mixed precision setup

In [15]:
optimizer = optim.Adam(enc_dec.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.995), eps=1e-9)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.3, patience=100, verbose=True)

# mixed precision
enc_dec, optimizer = amp.initialize(enc_dec, optimizer, opt_level='O2')

Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


# Train

In [16]:
# for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
i = 0

train_loss_list = []
val_loss_list = []

In [None]:
while True:
    
    # exclude the 0th element as it is BOS
    
    if (i % GENERATE_EVERY) - 1 == 0:
        enc_dec.eval()
        gen_qs, gen_qs_mask, gen_as, gen_as_mask = next(gen_loader)
        prime = np_decode_string(gen_qs.numpy())
        print('*' * 100, "\nQuestion: ", prime)
        print("Actual Answer: ", np_decode_string(gen_as.numpy()))
        gen_qs = gen_qs.to(device, non_blocking=True)
        gen_as = gen_as.to(device, non_blocking=True)
        gen_qs_mask = gen_qs_mask.to(device, non_blocking=True)
        with torch.no_grad():
            sample = enc_dec.generate(gen_qs, gen_as[:,0:1], GENERATE_LENGTH, enc_input_mask = gen_qs_mask, dec_eos_token=Constants.EOS)
        sample = sample.cpu().numpy()
        output_str = np_decode_string(sample)
        print("Decoded Prediction: ", output_str)
        np.savetxt(base_dir + "logs/" + exp_name + "_" + unique_id + "-train_loss.txt", train_loss_list)
        np.savetxt(base_dir + "logs/" + exp_name + "_" + unique_id + "-val_loss.txt", val_loss_list)
            

    enc_dec.train()
    train_loss_record = 0
    for __ in range(GRADIENT_ACCUMULATE_EVERY):
        batch_qs, batch_qs_mask, batch_as, batch_as_mask = map(lambda x: x.to(device, non_blocking=True), next(train_loader))
        train_loss = enc_dec(batch_qs, batch_as, return_loss = True, enc_input_mask = batch_qs_mask)
        del batch_qs, batch_qs_mask, batch_as, batch_as_mask
        with amp.scale_loss(train_loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        train_loss_record += float(train_loss)
        del train_loss

#     if i % GRADIENT_ACCUMULATE_EVERY == 0:
    train_loss_record /= 4
    print("Step ", i, "\t", f'training loss: {train_loss_record}', "\t", datetime.now().time() )
    train_loss_list.append((i, train_loss_record))
    torch.nn.utils.clip_grad_norm_(enc_dec.parameters(), 0.5)
    optimizer.step()
    optimizer.zero_grad()
    scheduler.step(train_loss_record)

    if i % VALIDATE_EVERY == 0:
        enc_dec.eval()
        val_batch_qs, val_batch_qs_mask, val_batch_as, val_batch_as_mask = map(lambda x: x.to(device, non_blocking=True), next(val_loader))
        with torch.no_grad():
            val_loss = enc_dec(val_batch_qs, val_batch_as, return_loss = True, enc_input_mask = val_batch_qs_mask)
            print(f'validation loss: {val_loss.item()}')
            val_loss_list.append((i, val_loss.item()))
    i+=1


Step  0 	 training loss: 1.2412109375 	 20:09:50.942948
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
validation loss: 5.04296875
**************************************************************************************************** 
Question:  !Evaluate 13 + (4 + -16 - 9)."                                                                                                                                                                                                                                  
Actual Answer:  !-8"                                                                                                                                                                                                                                                            
Decoded Prediction:  ^{AkeW&$0c~ {^)K^) #8U&,twPAj\Aj
Step  1 	 training loss: 1.2724609375 	 20:09:57.028725
Step  2 	 training loss: 1.0400390625 	 20:09:59.912472
Step  3 	 training loss: 0.

In [None]:
plt.plot([train_loss_list[1]])
plt.show