In [2]:
import numpy as np
import time
import torch
import torch
import sys
sys.path.append('pytorch')
sys.path.append('pytorch/utils')

from pytorch.mem_transformer import *
from pytorch import data_utils

from experiment_utils.run_experiment import *
from experiment_utils.generate_data import *

## Variables

In [3]:
from sklearn.model_selection import ParameterGrid

# --- Run identification -------------------------------------------------
# TAG is embedded in log-file and model names; TASK_NAME selects the dataset files.
TAG = '10tkn_len24_ext'
TASK_NAME = 'copy'

# --- Dataset split sizes and number of random initialisations -----------
# VAL_SIZE and TEST_SIZE are also used as loader batch sizes further down.
TRAIN_SIZE, VAL_SIZE, TEST_SIZE = 1000, 200, 100
NUM_INITS = 1

# --- Training schedule ---------------------------------------------------
NUM_BATCHES = 400_000
BATCH_SIZE = 128
GENERATE_EVERY = 10_000

# --- Vocabulary and sequence lengths -------------------------------------
# 10 + 2; presumably 10 task tokens plus 2 special tokens — TODO confirm.
NUM_TOKENS = 12
ENC_SEQ_LEN, DEC_SEQ_LEN = 24, 48

# Used to build the data directory name (f'data{INPUT_LEN}') and model names.
INPUT_LEN = 24

#### Generate data

In [4]:
# # !mkdir data24
# np.random.seed(42)

# generator = copy_generator(batch_size=BATCH_SIZE, enc_seq_len=ENC_SEQ_LEN, dec_seq_len=DEC_SEQ_LEN, num_tokens=NUM_TOKENS)
# generate_data(generator, path=f'data{INPUT_LEN}', task_name=TASK_NAME, train_size=TRAIN_SIZE, test_size=TEST_SIZE, val_size=VAL_SIZE, batch_size=BATCH_SIZE)  

In [5]:
class data_loader:
    """Endless batch iterator over a dataset stored as ``.npy`` files.

    On construction the arrays ``{path}/{task_name}_X.npy`` and
    ``{path}/{task_name}_y.npy`` are loaded into memory. Each ``next()`` call
    returns the following ``batch_size`` rows as torch tensors and advances an
    internal pointer, which wraps around modulo the dataset size, so the
    iterator never raises ``StopIteration``.

    Parameters
    ----------
    task_name : str
        File-name prefix of the dataset (e.g. ``'copy_train'``).
    path : str
        Directory containing the ``.npy`` files.
    batch_size : int
        Number of rows returned per ``next()`` call.
    none_mask : bool
        When True no mask file is read and both returned masks are ``None``.
        When False ``{path}/{task_name}_mask.npy`` is loaded and a slice of it
        is returned as a CUDA tensor with every batch.
    """

    def __init__(self, task_name, path='data', batch_size=32, none_mask=True):
        self.X = np.load(f'{path}/{task_name}_X.npy')
        self.y = np.load(f'{path}/{task_name}_y.npy')
        self.data_size = self.X.shape[0]
        self.data_ptr = 0

        # The target mask is never loaded from disk.
        self.tgt_mask = None
        if none_mask:
            self.src_mask = None
            # Always define `src_masks` so the attribute exists on both paths.
            self.src_masks = None
        else:
            self.src_masks = np.load(f'{path}/{task_name}_mask.npy')

        self.batch_size = batch_size
        self.none_mask = none_mask

    def __iter__(self):
        """Return self so the loader works with ``for`` loops and ``iter()``."""
        return self

    def __next__(self):
        """Return ``(X, y, src_mask, tgt_mask)`` for the next batch.

        ``X`` and ``y`` are CPU tensors built from the stored arrays. The source
        mask is ``None`` or a CUDA tensor (see ``none_mask``); the target mask
        is always ``None``. A slice that reaches the end of the data is shorter
        than ``batch_size``; the pointer then wraps modulo ``data_size``.
        """
        start = self.data_ptr
        stop = start + self.batch_size

        X = self.X[start:stop]
        y = self.y[start:stop]

        if self.none_mask:
            sm = None
        else:
            # Only the mask is moved to the GPU here; callers move X / y themselves.
            sm = torch.tensor(self.src_masks[start:stop]).cuda()

        self.data_ptr = stop % self.data_size

        return torch.tensor(X), torch.tensor(y), sm, self.tgt_mask

### Run

In [6]:
# Grid of model hyper-parameters to sweep; only `num_mem_tokens` has more than one value.
model_parameters = ParameterGrid(dict(
    n_layer=[2],
    n_head=[4],
    d_head=[128],
    num_mem_tokens=[9, 0],
    mem_len=[0],
))

# First grid entry, used below to build a single model instance.
param = next(iter(model_parameters))

# Keyword arguments shared by every configuration (not part of the sweep).
fixed_parameters = dict(
    n_token=NUM_TOKENS,
    d_model=param['d_head'],  # a `+ param['num_mem_tokens'] - 1` variant was left disabled
    d_inner=param['d_head'],
    dropout=0,
    dropatt=0,
    tie_weight=True,
    div_val=1,  # NOTE(review): the author flagged this value as uncertain
    tie_projs=[False],
    tgt_len=DEC_SEQ_LEN,
    ext_len=0,
    cutoffs=[],
    attn_type=0,
)

# The model is left on the CPU here (append `.cuda()` to move it to the GPU).
model = MemTransformerLM(**param, **fixed_parameters)

# Draw one batch of 21 rows from the training split; the two masks are discarded.
gen_train = data_loader(task_name=f'{TASK_NAME}_train', path=f'data{INPUT_LEN}', batch_size=21)
src, tgt, _, _ = next(gen_train)

# Keep both tensors on the CPU and transpose them.
src = src.cpu().T
tgt = tgt.cpu().T

# Empty memory tuple; the direct call `model(src, tgt.contiguous(), *mems)` is currently disabled.
mems = ()


In [7]:
# Display the model's `mem_tokens` attribute as the cell output.
model.mem_tokens

In [16]:
# Forward pass on the sample batch; `src` is passed as both positional arguments.
# NOTE(review): confirm that using `src` (not `tgt`) as the second argument is intended.
out = model(src, src.contiguous())
# Unpack the result: first element, second element, and everything after them.
mem_tokens, loss, mems = out[0], out[1], out[2:]

  attn_score = attn_score.float().masked_fill(


In [None]:
# Batch iterators over the three pre-generated splits. The validation and test
# loaders use VAL_SIZE / TEST_SIZE as their batch size.
gen_train = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_train', batch_size=BATCH_SIZE)
gen_val = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_val', batch_size=VAL_SIZE)
gen_test = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_test', batch_size=TEST_SIZE)

# All progress messages are appended to this file.
# NOTE(review): the `logs/` directory must already exist; append mode does not create it.
print_file = f'logs/{TASK_NAME}_{TAG}_memory_logs.txt'


def _log(message):
    """Append `message` to `print_file`."""
    with open(print_file, 'a') as f:
        f.write(message)


# Sweep: for every initialisation and every grid entry, build a fresh model,
# train/validate it, test it, and log the wall-clock time of that configuration.
t = time.time()
with torch.cuda.device(0):
    for init_num in range(NUM_INITS):
        _log('\n\nInit number ' + str(init_num) + '\n')
        for i, param in enumerate(model_parameters):
            _log('\n\n' + str(param) + '\n')
            _log(f'{i / len(model_parameters) * 100}%')

            # The grid only contains MemTransformerLM keys (n_layer, n_head,
            # d_head, num_mem_tokens, mem_len), so the model is built the same
            # way as in the setup cell. d_model / d_inner are re-derived from
            # this entry's d_head because `fixed_parameters` was computed from
            # the first grid entry only.
            model_kwargs = {
                **fixed_parameters,
                'd_model': param['d_head'],
                'd_inner': param['d_head'],
                **param,
            }
            model = MemTransformerLM(**model_kwargs).cuda()

            model_name = (
                f"{TASK_NAME}{INPUT_LEN}"
                f"_dim{model_kwargs['d_model']}"
                f"d{param['n_layer']}"
                f"h{param['n_head']}"
                f"M{param['num_mem_tokens']}"
                f"l{model_kwargs['tgt_len']}"
                f"_{TAG}_v{init_num}"
            )

            # NOTE(review): LEARNING_RATE is not defined in the visible cells;
            # presumably it comes from one of the star-imports — confirm.
            optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            # NOTE(review): train_validate_model / test_model come from the
            # experiment_utils star-import; confirm they support the calling
            # convention of MemTransformerLM.
            train_validate_model(model,
                                 train_generator=gen_train,
                                 val_generator=gen_val,
                                 optim=optim,
                                 model_name=model_name,
                                 config=param,
                                 num_batches=NUM_BATCHES,
                                 generate_every=GENERATE_EVERY,
                                 print_file=print_file,
                                 tag=TAG,
                                 overfit_stop=False)
            test_model(model, gen_test, model_name, param, TASK_NAME, tag=TAG)

            _log(f'\nTotal time: {time.time() - t}\n')
            # Restart the timer so each configuration is timed separately.
            t = time.time()