In [1]:
import numpy as np
import time
import torch
import torch
import sys
sys.path.append('pytorch')
sys.path.append('pytorch/utils')

from pytorch.mem_transformer import *
from pytorch import data_utils

from experiment_utils.run_experiment import *
from experiment_utils.generate_data import *

## Variables

In [2]:
from sklearn.model_selection import ParameterGrid

# --- Experiment identity ----------------------------------------------------
# TAG is embedded in the log-file name and in every saved model name.
TAG = '10tkn_len24_ext'
TASK_NAME = 'copy'

# --- Dataset split sizes and number of repeated initialisations -------------
TRAIN_SIZE, VAL_SIZE, TEST_SIZE = 100_000, 2_000, 10_000
NUM_INITS = 1

# --- Training schedule ------------------------------------------------------
NUM_BATCHES = 400_000
BATCH_SIZE = 128
GENERATE_EVERY = 10_000

# --- Vocabulary and sequence lengths ----------------------------------------
# presumably 10 data symbols plus 2 special tokens — TODO confirm
NUM_TOKENS = 10 + 2
ENC_SEQ_LEN = 24
DEC_SEQ_LEN = 48

# Selects the data directory (f'data{INPUT_LEN}') read by the loaders.
INPUT_LEN = 24

#### Generate data

In [3]:
# np.random.seed(42)

# generator = copy_generator(batch_size=BATCH_SIZE, enc_seq_len=ENC_SEQ_LEN, dec_seq_len=DEC_SEQ_LEN, num_tokens=NUM_TOKENS)
# generate_data(generator, path=f'data{INPUT_LEN}', task_name=TASK_NAME, train_size=TRAIN_SIZE, test_size=TEST_SIZE, val_size=VAL_SIZE, batch_size=BATCH_SIZE)  

In [4]:
class data_loader:
    """Endless mini-batch reader over ``.npy`` arrays stored on disk.

    ``{path}/{task_name}_X.npy`` and ``{path}/{task_name}_y.npy`` are loaded
    once; every ``next()`` call returns the next ``batch_size`` rows and
    advances a read pointer modulo the number of rows. The object never raises
    ``StopIteration``, so a plain ``for`` loop over it does not terminate on
    its own.

    Parameters
    ----------
    task_name : str
        File-name prefix of the arrays to load.
    path : str
        Directory that contains the arrays.
    batch_size : int
        Number of rows requested per batch.
    none_mask : bool
        If True, no mask file is read and the source mask of every batch is
        ``None``. If False, ``{path}/{task_name}_mask.npy`` is loaded and
        sliced alongside the data.
    """

    def __init__(self, task_name, path='data', batch_size=32, none_mask=True):
        self.X = np.load(f'{path}/{task_name}_X.npy')
        self.y = np.load(f'{path}/{task_name}_y.npy')
        self.data_size = self.X.shape[0]
        self.data_ptr = 0

        self.batch_size = batch_size
        self.none_mask = none_mask

        # Define every mask attribute on both code paths so that the object
        # has the same attributes regardless of `none_mask`.
        self.src_mask = None    # kept for callers that read the old name
        self.src_masks = None
        self.tgt_mask = None
        if not none_mask:
            self.src_masks = np.load(f'{path}/{task_name}_mask.npy')

    def __iter__(self):
        """Return ``self`` so the loader satisfies the iterator protocol."""
        return self

    def __next__(self):
        """Return the next ``(X, y, src_mask, tgt_mask)`` batch.

        ``X`` and ``y`` are returned as tensors built with ``torch.tensor``
        and are not moved to any device here. The source mask, when present,
        is moved to CUDA; ``tgt_mask`` is whatever ``__init__`` stored
        (``None``).
        """
        start = self.data_ptr
        stop = start + self.batch_size

        # Slicing past the end is truncated, so a batch that reaches the end
        # of the data can contain fewer than `batch_size` rows.
        X = self.X[start:stop]
        y = self.y[start:stop]

        if self.none_mask:
            sm = None
        else:
            sm = torch.tensor(self.src_masks[start:stop]).cuda()

        # Wrap the pointer modulo the dataset size; it does not reset to 0
        # unless `stop` is an exact multiple of `data_size`.
        self.data_ptr = stop % self.data_size

        return torch.tensor(X), torch.tensor(y), sm, self.tgt_mask

### Run

In [23]:
# Hyper-parameter grid for the memory-token model. Only the first grid point
# is instantiated here, for interactive inspection on the CPU.
model_parameters = ParameterGrid({
    'n_layer': [2],
    'n_head': [4],
    'd_head': [128],
    'num_mem_tokens': [9, 0],
    'mem_len': [0],
})
param = list(model_parameters)[0]

# Settings that stay the same for every grid point. d_model and d_inner both
# follow d_head (an earlier variant also added num_mem_tokens - 1 to d_model).
fixed_parameters = dict(
    n_token=NUM_TOKENS,
    d_model=param['d_head'],
    d_inner=param['d_head'],
    dropout=0,
    dropatt=0,
    tie_weight=True,
    div_val=1,  # NOTE(review): original author was unsure about this value
    tie_projs=[False],
    tgt_len=DEC_SEQ_LEN,
    ext_len=0,
    cutoffs=[],
    attn_type=0,
)

# The model is left on the CPU (no .cuda() call).
model = MemTransformerLM(**param, **fixed_parameters)

# Pull a single training batch; the two mask entries are discarded.
gen_train = data_loader(
    path=f'data{INPUT_LEN}',
    task_name=f'{TASK_NAME}_train',
    batch_size=BATCH_SIZE,
)
src, tgt, _, _ = next(gen_train)
src, tgt = src.cpu(), tgt.cpu()

# Empty memory tuple; intended for a call of the form model(src, tgt, *mems).
mems = ()


In [54]:
# Scratch setup for stepping through the model's forward pass by hand in the
# cells that follow.
# rep = src[0].repeat(16, 1)
# hidden, new_mems = model._forward(rep)

# No cached memory: with mems set to None the next cell computes mlen = 0.
mems = None
# dec_inp = src[0].repeat(21, 1).T
# Take the first 21 rows of the batch and transpose them.
# assumes src is (batch, seq), making dec_inp (seq, batch) — TODO confirm
dec_inp = src[:21].T
# Alias so that code written against `self.` can run at notebook top level.
self = model


In [56]:

# Embed the input tokens with the model's word-embedding module.
word_emb = self.word_emb(dec_inp)

# Length of the cached memory (first dimension of mems[0]); 0 when there is none.
mlen = mems[0].size(0) if mems is not None else 0

# Concat with mem_tokens
# Skipped entirely when the model was built with num_mem_tokens of 0 or None.
if self.num_mem_tokens not in (0, None):
    # memory = self.mem_tokens.repeat(1, dec_inp.shape[0], 1).clone()
    # Reshape to (num_mem_tokens, 1, -1) and tile along dim 1 to match
    # dec_inp.shape[1]; presumably that is the batch dimension — TODO confirm.
    memory = self.mem_tokens.reshape(self.num_mem_tokens, 1, -1).repeat(1, dec_inp.shape[1], 1)
    # Prepend the memory tokens along dim 0, so they come before the word embeddings.
    word_emb = torch.cat((memory, word_emb), dim=0)


In [57]:

# Hand-unrolled forward pass over `word_emb`: build the attention mask, run
# every layer for the configured attention type, and collect the per-layer
# hidden states in `hids` for the memory update at the end.

# qlen, bsz = dec_inp.size()
# The query length is read from word_emb (not dec_inp) so that any memory
# tokens prepended to word_emb are counted as well.
qlen = word_emb.shape[0]
klen = mlen + qlen

# The mask is created as a boolean tensor: masked_fill expects a bool mask and
# uint8 masks are deprecated in PyTorch.
if self.same_length:
    all_ones = word_emb.new_ones(qlen, klen)
    mask_len = klen - self.mem_len
    if mask_len > 0:
        mask_shift_len = qlen - mask_len
    else:
        mask_shift_len = qlen
    # Upper triangle above diagonal 1+mlen plus lower triangle below
    # diagonal -mask_shift_len.
    dec_attn_mask = (torch.triu(all_ones, 1+mlen)
            + torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None] # -1
else:
    # Upper triangle above diagonal 1+mlen only.
    dec_attn_mask = torch.triu(
        word_emb.new_ones(qlen, klen), diagonal=1+mlen).bool()[:,:,None]

# hids[0] is the (dropped-out) layer input; one entry per layer follows.
hids = []
if self.attn_type == 0: # default
    # Descending positions klen-1 ... 0, optionally clamped to clamp_len.
    pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device,
                            dtype=word_emb.dtype)
    if self.clamp_len > 0:
        pos_seq.clamp_(max=self.clamp_len)
    pos_emb = self.pos_emb(pos_seq)

    core_out = self.drop(word_emb)
    pos_emb = self.drop(pos_emb)

    hids.append(core_out)
    for i, layer in enumerate(self.layers):
        mems_i = None if mems is None else mems[i]
        core_out = layer(core_out, pos_emb, self.r_w_bias,
                self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
        hids.append(core_out)
elif self.attn_type == 1: # learnable
    core_out = self.drop(word_emb)
    hids.append(core_out)
    for i, layer in enumerate(self.layers):
        # Per-layer embeddings and biases, truncated to the last clamp_len
        # entries when clamping is enabled.
        if self.clamp_len > 0:
            r_emb = self.r_emb[i][-self.clamp_len :]
            r_bias = self.r_bias[i][-self.clamp_len :]
        else:
            r_emb, r_bias = self.r_emb[i], self.r_bias[i]

        mems_i = None if mems is None else mems[i]
        core_out = layer(core_out, r_emb, self.r_w_bias[i],
                r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
        hids.append(core_out)
elif self.attn_type == 2: # absolute
    pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
                            dtype=word_emb.dtype)
    if self.clamp_len > 0:
        pos_seq.clamp_(max=self.clamp_len)
    pos_emb = self.pos_emb(pos_seq)

    # Positional embeddings for the last qlen positions are added to the input.
    core_out = self.drop(word_emb + pos_emb[-qlen:])

    hids.append(core_out)
    for i, layer in enumerate(self.layers):
        mems_i = None if mems is None else mems[i]
        if mems_i is not None and i == 0:
            # In-place: this also modifies the tensor held in `mems`.
            mems_i += pos_emb[:mlen]
        core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
                            mems=mems_i)
        hids.append(core_out)
elif self.attn_type == 3:
    core_out = self.drop(word_emb)

    hids.append(core_out)
    for i, layer in enumerate(self.layers):
        mems_i = None if mems is None else mems[i]
        if mems_i is not None and mlen > 0:
            # Embeddings for the memory part: everything before the last qlen
            # entries, left-padded with the first entry or cropped to mlen.
            cur_emb = self.r_emb[i][:-qlen]
            cur_size = cur_emb.size(0)
            if cur_size < mlen:
                cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1)
                cur_emb = torch.cat([cur_emb_pad, cur_emb], 0)
            else:
                cur_emb = cur_emb[-mlen:]
            # In-place: this also modifies the tensor held in `mems`.
            mems_i += cur_emb.view(mlen, 1, -1)
        # The last qlen per-layer embeddings are added to the layer input in place.
        core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)

        core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
                            mems=mems_i)
        hids.append(core_out)

core_out = self.drop(core_out)

# Let the model derive the next memory from the collected hidden states.
new_mems = self._update_mems(hids, mems, mlen, qlen)

  attn_score = attn_score.float().masked_fill(


In [58]:
pos_seq.shape

torch.Size([33])

In [59]:
dec_inp.shape

torch.Size([24, 21])

In [60]:
pos_emb.shape

torch.Size([33, 1, 128])

In [61]:
word_emb.shape

torch.Size([33, 21, 128])

In [62]:
core_out.shape, core_out.isnan().sum()

(torch.Size([33, 21, 128]), tensor(0))

In [63]:
# model.layers[0](hids[0],pos_emb,  self.r_w_bias,
#                 self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)

In [64]:
new_mems

In [65]:
mlen, qlen, klen

(0, 33, 33)

In [66]:
pos_emb.shape, pos_seq.shape

(torch.Size([33, 1, 128]), torch.Size([33]))

In [68]:
core_out.shape, core_out

(torch.Size([33, 21, 128]),
 tensor([[[-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          ...,
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878]],
 
         [[-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          ...,
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],
          [-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878]],
 
         [[-0.7845,  0.5875, -1.3231,  ..., -1.7092, -1.0594, -0.4878],


In [72]:
core_out[0][0] - core_out[1][0]

tensor([ 1.1921e-07,  1.7881e-07, -1.1921e-07, -3.5763e-07,  1.0431e-07,
        -1.7881e-07,  0.0000e+00,  5.9605e-08, -2.3842e-07, -2.3842e-07,
         1.7136e-07, -3.5763e-07,  2.3842e-07,  8.1956e-08,  0.0000e+00,
        -5.9605e-08,  2.3842e-07,  5.9605e-08, -2.9802e-08, -2.9802e-07,
        -2.9802e-07,  1.1921e-07, -1.1921e-07,  2.9802e-08,  1.8626e-08,
         6.7055e-08, -2.3842e-07,  1.0431e-07, -2.3842e-07,  0.0000e+00,
         1.7881e-07,  1.3411e-07, -8.1956e-08, -2.9802e-08, -1.7881e-07,
        -1.1921e-07, -2.9802e-08, -2.9802e-07,  2.9802e-08,  0.0000e+00,
         1.1921e-07,  2.3842e-07, -2.3842e-07, -5.9605e-08,  1.1921e-07,
        -1.6391e-07,  2.3842e-07, -3.5763e-07,  2.3842e-07, -2.9802e-08,
         8.9407e-08,  2.9802e-08,  8.9407e-08,  1.1921e-07, -1.3411e-07,
        -8.9407e-08,  0.0000e+00, -5.9605e-08, -4.7684e-07,  0.0000e+00,
        -1.1921e-07, -2.3842e-07,  1.4901e-07,  0.0000e+00, -2.3842e-07,
        -2.3842e-07, -1.1921e-07,  1.1921e-07,  2.9

In [50]:
pos_emb

tensor([[[ 0.5514,  0.5341, -0.9070,  ...,  1.0000,  1.0000,  1.0000]],

        [[-0.4040,  0.9900, -0.9507,  ...,  1.0000,  1.0000,  1.0000]],

        [[-0.9880,  0.7488, -0.4844,  ...,  1.0000,  1.0000,  1.0000]],

        ...,

        [[ 0.9093,  0.9870,  0.9975,  ...,  1.0000,  1.0000,  1.0000]],

        [[ 0.8415,  0.7617,  0.6816,  ...,  1.0000,  1.0000,  1.0000]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  1.0000,  1.0000,  1.0000]]])

In [49]:
core_out[:, 0] - core_out[:, 1]

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SubBackward0>)

In [54]:
# NOTE(review): in the visible cells `hidden` is only assigned inside
# commented-out code, so this cell presumably raises NameError on a fresh
# kernel — confirm where `hidden` is meant to come from.
hidden[0][0] - hidden[0][8]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SubBackward0>)

In [47]:
hidden[0]

tensor([[ 0.6850, -0.5999, -0.9625,  ...,  0.8854, -0.2791,  0.3821],
        [ 0.6850, -0.5999, -0.9625,  ...,  0.8854, -0.2791,  0.3821],
        [ 0.6850, -0.5999, -0.9625,  ...,  0.8854, -0.2791,  0.3821],
        ...,
        [-0.6816,  0.3168,  2.0647,  ..., -0.2411,  0.9336, -0.9197],
        [-1.6048, -1.4270, -0.2114,  ...,  0.3779,  0.5680,  1.2500],
        [-0.8432, -0.6461, -0.9086,  ..., -0.2625, -0.3066, -1.0324]],
       grad_fn=<SelectBackward>)

In [18]:
class TransformerXL(nn.Module):
    """Encoder/decoder pair built from two ``MemTransformerLM`` instances.

    NOTE(review): unfinished. ``forward`` only calls the encoder, never uses
    ``tgt`` or the decoder, and has no ``return`` statement, so it returns
    ``None`` as written.
    """
    def __init__(self, enc_kwargs, dec_kwargs):
        """Build both sub-models from keyword-argument dicts.

        Args:
            enc_kwargs: keyword arguments passed to ``MemTransformerLM`` for
                the encoder.
            dec_kwargs: keyword arguments passed to ``MemTransformerLM`` for
                the decoder.
        """
        super().__init__()
        
        self.Encoder = MemTransformerLM(**enc_kwargs)
        self.Decoder = MemTransformerLM(**dec_kwargs)

        # Keep the construction arguments available on the instance.
        self.enc_kwargs = enc_kwargs
        self.dec_kwargs = dec_kwargs

    def forward(self, src, tgt, mems=None):
        """Run the encoder on ``src``; ``tgt`` is currently unused.

        The incoming ``mems`` argument is overwritten by the encoder's second
        return value, and nothing is returned.
        """
        # NOTE(review): assumes the encoder can be called with `src` alone and
        # returns a 2-tuple — TODO confirm against MemTransformerLM.forward.
        hidden, mems = self.Encoder(src, )


In [14]:
src.device

device(type='cpu')

In [15]:
model.mem_tokens.shape, model.mem_tokens.repeat(2, 1, 1).shape

(torch.Size([24, 128]), torch.Size([2, 24, 128]))

In [16]:
src.shape, tgt.shape

(torch.Size([128, 24]), torch.Size([128, 49]))

In [10]:
model.layers

ModuleList(
  (0): RelPartialLearnableDecoderLayer(
    (dec_attn): RelPartialLearnableMultiHeadAttn(
      (qkv_net): Linear(in_features=128, out_features=1536, bias=False)
      (drop): Dropout(p=0, inplace=False)
      (dropatt): Dropout(p=0, inplace=False)
      (o_net): Linear(in_features=512, out_features=128, bias=False)
      (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (r_net): Linear(in_features=128, out_features=512, bias=False)
    )
    (pos_ff): PositionwiseFF(
      (CoreNet): Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): ReLU(inplace=True)
        (2): Dropout(p=0, inplace=False)
        (3): Linear(in_features=128, out_features=128, bias=True)
        (4): Dropout(p=0, inplace=False)
      )
      (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
  )
  (1): RelPartialLearnableDecoderLayer(
    (dec_attn): RelPartialLearnableMultiHeadAttn(
      (qkv_net): Linear(in_feature

In [88]:
# NOTE(review): the saved output of this cell is
# "AttributeError: 'tuple' object has no attribute 'size'"; the failing frame
# is not part of the saved output, so the cause cannot be determined from here.
model(src, tgt, )

AttributeError: 'tuple' object has no attribute 'size'

In [1]:

# NOTE(review): the saved run of this cell (execution count 1) raised
# NameError because `src` did not exist yet; the cell that draws a batch from
# gen_train has to run first.
src.shape, src.repeat((1, 1, 2)).shape

NameError: name 'src' is not defined

In [10]:
next(model.parameters()).device

device(type='cuda', index=0)

In [10]:
# Step size handed to the Adam optimizer in the training loop.
LEARNING_RATE = 7e-4

# Grid of model configurations to train. 'depth,heads' bundles two values in
# one entry; the training loop unpacks it into separate encoder/decoder keys.
model_parameters = ParameterGrid({
    # shared width and embedding/loss options
    'dim': [128],
    'tie_token_embeds': [True],
    'return_tgt_loss': [True],
    # encoder
    'enc_num_tokens': [NUM_TOKENS],
    'depth,heads': [(2, 4)],
    'enc_max_seq_len': [24],
    # decoder
    'dec_num_tokens': [NUM_TOKENS],
    'dec_max_seq_len': [DEC_SEQ_LEN],
    # number of encoder memory tokens: the only value that varies
    'enc_num_memory_tokens': [2, 8, 0],
})

# One training run per grid point and initialisation.
print('Total runs: ', NUM_INITS * len(model_parameters))

Total runs:  3


In [None]:
# Train, validate and test one model per (initialisation, grid point) pair,
# appending progress and timing information to a shared log file.

# Validation and test loaders use the full split size as their batch size, so
# each call to next() requests the whole split at once.
gen_train = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_train', batch_size=BATCH_SIZE)
gen_val = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_val', batch_size=VAL_SIZE)
gen_test = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_test', batch_size=TEST_SIZE)


print_file = f'logs/{TASK_NAME}_{TAG}_memory_logs.txt'
t = time.time()
with torch.cuda.device(0):
    for init_num in range(NUM_INITS):
        with open(print_file, 'a') as f:
            f.write('\n\nInit number ' + str(init_num)+'\n')
        # list(...) builds fresh parameter dicts on every pass, so the
        # in-place edits below do not leak into the next initialisation.
        for i, param in enumerate(list(model_parameters)):
            with open(print_file, 'a') as f:
                # Log the grid point as configured (before the derived keys
                # are added), followed by the progress percentage.
                f.write('\n\n' + str(param)+'\n')
                f.write(f'{i / len(model_parameters) * 100}%')

            # Split the combined 'depth,heads' entry into the separate
            # encoder/decoder keys and drop the combined one.
            param['enc_depth'], param['enc_heads'] = param['depth,heads']
            param['dec_depth'], param['dec_heads'] = param['depth,heads']
            param.pop('depth,heads')

            # NOTE(review): XTransformer is not defined in the visible cells;
            # presumably it comes from one of the star imports — confirm.
            model = XTransformer(**param).cuda()

            model_name = f"{TASK_NAME}{INPUT_LEN}_dim{param['dim']}d{param['enc_depth']}h{param['enc_heads']}M{param['enc_num_memory_tokens']}l{param['enc_max_seq_len']}_{TAG}_v{init_num}"

            optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            train_validate_model(model,
                            train_generator=gen_train,
                            val_generator=gen_val,
                            optim=optim,
                            model_name=model_name,
                            config=param,
                            num_batches=NUM_BATCHES,
                            generate_every=GENERATE_EVERY,
                            print_file=print_file,
                            tag=TAG,
                            overfit_stop=False)
            test_model(model, gen_test, model_name, param, TASK_NAME, tag=TAG)

            # Record the wall-clock time of this run and restart the timer.
            with open(print_file, 'a') as f:
                f.write(f'\nTotal time: {time.time() - t}\n')
            t = time.time()

mkdir: 