In [1]:
import torch
import math
import os


# Report which device is available before any model is built.
print(
    f"GPU: {torch.cuda.get_device_name(0)} is available."
    if torch.cuda.is_available()
    else "No GPU available. Training will run on CPU."
)

# Verbosity flag; not referenced elsewhere in the visible part of this notebook.
verbose = False

GPU: NVIDIA RTX A6000 is available.


## MeMo Tokenizer and input

In [2]:
from MeMoPyTorch.modelling_memo_tokenizer import MeMoTokenizer

In [3]:
# Length (in tokens) of every window produced by the tokenizer in this section.
max_length = 12
# Left padding/truncation; the printed windows below are padded on the left.
tokenizer = MeMoTokenizer.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    truncation_side="left",
    padding_side="left",
    max_length=max_length,
    head_number=4,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPTNeoXTokenizer'. 
The class this function is called from is 'MeMoTokenizer'.


Setting pad token and pad token id = <|endoftext|>, 0


In [4]:
# Read the sample text. The encoding is explicit because the text contains
# non-ASCII characters and the platform default is not UTF-8 everywhere.
with open("testo_di_prova.txt", encoding="utf-8") as my_first_text_f:
    my_first_text = my_first_text_f.read()

token_ids = tokenizer.encode(my_first_text)  # , return_tensors='pt')
# The printed result is a pair of tensors offset by one token, so together
# they span max_length + 1 tokens.
print(token_ids)

(tensor([[18886,   256, 36144,  4164,  1809,    80,  1448,   295,   532,  1584,
            13, 50190]]), tensor([[  256, 36144,  4164,  1809,    80,  1448,   295,   532,  1584,    13,
         50190,    15]]))


In [5]:
# Encode a batch of two texts: the full sample and its first ten characters,
# then show the returned fields and the shape of the input windows.
memo_input = tokenizer.get_text_batch_encoding(
    [my_first_text, my_first_text[:10]]
)
memo_input.keys(), memo_input["input_ids"].shape

(dict_keys(['input_ids', 'labels']), torch.Size([52, 12]))

In [6]:
# Print the first three windows as text: the input row, then its label row.
for i in range(3):
    for field in ('input_ids', 'labels'):
        print(tokenizer.decode(memo_input[field][i]))
    print()

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Cosimo di
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Cosimo di Giovanni

 de' Medici detto il Vecchio o Pater
' Medici detto il Vecchio o Pater patri

æ (Firenze, 27 settembre 1389
 (Firenze, 27 settembre 1389 –



## MeMo Embedding layer

In [7]:
from MeMoPyTorch.modelling_memo_embedding import MeMoEmbedding

In [8]:
d = 1024  # embedding dimension (passed as embedding_dim / inner_dim below)
h = 4     # number of heads
l = 3     # number of layers

In [9]:
# Embedding table with one d-dimensional row per tokenizer vocabulary entry;
# the pad token is registered as the padding index.
embedding = MeMoEmbedding(
    num_embeddings=tokenizer.vocab_size,
    embedding_dim=d,
    padding_idx=tokenizer.pad_token_id, # 0 for this tokenizer (see the setup message above)
    _freeze=True
)

MeMo embedding initilialization


In [10]:
# Tokenize two short texts; the printed ids show both rows left-padded with 0.
input_tokens_ids = tokenizer(['Test', 'Un altro Test'])['input_ids']
print(input_tokens_ids)

# Call the module itself rather than .forward(), so that any registered
# hooks run as PyTorch intends.
input_embeddings = embedding(input_tokens_ids)
input_embeddings

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         5089],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0, 2447, 6945,  287,
         6004]])


tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0023,  0.0027, -0.0391,  ..., -0.0083, -0.0081, -0.0272]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [-0.0065, -0.0010,  0.0132,  ...,  0.0424, -0.0111,  0.0262],
         [ 0.0285, -0.0263,  0.0528,  ..., -0.0215,  0.0131,  0.0006],
         [ 0.0702,  0.0040,  0.0162,  ...,  0.0326, -0.0240, -0.0420]]])

In [11]:
# Same batch as before, but the second text is characters 10..29 of the sample.
memo_input = tokenizer.get_text_batch_encoding(
    [my_first_text, my_first_text[10:30]]
)

memo_input["input_ids"].shape

torch.Size([52, 12])

In [12]:
memo_input['input_ids'][0:10]

tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0, 38577,
         17622,  1073],
        [  372,     8,  9718,    74,   843,   936,  4164, 43876, 41380,   258,
           367,   727],
        [ 5507,   313, 15723,   445,  2721,    13,  3435,  3414,   358,  3381,
         15410,    26],
        [ 9776,  1266,    74,    13,   337, 11703,   639, 39337,  1638,  1540,
            10, 12187],
        [  440,  2314,  4173,   299,  8913,  2942,   250,   352,  6770,    80,
            13,  2248],
        [  861,   410,   372, 32924,  1073, 33813,   445,  2721,   299,  2248,
            80,  1484],
        [ 1073,   659,  4611,  1073,   391,   300,   466,  5711, 14804,  1431,
           304, 19702],
        [   74,    15, 14929,  1327,  1323, 10081, 24843, 15438,   412, 16406,
         38055,  9821],
        [ 3737,  1073,   391,   300,   466,  5711, 39814,   260,   770,  5991,
           313,  1962],
        [18006, 22217, 42722, 10863,   262,  7958,  1593, 12704,  5940,  

In [13]:
# Embed the input windows and their label windows with the same table.
input_embeddings, output_symbols = (
    embedding.encode(memo_input[field]) for field in ('input_ids', 'labels')
)

input_embeddings.shape, output_symbols.shape

(torch.Size([52, 12, 1024]), torch.Size([52, 12, 1024]))

In [14]:
# Map the embeddings back to token ids; decode() returns a pair and only the
# first element (the ids) is kept here.
decoded, _ = embedding.decode(input_embeddings)
print(decoded.shape)

# First ten windows, to compare by eye with memo_input['input_ids'][0:10].
decoded[0:10]

torch.Size([52, 12])


tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0, 38577,
         17622,  1073],
        [  372,     8,  9718,    74,   843,   936,  4164, 43876, 41380,   258,
           367,   727],
        [ 5507,   313, 15723,   445,  2721,    13,  3435,  3414,   358,  3381,
         15410,    26],
        [ 9776,  1266,    74,    13,   337, 11703,   639, 39337,  1638,  1540,
            10, 12187],
        [  440,  2314,  4173,   299,  8913,  2942,   250,   352,  6770,    80,
            13,  2248],
        [  861,   410,   372, 32924,  1073, 33813,   445,  2721,   299,  2248,
            80,  1484],
        [ 1073,   659,  4611,  1073,   391,   300,   466,  5711, 14804,  1431,
           304, 19702],
        [   74,    15, 14929,  1327,  1323, 10081, 24843, 15438,   412, 16406,
         38055,  9821],
        [ 3737,  1073,   391,   300,   466,  5711, 39814,   260,   770,  5991,
           313,  1962],
        [18006, 22217, 42722, 10863,   262,  7958,  1593, 12704,  5940,  

In [15]:
decoded[0:10] == memo_input['input_ids'][:10]

tensor([[True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True]])

In [16]:
# Gram matrix of the embedding table: entry (i, j) is the dot product of the
# embeddings of tokens i and j.
sims = embedding.weight @ embedding.weight.T
display(sims)
# Row/column 0 is skipped in both sums (it is all zeros in the display above).
diag_sum = sims[1:, 1:].diag().sum()  # almost 1 in each entry
print(diag_sum)  # obs vs expected
print(sims[1:, 1:].sum() - diag_sum)  # almost 0... more or less

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.9983, -0.0050,  ..., -0.0302, -0.0083, -0.0071],
        [ 0.0000, -0.0050,  0.9900,  ...,  0.0180,  0.0399,  0.0036],
        ...,
        [ 0.0000, -0.0302,  0.0180,  ...,  0.9862,  0.0286, -0.0203],
        [ 0.0000, -0.0083,  0.0399,  ...,  0.0286,  1.0189, -0.0050],
        [ 0.0000, -0.0071,  0.0036,  ..., -0.0203, -0.0050,  1.0564]])

tensor(50248.9219)
tensor(-1119.1719)


## Test layer and MeMo CMM

In [17]:
from MeMoPyTorch.modelling_memo_layer import MeMoLayer, ProjectionSequence, ProjectionTokens, CorrelationMatrixMemory

### Check initialization of each matrix

In [18]:
d, h, l = 1024, 4, 3
proj = ProjectionSequence(d, d * h)
print(proj.weight.shape, proj.extra_repr())
# Diagonals of both Gram matrices of the projection weight.
torch.diag(proj.weight.T @ proj.weight), torch.diag(proj.weight @ proj.weight.T)

torch.Size([4096, 1024]) (trasposed wrt saved one) in_features=4096, out_features=1024


(tensor([1.0001, 0.9734, 1.0113,  ..., 1.0054, 1.0096, 1.0599],
        grad_fn=<DiagonalBackward0_copy>),
 tensor([0.2441, 0.2397, 0.2492,  ..., 0.2716, 0.2557, 0.2687],
        grad_fn=<DiagonalBackward0_copy>))

In [19]:
# Hand-built reference: Gaussian (d, d*h) matrix with std 1/sqrt(d*h),
# transposed, to compare against the ProjectionSequence diagonals above.
Prj = torch.normal(0, 1 / math.sqrt(d * h), size=(d, d * h)).transpose(0, 1)
torch.diag(Prj.T @ Prj), torch.diag(Prj @ Prj.T)

(tensor([1.0071, 1.0193, 0.9939,  ..., 1.0096, 0.9539, 0.9837]),
 tensor([0.2454, 0.2464, 0.2486,  ..., 0.2683, 0.2605, 0.2546]))

In [20]:
print(d, h, sep="\n")

# Per-head dimension.
d_k = d // h

W_v = ProjectionTokens(d, d_k)
print(W_v.weight.shape, W_v.extra_repr())

### always used transposed! so check with .T
torch.diag(W_v.weight.T @ W_v.weight), torch.diag(W_v.weight @ W_v.weight.T)

1024
4
torch.Size([256, 1024]) in_features=1024, out_features=256


(tensor([0.9130, 1.1256, 0.8835,  ..., 1.0044, 0.9630, 1.0328],
        grad_fn=<DiagonalBackward0_copy>),
 tensor([3.9768, 4.1737, 4.2684, 4.1132, 4.0868, 4.2425, 4.2230, 3.8434, 4.0362,
         4.0547, 4.0841, 4.1058, 3.8918, 4.0659, 3.9973, 3.9419, 3.9983, 4.0927,
         4.3244, 3.9605, 3.6646, 4.1196, 3.8822, 3.8914, 3.8621, 3.8363, 4.0533,
         3.8372, 3.9975, 4.1140, 3.9401, 4.1465, 3.9762, 4.3454, 3.9064, 4.2744,
         3.7955, 3.8557, 3.9269, 4.0063, 3.9984, 4.1154, 4.1589, 3.9601, 3.6307,
         4.0627, 3.7175, 4.1304, 3.7287, 4.1450, 4.0616, 4.3163, 4.1937, 3.9228,
         4.3139, 3.6658, 4.0031, 3.9427, 4.0584, 3.8599, 4.1947, 4.1495, 3.8413,
         4.2497, 3.9633, 3.5159, 3.7868, 4.0483, 4.1109, 4.2331, 4.2501, 3.8947,
         3.8700, 4.1005, 4.0520, 3.9067, 3.8875, 3.9716, 4.1152, 4.0778, 3.7914,
         4.0581, 4.0259, 3.6324, 4.0859, 3.7057, 4.0759, 4.3153, 3.7562, 4.2928,
         3.8849, 4.1671, 4.0208, 4.0959, 4.4941, 4.1431, 4.0250, 3.8631, 4.1924,
  

In [21]:
# Hand-built reference: Gaussian (d, d_k) matrix with std 1/sqrt(d_k).
W_v_single_head = torch.normal(0, 1 / math.sqrt(d_k), size=(d, d_k))

(
    torch.diag(W_v_single_head.T @ W_v_single_head),
    torch.diag(W_v_single_head @ W_v_single_head.T),
)

(tensor([3.8353, 3.9763, 4.3228, 3.8681, 3.8375, 4.1265, 3.8689, 3.8987, 4.2349,
         3.8796, 3.9763, 3.6021, 3.7336, 3.6950, 3.9160, 4.1376, 4.0095, 4.1417,
         3.9849, 4.2199, 3.8680, 3.6905, 3.9728, 4.2266, 4.3437, 4.0601, 3.8447,
         3.8382, 4.0107, 3.7765, 3.9026, 3.9907, 3.7311, 4.1689, 4.1451, 3.8639,
         3.8962, 4.3349, 3.9581, 3.9291, 3.9250, 3.9108, 3.7227, 3.8750, 4.0860,
         3.9869, 4.1752, 4.0651, 3.8667, 4.1384, 4.0423, 4.1828, 3.7566, 4.0609,
         3.9843, 4.1172, 4.0428, 4.3638, 4.0453, 4.1944, 3.9861, 3.9967, 3.5535,
         3.8496, 3.6559, 4.0801, 4.0982, 4.0738, 4.3134, 3.9767, 4.0875, 4.1507,
         3.6378, 4.0853, 3.9793, 4.2400, 4.0486, 4.2980, 3.8818, 4.0489, 4.1768,
         3.7635, 4.0278, 3.7948, 3.8342, 4.1616, 4.3265, 3.9240, 3.7758, 4.0395,
         3.8655, 3.8947, 4.0042, 3.8414, 3.9757, 3.8218, 3.8125, 4.1606, 3.9218,
         4.0527, 3.9117, 4.0999, 3.9146, 3.8490, 4.1715, 3.9676, 4.0928, 3.6956,
         3.9751, 4.1215, 4.2

### Check memorization on single layer

In [22]:
d,h,l = 1024, 4, 3

layer = MeMoLayer(d, h)
layer

MeMoLayer(
  (W_v_single_head): ProjectionTokens(in_features=1024, out_features=256)
  (Prj): ProjectionSequence((trasposed wrt saved one) in_features=4096, out_features=1024)
  (CMM): CorrelationMatrixMemory(in_features=1024, out_features=1024)
)

In [23]:
# Unpack (batch, tokens per window, embedding dim) from the embedded inputs.
# Note that this rebinds the notebook-level `d`.
batch_size, current_length, d = input_embeddings.shape
batch_size, current_length, d 

(52, 12, 1024)

In [24]:
output_symbols.shape

torch.Size([52, 12, 1024])

In [25]:
# Number of h-token blocks in each window.
current_length = input_embeddings.shape[1] // h

# Group every window into blocks of h consecutive token embeddings.
input_sequence = input_embeddings.reshape((batch_size, current_length, h, d))

# Target of each block: the label at the block's last position.
current_output_symbols = output_symbols[:, [(x + 1) * h - 1 for x in range(current_length)]]

# Sanity check: block j must equal tokens h*j .. h*(j+1)-1 of the flat window.
# The slice uses h rather than a literal 4 so the check follows the head count.
j = 2
print((input_sequence[0][j] == input_embeddings[0][h * j:h * (j + 1)]).sum(), input_sequence[0][j].shape)

# Rebinds batch_size, h and d from the reshaped tensor.
(batch_size, blocks, h, d) = input_sequence.shape

tensor(4096) torch.Size([4, 1024])


In [26]:
input_sequence.shape, current_output_symbols.shape

(torch.Size([52, 3, 4, 1024]), torch.Size([52, 3, 1024]))

In [27]:
# Fresh layer, then store the (block -> target symbol) associations in it.
layer = MeMoLayer(d, h)
display(layer)
## update the input sequence for the next layer
# memorize() returns a pair; only the second element (the sequence encoding) is used.
_, seq_encoding_for_the_last_layer = layer.memorize(input_sequence, current_output_symbols, is_last=False)
# NOTE(review): presumably writes the encoding into the layer's memory — confirm in MeMoLayer.
layer.directly_memorize(seq_encoding_for_the_last_layer)

MeMoLayer(
  (W_v_single_head): ProjectionTokens(in_features=1024, out_features=256)
  (Prj): ProjectionSequence((trasposed wrt saved one) in_features=4096, out_features=1024)
  (CMM): CorrelationMatrixMemory(in_features=1024, out_features=1024)
)

In [28]:
input_sequence.shape, input_sequence[3].shape # batch (52 elements of chunks 4*4*1024)

(torch.Size([52, 3, 4, 1024]), torch.Size([3, 4, 1024]))

In [29]:
# Run retrieval on the whole batch; only the second returned element is kept.
_, seq_encoding_for_the_last_layer = layer.retrieve(input_sequence)

# One encoding vector per window (see the printed shape).
print(seq_encoding_for_the_last_layer.shape)

torch.Size([52, 1024])


In [30]:
logits = layer.directly_retrieve(seq_encoding_for_the_last_layer)

In [31]:
# Decode the retrieved vectors: the first element holds token ids; the second
# holds one score per row (its grad_fn in the output shows it comes from a max).
retreived_output_symbol_vector, m = embedding.decode(logits)
print(retreived_output_symbol_vector, m)

tensor([48505, 20110,  1108, 48019,    80, 19216,  9718,  1113,  4927,    66,
        19216,  1448,  2122, 41530,   187,  4172,   246,   659, 10986, 30975,
           80, 12931,   352, 14134,  2721,   258,  8830,    87,   826,    15,
        17532,   729, 26798, 41070,  6575,   299,   266,  3737, 20889,   287,
          512,   354,   250,   247,    70,   275, 16128,  2680,    74, 13679,
           15,   209]) tensor([0.7027, 1.0707, 0.9809, 1.0174, 0.8847, 1.1506, 1.0516, 0.9499, 1.0354,
        1.7528, 1.2026, 1.0802, 0.9133, 1.0406, 0.9373, 0.9744, 0.9165, 0.9665,
        1.0506, 1.0039, 0.9117, 0.9218, 0.8659, 0.9423, 1.2641, 0.9583, 1.0097,
        1.1661, 1.1091, 0.9234, 0.8966, 1.0579, 1.1814, 0.9961, 0.9950, 0.9661,
        1.0283, 1.1281, 0.8819, 0.9572, 0.9551, 1.0385, 0.9571, 1.0440, 1.0158,
        1.0891, 1.0594, 0.9931, 1.0877, 0.9315, 0.9413, 0.9924],
       grad_fn=<MaxBackward0>)


In [32]:
# Expected ids: decode the target embedding of the last block of every window.
o = embedding.decode(current_output_symbols[:, -1])[0]
display(o)

# Number of windows whose retrieved id matches the expected one.
n_matches = (o == retreived_output_symbol_vector).sum()
print(n_matches, 'over', retreived_output_symbol_vector.shape)

tensor([48505, 20110,  1108, 48019,    80, 19216,  9718,  1113,  4927,    66,
        19216,  1448,  2122, 41530,   187,  4172,   246,   659, 10986, 30975,
           80, 12931,   352, 14134,  2721,   258,  8830,    87,   826,    15,
        17532,   729, 26798, 41070,  6575,   299,   266,  3737, 20889,   287,
          512,   354,   250,   247,    70,   275, 16128,  2680,    74, 13679,
           15,   209])

tensor(52) over torch.Size([52])


In [33]:
#### test single block

In [34]:
print(input_sequence[3].shape, current_output_symbols[3].shape)
print(input_sequence[3][0].shape, current_output_symbols[3][0].shape)

torch.Size([3, 4, 1024]) torch.Size([3, 1024])
torch.Size([4, 1024]) torch.Size([1024])


In [35]:
# Retrieve every block of every window on its own and count exact matches.
total = 0
correct = 0

for batch_index in range(len(memo_input['input_ids'])):
    for i in range(len(current_output_symbols[batch_index])):
        # Expected token id for this block.
        true = embedding.decode(current_output_symbols[batch_index][i])[0].item()

        # Add the batch and block axes that retrieve() is given elsewhere.
        single_block = input_sequence[batch_index][i][None, None]
        _, seq_encoding_for_the_last_layer = layer.retrieve(single_block)

        retrieved = layer.directly_retrieve(seq_encoding_for_the_last_layer)
        retreived_output_symbol_vector, m = embedding.decode(retrieved)
        pred = retreived_output_symbol_vector.item()

        total += 1
        correct += int(pred == true)

print(f"{correct}/{total}")

153/156


### Check with batch size of 1 and output probs

In [36]:
# One short text that yields a single window (see the printed ids).
memo_input = tokenizer.get_text_batch_encoding(['this is a test for a very short short sequence of 12 tokens'])
input_ids, labels = memo_input['input_ids'], memo_input['labels']
print(input_ids, labels)

input_embeddings = embedding.encode(input_ids)
output_embeddings = embedding.encode(labels)

# Number of h-token blocks in the window.
current_length = max_length // h
input_sequence = input_embeddings.reshape((1, current_length, h, d))

# Target of each block: the label at positions h-1, 2h-1, ...
block_end_positions = list(range(h - 1, current_length * h, h))
output_symbols = output_embeddings[:, block_end_positions]
print(embedding.decode(output_symbols))
input_sequence.shape, output_symbols.shape

tensor([[2520,  310,  247, 1071,  323,  247, 1077, 2159, 2159, 3425,  273, 1249]]) tensor([[  310,   247,  1071,   323,   247,  1077,  2159,  2159,  3425,   273,
          1249, 21761]])
(tensor([[  323,  2159, 21761]]), tensor([[0.9773, 1.0502, 1.0602]]))


(torch.Size([1, 3, 4, 1024]), torch.Size([1, 3, 1024]))

In [37]:
embedding.decode(input_sequence)[0], embedding.decode(output_symbols)[0], input_ids

(tensor([[[2520,  310,  247, 1071],
          [ 323,  247, 1077, 2159],
          [2159, 3425,  273, 1249]]]),
 tensor([[  323,  2159, 21761]]),
 tensor([[2520,  310,  247, 1071,  323,  247, 1077, 2159, 2159, 3425,  273, 1249]]))

In [38]:
input_sequence.shape

torch.Size([1, 3, 4, 1024])

In [39]:
output_symbols.shape

torch.Size([1, 3, 1024])

In [40]:
layer = MeMoLayer(d, h)
display(layer)

# Store the three (block -> target symbol) pairs of the single window.
_, seq_encoding_for_the_last_layer = layer.memorize(input_sequence, output_symbols, is_last=False)
layer.directly_memorize(seq_encoding_for_the_last_layer)

# Query each block on its own and print the retrieved id with its score.
for i in range(3):
    single_block = input_sequence[0][i][None, None]
    _, seq_encoding_for_the_last_layer = layer.retrieve(single_block)
    print(seq_encoding_for_the_last_layer.shape)

    retrieved = layer.directly_retrieve(seq_encoding_for_the_last_layer)
    retreived_output_symbol_vector, m = embedding.decode(retrieved)
    print(retreived_output_symbol_vector, m)

MeMoLayer(
  (W_v_single_head): ProjectionTokens(in_features=1024, out_features=256)
  (Prj): ProjectionSequence((trasposed wrt saved one) in_features=4096, out_features=1024)
  (CMM): CorrelationMatrixMemory(in_features=1024, out_features=1024)
)

torch.Size([1, 1024])
tensor([323]) tensor([0.9535], grad_fn=<MaxBackward0>)
torch.Size([1, 1024])
tensor([2159]) tensor([1.1330], grad_fn=<MaxBackward0>)
torch.Size([1, 1024])
tensor([21761]) tensor([1.0683], grad_fn=<MaxBackward0>)


## Test the entire MeMo model

In [41]:
from MeMoPyTorch.modelling_memo import MeMo

In [42]:
from MeMoPyTorch.modelling_memo_tokenizer import MeMoTokenizer

In [43]:
# Re-read the sample text. The encoding is explicit because the text contains
# non-ASCII characters and the platform default is not UTF-8 everywhere.
with open("testo_di_prova.txt", encoding="utf-8") as my_first_text_f:
    my_first_text = my_first_text_f.read()

In [44]:
# Longer windows for the full-model test; the head count is taken from h.
max_length = 384
print(max_length, h)
tokenizer = MeMoTokenizer.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    padding_side="left",
    truncation_side="left",
    max_length=max_length,
    head_number=h,
)

384 4


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPTNeoXTokenizer'. 
The class this function is called from is 'MeMoTokenizer'.


Setting pad token and pad token id = <|endoftext|>, 0


In [45]:
# NOTE(review): a bare string is passed here, whereas other cells pass a list
# of texts — confirm that get_text_batch_encoding supports both.
memo_input = tokenizer.get_text_batch_encoding(my_first_text)
memo_input['labels'].shape

torch.Size([2, 384])

In [46]:
# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [47]:
d,h,l

(1024, 4, 3)

In [48]:
# Build the full model from the shared hyper-parameters. The layer count comes
# from `l`, like every other MeMo(...) call in this notebook, not a literal.
model = MeMo(inner_dim=d,
             num_of_heads=h,
             num_of_layers=l,
             chunk_length=max_length,
             num_embeddings=tokenizer.vocab_size,
             padding_idx=tokenizer.pad_token_id,
             device=device) #MeMoModel
model

MeMo embedding initilialization


MeMo(
  (encoder): MeMoEmbedding(50254, 1024, padding_idx=0)
  (layers): ModuleList(
    (0-2): 3 x MeMoLayer(
      (W_v_single_head): ProjectionTokens(in_features=1024, out_features=256)
      (Prj): ProjectionSequence((trasposed wrt saved one) in_features=4096, out_features=1024)
      (CMM): CorrelationMatrixMemory(in_features=1024, out_features=1024)
    )
  )
)

In [49]:
# Embed inputs and labels with the model's own encoder.
input_sequence, output_symbols = (
    model.encoder.encode(memo_input[field]) for field in ('input_ids', 'labels')
)

(batch_size, current_length, d) = input_sequence.shape
last_layer = model.layers[model.l - 1]

# From here on the window length is the model's chunk length.
current_length = model.chunk_length

input_sequence.shape, output_symbols.shape, current_length

(torch.Size([2, 384, 1024]), torch.Size([2, 384, 1024]), 384)

In [50]:
layer_level = 0
# For every end position i: the positions in [i - h**(level+1), i), taken
# with stride h**level, that feed one block at this level.
input_index = []
for i in range(model.h ** (layer_level + 1), current_length + 1):
    first = i - model.h ** (layer_level + 1)
    stride = model.h ** ((layer_level + 1) - 1)
    input_index.append(list(range(first, i, stride)))
input_index[:5]

[[0, 1, 2, 3], [1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6], [4, 5, 6, 7]]

In [51]:
# Keep untouched copies so later slicing can be checked against them.
original_input_seq = input_sequence.clone()
original_out = output_symbols.clone()

original_input_seq.shape, original_out.shape

(torch.Size([2, 384, 1024]), torch.Size([2, 384, 1024]))

In [52]:
#for layer_level in range(model.l):
layer_level = 0 #1,2

window = model.h ** (layer_level + 1)   # positions spanned by one block
stride = model.h ** layer_level         # distance between the block's inputs

print(window < current_length + 1)

# Slice from the untouched clones rather than from the variables being
# reassigned, so that re-running this cell gives the same result.
layer_output_idxs = [i - stride for i in range(window, current_length + 1)]
output_symbols = original_out[:, layer_output_idxs]
print(output_symbols.shape)

input_index = [list(range(i - window, i, stride))
               for i in range(window, current_length + 1)]

input_sequence = original_input_seq[:, input_index]
print(input_sequence.shape)

True
torch.Size([2, 381, 1024])
torch.Size([2, 381, 4, 1024])


In [53]:
model.encoder.decode(output_symbols[0])[0] == model.encoder.decode(torch.stack([original_out[0][i] for i in layer_output_idxs]))[0]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [54]:
import tqdm


class Evaluation:
    """Score how often a MeMo model retrieves the correct next token.

    Both entry points slide a fixed-size context over a token matrix, ask the
    model for the token that follows each context, and return the fraction of
    exact matches.
    """

    def check_memorization(self, model, tokenizer, text, starting_point=None):
        """Tokenize ``text`` and measure next-token retrieval accuracy on it.

        Args:
            model: object exposing ``h``, ``l`` and ``retrieve(token_ids)``.
            tokenizer: callable tokenizer exposing ``pad`` and ``max_length``.
            text: the text (or texts) to tokenize and evaluate.
            starting_point: context size in tokens; defaults to
                ``model.h ** model.l`` when ``None``.

        Returns:
            ``correct / count`` as produced by the shared scoring loop.
        """
        basic_block = self._basic_block(model, starting_point)

        # Evaluate the text passed by the caller, not a notebook-level global.
        encoded = tokenizer(text, padding='longest', truncation='do_not_truncate', max_length=None)
        encoded = tokenizer.pad(encoded, pad_to_multiple_of=basic_block)

        return self._next_token_accuracy(model, tokenizer, encoded['input_ids'], basic_block)

    def check_pretokenized(self, model, tokenizer, input_ids, starting_point=None):
        """Measure next-token retrieval accuracy on already tokenized input.

        Args:
            model: object exposing ``h``, ``l`` and ``retrieve(token_ids)``.
            tokenizer: only its ``max_length`` attribute is read.
            input_ids: 2-D tensor of token ids, one row per sequence.
            starting_point: context size in tokens; defaults to
                ``model.h ** model.l`` when ``None``.

        Returns:
            ``correct / count`` as produced by the shared scoring loop.
        """
        basic_block = self._basic_block(model, starting_point)
        return self._next_token_accuracy(model, tokenizer, input_ids, basic_block)

    @staticmethod
    def _basic_block(model, starting_point):
        """Return the context size: ``starting_point`` or ``h ** l`` when unset."""
        return model.h ** model.l if starting_point is None else starting_point

    @staticmethod
    def _next_token_accuracy(model, tokenizer, input_ids, basic_block):
        """Slide a ``basic_block``-token context over ``input_ids`` and score retrieval."""
        max_length = tokenizer.max_length
        (batch_size, number_of_tokens) = input_ids.shape

        # Every context has exactly basic_block tokens, so the left padding that
        # brings it to max_length - 1 columns is the same on every iteration.
        # Zeros are used as padding ids, as in the tokenizer setup of this notebook.
        left_pad = torch.zeros((batch_size, max_length - 1 - basic_block), dtype=torch.int)

        count = 0
        correct = 0
        for i in tqdm.tqdm(range(basic_block, number_of_tokens - 1)):
            context = input_ids[:, i - basic_block:i]
            out, _ = model.retrieve(torch.concat((left_pad, context), dim=1))

            count += batch_size
            # Compare on the CPU: input_ids stays where the caller put it.
            correct += torch.sum(out.to('cpu') == input_ids[:, i])

        return correct / count
        

In [55]:
# Fresh model with the shared hyper-parameters.
model = MeMo(
    inner_dim=d,
    num_of_heads=h,
    num_of_layers=l,
    chunk_length=max_length,
    num_embeddings=tokenizer.vocab_size,
    padding_idx=tokenizer.pad_token_id,
    device=device,
)

# The same text repeated eight times in one batch.
memo_input = tokenizer.get_text_batch_encoding([my_first_text] * 8)

memo_input["input_ids"].shape

MeMo embedding initilialization


torch.Size([16, 384])

In [56]:
model.memorize_text(memo_input)

In [57]:
# Fraction of next tokens retrieved correctly after memorization.
e = Evaluation()
out = e.check_pretokenized(model, tokenizer, memo_input['input_ids'])
# Interpolate the score into the message; passing it as a second print()
# argument leaves the "%f" placeholder unformatted.
print(f"Degree of memorization: {float(out):f}")

100%|██████████| 319/319 [00:00<00:00, 737.06it/s]

Degree of memorization: %f  tensor(0.8072)





In [58]:
model.forget_text(memo_input)

In [59]:
# Same measurement after forget_text(), for comparison with the score above.
out = e.check_pretokenized(model, tokenizer, memo_input['input_ids'])
# Interpolate the score into the message; passing it as a second print()
# argument leaves the "%f" placeholder unformatted.
print(f"Degree of memorization: {float(out):f}")

100%|██████████| 319/319 [00:00<00:00, 764.43it/s]

Degree of memorization: %f  tensor(0.0815)





In [60]:
# Fresh model: the displayed CMM weights are all zero before learning.
model = MeMo(inner_dim=d,
             num_of_heads=h,
             num_of_layers=l,
             chunk_length=max_length,
             num_embeddings=tokenizer.vocab_size,
             padding_idx=tokenizer.pad_token_id,
             device=device)
print("CMM pre learning")
display(model.layers[0].CMM.weight)


# Memorize the same text bs times, one call per iteration.
bs = 8
for b in range(bs):
    memo_input = tokenizer.get_text_batch_encoding(my_first_text)
    print(memo_input['input_ids'].shape)

    model.memorize_text(memo_input)

# Inspect the first layer's projection and memory on the CPU.
Prj = model.layers[0].Prj.weight.detach().cpu()
CMM = model.layers[0].CMM.weight.detach().cpu()

display(Prj.T @ Prj)
display(CMM)

e = Evaluation()
out = e.check_pretokenized(model, tokenizer, memo_input['input_ids'])
# Interpolate the score; a second print() argument leaves "%f" unformatted.
print(f"Degree of memorization: {float(out):f}")


# NOTE(review): the text was memorized bs times but is forgotten only once.
# The recorded output shows the same score before and after this call —
# confirm whether forget_text must be repeated bs times, as the next cell does.
model.forget_text(memo_input)

out = e.check_pretokenized(model, tokenizer, memo_input['input_ids'])
print(f"Degree of memorization: {float(out):f}")

MeMo embedding initilialization
CMM pre learning


Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', requires_grad=True)

torch.Size([2, 384])
torch.Size([2, 384])
torch.Size([2, 384])
torch.Size([2, 384])
torch.Size([2, 384])
torch.Size([2, 384])
torch.Size([2, 384])
torch.Size([2, 384])


tensor([[ 1.0101e+00,  1.2280e-02,  1.7227e-02,  ..., -1.2366e-02,
          1.4448e-02, -1.8096e-02],
        [ 1.2280e-02,  9.7455e-01,  1.1633e-02,  ...,  5.7634e-03,
          3.4262e-02,  2.4340e-02],
        [ 1.7227e-02,  1.1633e-02,  9.8092e-01,  ...,  2.9026e-02,
          2.8689e-03, -4.5811e-03],
        ...,
        [-1.2366e-02,  5.7634e-03,  2.9026e-02,  ...,  9.8428e-01,
          9.1292e-04,  1.4481e-02],
        [ 1.4448e-02,  3.4262e-02,  2.8689e-03,  ...,  9.1292e-04,
          9.8084e-01,  5.4475e-03],
        [-1.8096e-02,  2.4340e-02, -4.5811e-03,  ...,  1.4481e-02,
          5.4475e-03,  1.0217e+00]])

tensor([[ 1.7443e-02, -1.2456e-02, -5.1137e-03,  ...,  1.0043e-02,
          1.6821e-02, -1.0690e-05],
        [ 2.0084e-02,  2.4817e-02,  2.6959e-02,  ...,  9.1169e-03,
         -2.1050e-02, -8.9389e-03],
        [-4.9506e-03,  6.9840e-03, -2.0625e-03,  ..., -8.0438e-03,
          3.0667e-03,  8.5841e-03],
        ...,
        [ 4.6615e-03,  7.4334e-03,  1.6063e-02,  ..., -2.8373e-02,
         -7.0026e-03,  1.4033e-02],
        [-7.4981e-03,  1.0286e-02,  1.4566e-02,  ...,  2.8840e-03,
         -7.8137e-03,  2.1140e-02],
        [-5.8409e-03,  2.4287e-03,  1.1041e-02,  ...,  2.5658e-02,
         -6.4533e-03,  3.8758e-03]])

100%|██████████| 319/319 [00:00<00:00, 796.28it/s]


Degree of memorization: %f  tensor(0.9514)


100%|██████████| 319/319 [00:00<00:00, 743.09it/s]

Degree of memorization: %f  tensor(0.9514)





In [63]:
# Fresh model: the displayed CMM weights are all zero before learning.
model = MeMo(inner_dim=d,
             num_of_heads=h,
             num_of_layers=l,
             chunk_length=max_length,
             num_embeddings=tokenizer.vocab_size,
             padding_idx=tokenizer.pad_token_id,
             device=device)
print("CMM pre learning")
display(model.layers[0].CMM.weight)


# Memorize the same text bs times.
bs = 8
for b in range(bs):
    print(f"memorizing the same text iteration = {b}")
    memo_input = tokenizer.get_text_batch_encoding(my_first_text)
    model.memorize_text(memo_input)

# Inspect the first layer's projection and memory on the CPU.
Prj = model.layers[0].Prj.weight.detach().cpu()
CMM = model.layers[0].CMM.weight.detach().cpu()

display(Prj.T @ Prj)
display(CMM)

e = Evaluation()
out = e.check_pretokenized(model, tokenizer, memo_input['input_ids'])
# Interpolate the score; a second print() argument leaves "%f" unformatted.
print(f"Degree of memorization after memorizing: {float(out):f}")

# Forget the text as many times as it was memorized.
for b in range(bs):
    print(f"forgetting the same text iteration = {b}")
    memo_input = tokenizer.get_text_batch_encoding(my_first_text)
    model.forget_text(memo_input)

out = e.check_pretokenized(model, tokenizer, memo_input['input_ids'])
print(f"Degree of memorization after forgetting: {float(out):f}")

MeMo embedding initilialization
CMM pre learning


Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', requires_grad=True)

memorizing the same text iteration = 0
memorizing the same text iteration = 1
memorizing the same text iteration = 2
memorizing the same text iteration = 3
memorizing the same text iteration = 4
memorizing the same text iteration = 5
memorizing the same text iteration = 6
memorizing the same text iteration = 7


tensor([[ 1.0114e+00, -7.7352e-03,  3.1193e-02,  ...,  9.8616e-03,
         -5.1539e-03,  8.5160e-03],
        [-7.7352e-03,  1.0026e+00, -6.3145e-03,  ...,  1.2942e-02,
          6.2130e-03, -3.2290e-03],
        [ 3.1193e-02, -6.3145e-03,  9.9515e-01,  ...,  3.7192e-03,
          8.0864e-04, -3.9914e-02],
        ...,
        [ 9.8616e-03,  1.2942e-02,  3.7192e-03,  ...,  9.9843e-01,
         -1.8007e-02,  1.0587e-02],
        [-5.1539e-03,  6.2130e-03,  8.0864e-04,  ..., -1.8007e-02,
          9.6098e-01,  9.6622e-03],
        [ 8.5160e-03, -3.2290e-03, -3.9914e-02,  ...,  1.0587e-02,
          9.6622e-03,  9.9557e-01]])

tensor([[-0.0053, -0.0290, -0.0211,  ..., -0.0158,  0.0136,  0.0058],
        [-0.0051,  0.0091,  0.0132,  ...,  0.0086, -0.0140, -0.0399],
        [-0.0129, -0.0180, -0.0300,  ...,  0.0116,  0.0044,  0.0201],
        ...,
        [ 0.0466,  0.0165, -0.0146,  ..., -0.0134,  0.0170,  0.0188],
        [ 0.0155,  0.0005, -0.0193,  ..., -0.0363,  0.0022,  0.0193],
        [ 0.0015,  0.0067, -0.0050,  ..., -0.0334, -0.0207,  0.0009]])

100%|██████████| 319/319 [00:00<00:00, 622.56it/s]


Degree of memorization after memorizing: %f  tensor(0.9420)
forgetting the same text iteration = 0
forgetting the same text iteration = 1
forgetting the same text iteration = 2
forgetting the same text iteration = 3
forgetting the same text iteration = 4
forgetting the same text iteration = 5
forgetting the same text iteration = 6
forgetting the same text iteration = 7


100%|██████████| 319/319 [00:00<00:00, 494.24it/s]

Degree of memorization after forgetting: %f  tensor(0.0815)



