# MeMo HF
Version integrated with the Hugging Face Transformers library (version 0.4)

In [1]:
import torch
from MeMoHF.modelling_memo_tokenizer import MeMoTokenizer
from MeMoHF.modelling_memo_configuration import MeMoConfig
from MeMoHF.modelling_memo import MeMoForCausalLM
from MeMoHF.evaluating_memo import Evaluation

In [2]:
# Small max_length for the tokenizer demo cells; padding and truncation
# are both applied on the left side.
max_length = 12
tokenizer = MeMoTokenizer.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    padding_side='left',
    truncation_side='left',
    max_length=max_length,
    head_number=4,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPTNeoXTokenizer'. 
The class this function is called from is 'MeMoTokenizer'.


Setting pad token and pad token id = <|endoftext|>, 0


In [3]:
# Read the sample document. The encoding is pinned because the text contains
# non-ASCII characters and the platform default encoding is not UTF-8 everywhere.
with open("testo_di_prova.txt", encoding="utf-8") as first_text_file:
    my_first_text = first_text_file.read()

# encode() yields a pair of tensors (see the printed output); the second one
# looks like the first shifted by one token, so max_length + 1 tokens are
# covered overall -- TODO confirm against MeMoTokenizer.encode.
token_ids = tokenizer.encode(my_first_text)
print(token_ids)

(tensor([[18886,   256, 36144,  4164,  1809,    80,  1448,   295,   532,  1584,
            13, 50190]]), tensor([[  256, 36144,  4164,  1809,    80,  1448,   295,   532,  1584,    13,
         50190,    15]]))


In [4]:
# Encode a batch of two texts: the full document and its first 10 characters.
demo_texts = [my_first_text, my_first_text[:10]]
memo_input = tokenizer.get_text_batch_encoding(demo_texts)

# Show which fields the encoding exposes and the shape of the input ids.
memo_input.keys(), memo_input["input_ids"].shape

(dict_keys(['input_ids', 'labels']), torch.Size([52, 12]))

In [5]:
# Decode the first three input rows together with their label rows,
# leaving a blank line after each pair.
for i in range(3):
    decoded_input = tokenizer.decode(memo_input['input_ids'][i])
    print(decoded_input)
    decoded_label = tokenizer.decode(memo_input['labels'][i])
    print(decoded_label)
    print()

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Cosimo di
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Cosimo di Giovanni

 de' Medici detto il Vecchio o Pater
' Medici detto il Vecchio o Pater patri

æ (Firenze, 27 settembre 1389
 (Firenze, 27 settembre 1389 –



Check memorization on a single layer

In [6]:
# Encode the full document plus a 20-character slice of it (characters 10..29).
short_slice = my_first_text[10:30]
memo_input = tokenizer.get_text_batch_encoding([my_first_text, short_slice])

memo_input["input_ids"].shape

torch.Size([52, 12])

In [7]:
from MeMoHF.modelling_memo_embedding import MeMoEmbedding

In [8]:
# d: inner dimension, h: number of heads, l: number of layers
d = 1024
h = 4
l = 3

In [9]:
# Frozen embedding table with one row per tokenizer vocabulary entry and
# d columns; the tokenizer's pad token id serves as padding_idx.
embedding = MeMoEmbedding(
    embedding_dim=d,
    num_embeddings=tokenizer.vocab_size,
    padding_idx=tokenizer.pad_token_id,
    _freeze=True,
)

MeMo embedding initilialization


In [10]:
# Run both the input ids and the labels through the same embedding table.
input_embeddings, output_symbols = (
    embedding.encode(memo_input[field]) for field in ('input_ids', 'labels')
)

# Display the two resulting shapes side by side.
input_embeddings.shape, output_symbols.shape

(torch.Size([52, 12, 1024]), torch.Size([52, 12, 1024]))

In [11]:
# Tokenize two short strings and keep only the token ids.
tokenized_samples = tokenizer(['Test', 'Un altro Test'])
input_tokens_ids = tokenized_samples['input_ids']
print(input_tokens_ids)

# Embed the ids; note that this rebinds input_embeddings from the previous cell.
input_embeddings = embedding.forward(input_tokens_ids)
input_embeddings

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         5089],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0, 2447, 6945,  287,
         6004]])


tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0130,  0.0025, -0.0328,  ..., -0.0017, -0.0307,  0.0098]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [-0.0214, -0.0625,  0.0228,  ...,  0.0210,  0.0681, -0.0045],
         [-0.0002, -0.0390,  0.0355,  ...,  0.0277, -0.0276,  0.0295],
         [ 0.0354, -0.0010,  0.0389,  ..., -0.0259, -0.0141, -0.0456]]])

In [12]:
from MeMoHF.modelling_memo_layer import MeMoLayer

In [13]:
# Build one MeMoLayer from d and h (set in an earlier cell) and display its repr.
layer = MeMoLayer(d, h)
layer

MeMoLayer(
  (W_v_single_head): ProjectionTokens(in_features=1024, out_features=256)
  (Prj): ProjectionSequence((trasposed wrt saved one) in_features=4096, out_features=1024)
  (CMM): CorrelationMatrixMemory(in_features=1024, out_features=1024)
)

Memo: Initializing the Tokenizer and the model

In [14]:
# Meta parameters:
#    d - inner dimension
#    h - number of heads
#    l - number of layers
d, h, l = 1024, 4, 4
chunk_length = 1024

# Initializing a standard tokenizer whose max_length equals the chunk length
max_length = chunk_length
tokenizer = MeMoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b",
                                          padding_side='left', truncation_side='left',
                                          max_length=max_length, head_number=h)
# Reuse the end-of-sequence token, and its id, for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Initializing the MeMo configuration from the meta parameters and the
# tokenizer's vocabulary size and special token ids
config = MeMoConfig(vocab_size=tokenizer.vocab_size,
               hidden_size=d,
               num_hidden_layers=l,
               num_attention_heads=h,
               chunk_length=chunk_length,
               bos_token_id=tokenizer.bos_token_id,
               eos_token_id=tokenizer.eos_token_id,
               pad_token_id=tokenizer.pad_token_id,
              )

# Initializing the MeMo model from the configuration
model = MeMoForCausalLM(config)

# Move the model to the GPU when one is present; otherwise it stays where it was created.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
    model.to('cuda')


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPTNeoXTokenizer'. 
The class this function is called from is 'MeMoTokenizer'.


Setting pad token and pad token id = <|endoftext|>, 0
MeMo embedding initilialization
GPU: NVIDIA RTX A6000 is available.


Reading the two texts

In [15]:
# Read the two documents used by the memorization experiments. The encoding is
# pinned so that non-ASCII characters decode the same way on every platform.
with open("testo_di_prova.txt", encoding="utf-8") as first_text_file:
    my_first_text = first_text_file.read()
with open("testo_di_prova2.txt", encoding="utf-8") as second_text_file:
    my_second_text = second_text_file.read()



Memorizing the first text and evaluating if it is memorized

In [16]:
# Each document is repeated 8 times in its batch to stress memorization.
n_copies = 8
memo_input_1 = tokenizer.get_text_batch_encoding([my_first_text] * n_copies)
memo_input_2 = tokenizer.get_text_batch_encoding([my_second_text] * n_copies)

# Only the first text is memorized at this point.
model.memorize_text(memo_input_1)
e = Evaluation()

# Score both texts, first text first, with the same starting point.
e1, e2 = (
    e.check_pretokenized(model, tokenizer, encoded['input_ids'], starting_point=8)
    for encoded in (memo_input_1, memo_input_2)
)

print("Memorization level of first text  : ", e1)
print("Memorization level of second text : ", e2)

100%|██████████| 1015/1015 [00:01<00:00, 588.20it/s]
100%|██████████| 1015/1015 [00:02<00:00, 448.53it/s]

Memorization level of first text  :  tensor(0.7025)
Memorization level of second text :  tensor(0.0224)





Memorizing the second text and checking if it affected the memorization of the first text

In [17]:
# Memorize the second text as well.
model.memorize_text(memo_input_2)

# Re-score both texts, first text first, using the existing evaluator.
e1, e2 = [
    e.check_pretokenized(model, tokenizer, encoded['input_ids'], starting_point=8)
    for encoded in (memo_input_1, memo_input_2)
]

print("Memorization level of first text  : ", e1)
print("Memorization level of second text : ", e2)

100%|██████████| 1015/1015 [00:01<00:00, 597.13it/s]
100%|██████████| 1015/1015 [00:02<00:00, 432.90it/s]

Memorization level of first text  :  tensor(0.3589)
Memorization level of second text :  tensor(0.2348)





Forgetting the second document

In [18]:
# Forget the encoded second text (memo_input_2), i.e. the one memorized last.
model.forget_text(memo_input_2)

Checking the effect on the two texts

In [19]:
# Score the first and then the second text again after the forget step.
ids_first = memo_input_1['input_ids']
ids_second = memo_input_2['input_ids']
e1, e2 = (
    e.check_pretokenized(model, tokenizer, ids, starting_point=8)
    for ids in (ids_first, ids_second)
)

print("Memorization level of first text  : ", e1)
print("Memorization level of second text : ", e2)

100%|██████████| 1015/1015 [00:02<00:00, 499.64it/s]
100%|██████████| 1015/1015 [00:02<00:00, 433.55it/s]

Memorization level of first text  :  tensor(0.6979)
Memorization level of second text :  tensor(0.0208)





In [21]:
# Display the model's repr to inspect its module structure.
model

MeMoForCausalLM(
  (memo): MeMo(
    (encoder): MeMoEmbedding(50254, 1024, padding_idx=0)
    (layers): MeMoLayers(
      (0-3): 4 x MeMoLayer(
        (W_v_single_head): ProjectionTokens(in_features=1024, out_features=256)
        (Prj): ProjectionSequence((trasposed wrt saved one) in_features=4096, out_features=1024)
        (CMM): CorrelationMatrixMemory(in_features=1024, out_features=1024)
      )
    )
  )
  (lm_head): MeMoEmbedding(50254, 1024, padding_idx=0)
)

In [22]:
# Rebuild the configuration and create a new model for this experiment.
config = MeMoConfig(vocab_size=tokenizer.vocab_size,
               hidden_size=d,
               num_hidden_layers=l,
               num_attention_heads=h,
               chunk_length=chunk_length,
               bos_token_id=tokenizer.bos_token_id,
               eos_token_id=tokenizer.eos_token_id,
               pad_token_id=tokenizer.pad_token_id,
              )

# Initializing the MeMo model from the configuration
model = MeMoForCausalLM(config)
print("CMM pre learning")
display(model.memo.layers[0].CMM.weight)

# Memorize the same text `bs` times.
# NOTE(review): other cells pass a list of texts to get_text_batch_encoding;
# here a bare string is passed -- confirm the tokenizer treats both alike.
bs = 8
for b in range(bs):
    print(f"memorizing the same text iteration = {b}")
    memo_input = tokenizer.get_text_batch_encoding(my_first_text)
    model.memorize_text(memo_input)

# Take CPU copies of the first layer's projection and memory weights for inspection.
Prj = model.memo.layers[0].Prj.weight.detach().cpu()
CMM = model.memo.layers[0].CMM.weight.detach().cpu()

display(Prj.T @ Prj)
display(CMM)

e = Evaluation()
out = e.check_pretokenized(model, tokenizer, memo_input['input_ids'])
# The score is converted with float() so the fixed-point format is actually
# applied; passing it as a second print argument left "%f" in the output.
print(f"Degree of memorization after memorizing: {float(out):f}")

# Forget the same text `bs` times, mirroring the memorization loop.
for b in range(bs):
    print(f"forgetting the same text iteration = {b}")
    memo_input = tokenizer.get_text_batch_encoding(my_first_text)
    model.forget_text(memo_input)

out = e.check_pretokenized(model, tokenizer, memo_input['input_ids'])
print(f"Degree of memorization after forgetting: {float(out):f}")

MeMo embedding initilialization
CMM pre learning


Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)

memorizing the same text iteration = 0
memorizing the same text iteration = 1
memorizing the same text iteration = 2
memorizing the same text iteration = 3
memorizing the same text iteration = 4
memorizing the same text iteration = 5
memorizing the same text iteration = 6
memorizing the same text iteration = 7


tensor([[ 9.8685e-01,  2.3611e-03,  5.7158e-03,  ..., -2.0955e-02,
         -7.8387e-03, -3.0100e-02],
        [ 2.3611e-03,  1.0030e+00, -4.8353e-03,  ..., -6.8917e-03,
          3.6630e-03,  4.4900e-03],
        [ 5.7158e-03, -4.8353e-03,  1.0183e+00,  ...,  1.7557e-02,
         -5.1791e-05,  1.4415e-02],
        ...,
        [-2.0955e-02, -6.8917e-03,  1.7557e-02,  ...,  1.0093e+00,
          3.6133e-03, -1.3998e-02],
        [-7.8387e-03,  3.6630e-03, -5.1791e-05,  ...,  3.6133e-03,
          9.9020e-01, -2.5500e-02],
        [-3.0100e-02,  4.4900e-03,  1.4415e-02,  ..., -1.3998e-02,
         -2.5500e-02,  1.0135e+00]])

tensor([[-0.0238,  0.0085, -0.0166,  ...,  0.0067,  0.0305, -0.0151],
        [-0.0216,  0.0119, -0.0180,  ...,  0.0093,  0.0123,  0.0514],
        [-0.0219,  0.0268,  0.0105,  ..., -0.0112,  0.0089, -0.0074],
        ...,
        [ 0.0114, -0.0462,  0.0281,  ...,  0.0463, -0.0177, -0.0195],
        [-0.0349,  0.0236,  0.0352,  ...,  0.0020,  0.0098,  0.0043],
        [ 0.0151, -0.0149, -0.0279,  ..., -0.0244,  0.0027, -0.0331]])

100%|██████████| 767/767 [00:06<00:00, 122.05it/s]


Degree of memorization after memorizing: %f  tensor(0.9156)
forgetting the same text iteration = 0
forgetting the same text iteration = 1
forgetting the same text iteration = 2
forgetting the same text iteration = 3
forgetting the same text iteration = 4
forgetting the same text iteration = 5
forgetting the same text iteration = 6
forgetting the same text iteration = 7


100%|██████████| 767/767 [00:05<00:00, 144.28it/s]

Degree of memorization after forgetting: %f  tensor(0.)



