# MeMo HF
Version integrated with Transformer Libraries (Version 0.4)

In [1]:
import torch
from MeMoHF.modelling_memo_tokenizer import MeMoTokenizer
from MeMoHF.modelling_memo_configuration import MeMoConfig
from MeMoHF.modelling_memo import MeMoForCausalLM
from MeMoHF.evaluating_memo import Evaluation

MeMo: Initializing the tokenizer and the model

In [2]:
# Meta-parameters:
#    d - inner (hidden) dimension
#    h - number of attention heads
#    l - number of layers
d, h, l = 2048, 8, 3
chunk_length = 4096

# Initializing a standard tokenizer.
# The tokenizer's max_length is tied to the model's chunk_length; padding and
# truncation are both applied on the left.
max_length = chunk_length
tokenizer = MeMoTokenizer.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    padding_side='left',
    truncation_side='left',
    max_length=max_length,
    head_number=h,
)

# Reuse the end-of-sequence token as the padding token. The id is taken
# explicitly from eos_token_id: the previous statement assigned pad_token_id
# to itself, which was a no-op.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Initializing the MeMo configuration from the meta-parameters above and the
# tokenizer's vocabulary size and special-token ids.
config = MeMoConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=d,
    num_hidden_layers=l,
    num_attention_heads=h,
    chunk_length=chunk_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Initializing the MeMo model from the configuration
model = MeMoForCausalLM(config)

# Move the model to the GPU when one is available; otherwise it stays where
# the constructor placed it.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
    model.to('cuda')


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPTNeoXTokenizer'. 
The class this function is called from is 'MeMoTokenizer'.


Setting pad token and pad token id = <|endoftext|>, 0
MeMo embedding initilialization
GPU: NVIDIA RTX A6000 is available.


Reading the two texts

In [3]:
# Read the two test documents as UTF-8 text.
with open("testo_di_prova.txt", encoding='utf8') as my_first_text_f:
    my_first_text = my_first_text_f.read()
# The second file gets its own handle name; it previously reused
# `my_first_text_f`, which was misleading.
with open("testo_di_prova2.txt", encoding='utf8') as my_second_text_f:
    my_second_text = my_second_text_f.read()



Memorizing the first text and evaluating if it is memorized

In [4]:
# Number of copies of each document placed in one batch, to stress the
# memorization with batching. The code previously multiplied by 1 while its
# comments said 8; 8 also matches the recorded `input_ids` shape
# torch.Size([8, 1, 4096]) shown at the end of this notebook.
n_copies = 8
memo_input_1 = tokenizer.get_text_batch_encoding([my_first_text] * n_copies)
memo_input_2 = tokenizer.get_text_batch_encoding([my_second_text] * n_copies)

# Memorize only the first document, then score both documents so the second
# one acts as a not-yet-memorized baseline.
model.memorize_text(memo_input_1)
e = Evaluation()

# `[:, 0, :]` selects index 0 along the second axis of `input_ids`.
# NOTE(review): `starting_point=8` is passed through unchanged; its exact
# meaning is defined by Evaluation.check_pretokenized — confirm there.
e1 = e.check_pretokenized(model, tokenizer, memo_input_1['input_ids'][:, 0, :], starting_point=8)
e2 = e.check_pretokenized(model, tokenizer, memo_input_2['input_ids'][:, 0, :], starting_point=8)

print("Memorization level of first text  : ", e1)
print("Memorization level of second text : ", e2)

100%|██████████| 4087/4087 [00:22<00:00, 184.60it/s]
100%|██████████| 4087/4087 [00:22<00:00, 183.07it/s]

Memorization level of first text  :  tensor(0.9907)
Memorization level of second text :  tensor(0.0334)





Memorizing the second text and checking if it affected the memorization of the first text

In [5]:
# Add the second document to what the model has memorized.
model.memorize_text(memo_input_2)

# Score both documents again, first then second, with the same slice and
# starting_point used in the earlier evaluation.
e1, e2 = (
    e.check_pretokenized(model, tokenizer, encoding['input_ids'][:, 0, :], starting_point=8)
    for encoding in (memo_input_1, memo_input_2)
)

print("Memorization level of first text  : ", e1)
print("Memorization level of second text : ", e2)

100%|██████████| 4087/4087 [00:22<00:00, 181.28it/s]
100%|██████████| 4087/4087 [00:22<00:00, 179.51it/s]

Memorization level of first text  :  tensor(0.9722)
Memorization level of second text :  tensor(0.9785)





Forgetting the second document

In [6]:
# Ask the model to forget `memo_input_2`, i.e. the batch encoding that was
# built from `my_second_text`.
model.forget_text(memo_input_2)

Checking the effect on the two texts

In [7]:
# Re-run the memorization check on both encodings (first, then second) using
# the same slice of `input_ids` and the same starting_point as before.
e1, e2 = [
    e.check_pretokenized(model, tokenizer, encoding['input_ids'][:, 0, :], starting_point=8)
    for encoding in (memo_input_1, memo_input_2)
]

print("Memorization level of first text  : ", e1)
print("Memorization level of second text : ", e2)

100%|██████████| 4087/4087 [00:22<00:00, 179.19it/s]
100%|██████████| 4087/4087 [00:22<00:00, 178.36it/s]

Memorization level of first text  :  tensor(0.9907)
Memorization level of second text :  tensor(0.0350)





In [8]:
# Display the shape of the `input_ids` tensor in the first document's batch
# encoding (bare last expression, so the notebook renders it as cell output).
memo_input_1['input_ids'].shape

torch.Size([8, 1, 4096])