# Explore alternative tokenizers
Exploring possibilities of different NLP tokenizers for the BART model. 

In [5]:
import pandas as pd

In [17]:
# Read the 1K trial set; fields in the .smi file are separated by single spaces.
# NOTE(review): later cells access `df.smiles` — confirm the file header provides that column.
df = pd.read_csv("./data/trial_set/1K.smi", sep=" ")

In [18]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,0
0,Cc1nn(C)c(C)c1S(=O)(=O)N1CCN2C(=O)CN(C)C(=O)[C...


## WordPiece tokenizer
This tokenizer from the deepchem library appears to be better suited to inorganic molecules; its vocabulary contains many structures that are uncommon in our SMILES.

In [20]:
from deepchem.feat.smiles_tokenizer import SmilesTokenizer
import os

# Tokenize one sample from the data set with deepchem's SmilesTokenizer.
vocab_path = 'wp_tokenizer/vocab.txt'
tokenizer = SmilesTokenizer(vocab_path)
# Pick row 90 / first column with a single positional lookup. The previous
# `df.iloc[90][0]` indexed the row Series with an integer key, which relies on
# the positional fallback that pandas has deprecated.
print(tokenizer.tokenize(df.iloc[90, 0]))

Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up.
Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.


['C', 'O', 'c', '1', 'c', 'c', '2', 'c', 'c', 'c', '(', '=', 'O', ')', 'o', 'c', '2', 'c', '(', 'O', '[C@@H]', '2', 'O', '[C@@H]', '(', 'C', 'O', ')', '[C@H]', '(', 'O', ')', '[C@@H]', '(', 'O', ')', '[C@@H]', '2', 'O', ')', 'c', '1', 'O', '1', '6', '2', '1', '5', '1', '3', '2']


In [19]:
# Print the current working directory via a shell escape.
!pwd

/mnt/storage-brno6/home/ahajek/Spektro/MassGenie


## BBPE Tokenizer
Hopefully, this tokenizer will learn to tokenize the SMILES into longer sequences.

Let's train one.

In [18]:
# Create a SMILES-only text file (no ZINC metadata) for tokenizer training,
# one molecule per line.
with open("./tokenizer/training_data/1K.txt", "w") as f:
    for smiles in df.smiles:
        # Terminate every record; without the newline all SMILES strings are
        # glued together into a single line.
        f.write(smiles + "\n")

In [7]:
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from pathlib import Path
import glob

class BPE_token(object):
    """Wrapper bundling a byte-level BPE tokenizer with its training settings.

    The wrapped ``tokenizers.Tokenizer`` is exposed as ``self.tokenizer`` and is
    configured with NFKC normalization, a ByteLevel pre-tokenizer and a
    ByteLevel decoder.
    """

    # Special tokens handed to the trainer.
    # NOTE(review): "<ukn>" looks like a typo for "<unk>"; it is kept unchanged
    # because renaming it would alter the vocabulary of trained tokenizers.
    SPECIAL_TOKENS = [
        "<eos>",
        "<ukn>",
        "<pad>",
        "<bos>",
        "<neims>",
        "<nist>",
        "<rassp>",
        "<trafo>",
        "<source1>",
        "<source2>",
        "<source3>",
    ]

    def __init__(self, vocab_size=50257, min_frequency=10):
        """Build an untrained tokenizer.

        Args:
            vocab_size: value passed to the trainer as ``vocab_size``.
            min_frequency: value passed to the trainer as ``min_frequency``.
        """
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, path):
        """Train the wrapped tokenizer.

        Args:
            path: training input forwarded unchanged to ``Tokenizer.train``
                (this notebook passes a list of text-file paths).
        """
        trainer = BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            show_progress=True,
            initial_alphabet=ByteLevel.alphabet(),
            # Pass a copy so the class-level list cannot be mutated by the trainer.
            special_tokens=list(self.SPECIAL_TOKENS),
        )
        self.tokenizer.train(path, trainer)

    def save_tokenizer(self, location, prefix=None):
        """Save the trained BPE model files into ``location``.

        Args:
            location: target directory; created (including parents) if missing.
            prefix: optional prefix forwarded to ``model.save``.
        """
        # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
        os.makedirs(location, exist_ok=True)
        self.tokenizer.model.save(location, prefix)

In [2]:
from tokenizers import Tokenizer

# Train one byte-level BPE tokenizer per `min_frequency` threshold on the 1M corpus.
path = ["tokenizer/training_data/1M.txt"]
save_path = 'tokenizer/bbpe_tokenizer/'  # output folder, identical for every run

for min_frequency in [500, 1000, 2000, 3000, 4000, 5000, 6000]:
    tokenizer = BPE_token(min_frequency=min_frequency)

    # train the tokenizer model
    tokenizer.bpe_train(path)

    # Save the raw BPE model files. No prefix is given, so every pass writes the
    # same file names and only the last threshold's copies remain in the folder.
    tokenizer.save_tokenizer(save_path)
    # Save the complete tokenizer, one file per threshold.
    tokenizer.tokenizer.save(save_path + f"bart_bbpe_tokenizer_1M_mf{min_frequency}.model")




bart_bbpe_tokenizer_1M_mf500
- max vocab_size 50257
- min_frequency 500
- final vocab size 801 (including 11 special tokens)



bart_bbpe_tokenizer_1M_mf1000
- max vocab_size 50257
- min_frequency 1000
- final vocab size 801 (including 11 special tokens)



bart_bbpe_tokenizer_1M_mf2000
- max vocab_size 50257
- min_frequency 2000
- final vocab size 801 (including 11 special tokens)



bart_bbpe_tokenizer_1M_mf3000
- max vocab_size 50257
- min_frequency 3000
- final vocab size 801 (including 11 special tokens)



bart_bbpe_tokenizer_1M_mf4000
- max vocab_size 50257
- min_frequency 4000
- final vocab size 801 (including 11 special tokens)



bart_bbpe_tokenizer_1M_mf5000
- max vocab_size 50257
- min_frequency 5000
- final vocab size 801 (including 11 special tokens)



bart_bbpe_tokenizer_1M_mf6000
- max vocab_size 50257
- min_frequency 6000
- final vocab size 801 (including 11 special tokens)


In [4]:
from tokenizers import Tokenizer

# Reload every trained tokenizer and print a short summary of its vocabulary size.
save_path = 'tokenizer/bbpe_tokenizer/'
for min_frequency in (500, 1000, 2000, 3000, 4000, 5000, 6000):
    model_name = f"bart_bbpe_tokenizer_1M_mf{min_frequency}"
    tokenizer = Tokenizer.from_file(save_path + "/" + model_name + ".model")

    report_lines = [
        model_name,
        "- max vocab_size 50257",
        f"- min_frequency {min_frequency}",
        f"- final vocab size {len(tokenizer.get_vocab())} (including 11 special tokens)",
    ]
    print("\n".join(report_lines))

bart_bbpe_tokenizer_1M_mf500
- max vocab_size 50257
- min_frequency 500
- final vocab size 506 (including 11 special tokens)
bart_bbpe_tokenizer_1M_mf1000
- max vocab_size 50257
- min_frequency 1000
- final vocab size 454 (including 11 special tokens)
bart_bbpe_tokenizer_1M_mf2000
- max vocab_size 50257
- min_frequency 2000
- final vocab size 422 (including 11 special tokens)
bart_bbpe_tokenizer_1M_mf3000
- max vocab_size 50257
- min_frequency 3000
- final vocab size 401 (including 11 special tokens)
bart_bbpe_tokenizer_1M_mf4000
- max vocab_size 50257
- min_frequency 4000
- final vocab size 390 (including 11 special tokens)
bart_bbpe_tokenizer_1M_mf5000
- max vocab_size 50257
- min_frequency 5000
- final vocab size 376 (including 11 special tokens)
bart_bbpe_tokenizer_1M_mf6000
- max vocab_size 50257
- min_frequency 6000
- final vocab size 365 (including 11 special tokens)


542

### Try the BBPE out

In [19]:
from tokenizers import Tokenizer


# Load a serialized tokenizer from a single file.
# (Disabled alternative: point at the separate vocab / merges files instead.)
# vocab = "./tokenizer/bbpe_tokenizer/vocab.json"
# merges = "./tokenizer/bbpe_tokenizer//merges.txt"
tok = "./tokenizer/bbpe_tokenizer/bart_bbpe_1M_tokenizer.model"
tokenizer = Tokenizer.from_file(tok)
# Disabled experiment: registering extra special tokens after loading.
# special_tokens_dict = {"bos_token": "<bos>", "unk_token": "<unk>", "eos_token": "<eos>", "sep_token": "<sep>"}
# special_tokens_dict = ["<sep>"]


# num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Encode a sample SMILES; `encoded` is inspected in the following cells.
encoded = tokenizer.encode("Cc1nn(C)c(C)c1S(=O)(=O)N1CCN2C(=O)CN(C)C(=O)")


In [None]:
# Display the data frame as the cell output.
df

In [60]:
# Show the token strings the tokenizer produced for the encoded sample.
print(encoded.tokens)

['Ġ', 'CS', '(=', 'O', ')(=', 'O', ')', 'NCC', '(=', 'O', ')', 'N', '[', 'C', '@', 'H', ']', '1', 'COCC', '[', 'C', '@@', 'H', ']', '1', 'Oc', '1', 'ccc', '(', 'C', '(', 'N', ')=', 'O', ')', 'cc', '1']


In [None]:
# List all vocabulary tokens in sorted order (iterating the vocab mapping yields its keys).
sorted(tokenizer.get_vocab())

In [None]:
# Look up the ids of the begin/end-of-sequence special tokens.
bt, et = (tokenizer.token_to_id(marker) for marker in ("<bos>", "<eos>"))
# Prefix the first molecule's token ids with <bos>.
# Appending <eos> and padding up to a length of 200 was sketched here but left disabled.
tok_smiles = [bt, *tokenizer.encode(df.smiles[0]).ids]

In [41]:
# Print the first SMILES string, then display its token ids as the cell result.
print(df.smiles[0])
tokenizer.encode(df.smiles[0]).ids

Cc1nn(C)c(C)c1S(=O)(=O)N1CCN2C(=O)CN(C)C(=O)[C@H]2C1


[224,
 278,
 20,
 282,
 11,
 38,
 12,
 70,
 11,
 38,
 12,
 70,
 20,
 54,
 260,
 50,
 270,
 50,
 12,
 49,
 20,
 267,
 21,
 38,
 260,
 50,
 12,
 266,
 11,
 38,
 12,
 38,
 260,
 50,
 263,
 38,
 35,
 43,
 64,
 21,
 38,
 20]

In [64]:
# Display the prepared training split stored as a pickle.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
pd.read_pickle("data/trial_set/1K_bbpe_bart_prepared_data_train.pkl")

Unnamed: 0,destereo_smiles,input_ids,decoder_input_ids,encoder_attention_mask,decoder_attention_mask,labels,position_ids
629,COCCN1C(=O)C(=O)N(C1=O)CC(=O)c1c(N)n(C)c(=O)n(...,"[15, 28, 29, 30, 31, 32, 33, 39, 40, 41, 42, 4...","[3, 224, 325, 20, 38, 260, 50, 12, 38, 260, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 325, 20, 38, 260, 50, 12, 38, 260, 50...","[3, 1, 2, 5, 5, 4, 4, 5, 6, 7, 8, 6, 7, 9, 6, ..."
338,O=C(NC1CCS(=O)(=O)C1)CCC(=O)NC1CCS(=O)(=O)C1,"[17, 18, 26, 27, 28, 29, 30, 31, 32, 33, 34, 3...","[3, 224, 50, 32, 38, 11, 272, 20, 290, 260, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 50, 32, 38, 11, 272, 20, 290, 260, 50...","[1, 1, 3, 6, 6, 7, 4, 5, 2, 1, 1, 2, 4, 7, 6, ..."
620,CN(C(=O)c1c(C)nc2n1CCN(C2)C(=O)c1cc(=O)n(c(=O)...,"[30, 33, 34, 38, 39, 40, 41, 42, 43, 44, 45, 4...","[3, 224, 266, 11, 38, 260, 50, 12, 70, 20, 70,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 266, 11, 38, 260, 50, 12, 70, 20, 70,...","[2, 4, 0, 4, 7, 7, 7, 9, 7, 8, 6, 4, 4, 6, 6, ..."
396,Cn1ncc(c1)S(=O)(=O)N1CCN(CC1)S(=O)(=O)c1cnn(c1)C,"[33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 4...","[3, 224, 275, 20, 310, 11, 70, 20, 12, 54, 260...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 275, 20, 310, 11, 70, 20, 12, 54, 260...","[2, 3, 3, 1, 5, 6, 6, 6, 8, 6, 5, 3, 4, 4, 4, ..."
251,O=C(C1CNCC(C1)C(=O)N1CCOCC1)NCCc1nncn1C,"[33, 36, 39, 40, 41, 42, 43, 44, 45, 51, 52, 5...","[3, 224, 50, 32, 38, 11, 38, 20, 266, 261, 11,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 50, 32, 38, 11, 38, 20, 266, 261, 11,...","[4, 1, 8, 7, 9, 9, 8, 9, 2, 6, 8, 8, 8, 9, 9, ..."
...,...,...,...,...,...,...,...
308,OCCOCCNC(=O)C1CC(C(C1)O)NC(=O)c1nsnc1C,"[19, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 4...","[3, 224, 372, 291, 260, 50, 12, 38, 20, 261, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 372, 291, 260, 50, 12, 38, 20, 261, 1...","[3, 4, 3, 3, 5, 3, 4, 7, 6, 9, 9, 9, 9, 9, 7, ..."
674,COc1ccc(cc1C(=O)NC1COC2C1OCC2O)S(=O)(=O)N,"[26, 31, 33, 36, 38, 39, 40, 41, 42, 43, 44, 4...","[3, 224, 331, 20, 274, 11, 264, 20, 38, 260, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 331, 20, 274, 11, 264, 20, 38, 260, 5...","[0, 7, 2, 4, 1, 7, 4, 8, 8, 9, 8, 8, 7, 7, 6, ..."
631,O=C(Cn1ncn(c1=O)C)OCCCOC(=O)Cn1ncn(c1=O)C,"[29, 30, 31, 39, 40, 41, 42, 43, 44, 45, 52, 5...","[3, 224, 50, 32, 38, 11, 275, 20, 303, 11, 70,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 50, 32, 38, 11, 275, 20, 303, 11, 70,...","[3, 1, 3, 4, 6, 4, 8, 5, 6, 0, 2, 6, 6, 8, 8, ..."
526,OC(=O)CN(C(=O)C)CC1OCCN(C1)C(=O)CN1CSCC1=O,"[30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 4...","[3, 224, 280, 260, 50, 12, 266, 11, 38, 260, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 280, 260, 50, 12, 266, 11, 38, 260, 5...","[3, 0, 0, 3, 2, 1, 4, 1, 6, 6, 8, 9, 9, 9, 8, ..."


### Inspect the tokenizers a bit

In [32]:
from tokenizers import Tokenizer

def get_num_tokens(tokenizer):
    """Return how many entries the tokenizer's vocabulary contains."""
    vocab = tokenizer.get_vocab()
    return len(vocab)

def get_mean_vocab_len(tokenizer):
    """Return the average length (in characters) of the vocabulary's tokens."""
    vocab = tokenizer.get_vocab()
    total_chars = 0
    for token in vocab:
        total_chars += len(token)
    return total_chars / len(vocab)

In [17]:
# Load three saved tokenizers for a side-by-side comparison.
# NOTE(review): `tokenizer3` presumably refers to a min_frequency of 3 — confirm how
# bart_bbpe_1M_tokenizer.model was trained.
tokenizer3 = Tokenizer.from_file("tokenizer/bbpe_tokenizer/bart_bbpe_1M_tokenizer.model")
# NOTE(review): the mf30 model is not produced by the training loop in this notebook
# (its thresholds start at 500); it must already exist on disk.
tokenizer30 = Tokenizer.from_file("tokenizer/bbpe_tokenizer/bart_bbpe_tokenizer_1M_mf30.model")
tokenizer3000 = Tokenizer.from_file("tokenizer/bbpe_tokenizer/bart_bbpe_tokenizer_1M_mf3000.model")

In [33]:
# Vocabulary size and mean token length for each loaded tokenizer.
print(get_num_tokens(tokenizer3), get_mean_vocab_len(tokenizer3))
print(get_num_tokens(tokenizer30), get_mean_vocab_len(tokenizer30))
# The last pair is a bare expression, so it is shown as the cell result (a tuple) rather than printed.
get_num_tokens(tokenizer3000), get_mean_vocab_len(tokenizer3000)

1240 4.661290322580645
756 3.427248677248677


(401, 1.940149625935162)