# SMILES tokenizer
The goal of this notebook is to prepare training data and train a suitable SMILES tokenizer. We also look at the pretrained WordPiece tokenizer from the `deepchem` library.

## WordPiece tokenizer
Result: This tokenizer from the `deepchem` library appears to be better suited to inorganic molecules; its vocabulary contains many structures and groups that rarely occur in our SMILES strings.

In [20]:
from deepchem.feat.smiles_tokenizer import SmilesTokenizer
import os

# Vocabulary file handed to the deepchem WordPiece tokenizer (relative to the working directory).
vocab_path = 'wp_tokenizer/vocab.txt'
tokenizer = SmilesTokenizer(vocab_path)

# NOTE(review): `df` is not created in any visible cell of this notebook; it has to
# exist in the kernel before this cell runs -- TODO add the cell that loads it.
# `.iloc[90, 0]` picks row 90 / first column purely by position. The previous
# `df.iloc[90][0]` depended on integer keys being treated as positions by
# Series.__getitem__, which pandas has deprecated.
print(tokenizer.tokenize(df.iloc[90, 0]))

Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up.
Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.


['C', 'O', 'c', '1', 'c', 'c', '2', 'c', 'c', 'c', '(', '=', 'O', ')', 'o', 'c', '2', 'c', '(', 'O', '[C@@H]', '2', 'O', '[C@@H]', '(', 'C', 'O', ')', '[C@H]', '(', 'O', ')', '[C@@H]', '(', 'O', ')', '[C@@H]', '2', 'O', ')', 'c', '1', 'O', '1', '6', '2', '1', '5', '1', '3', '2']


## BBPE Tokenizer
Next, we take inspiration from generative NLP approaches and train a byte-level BPE (BBPE) tokenizer. The main advantage of this tokenization technique is that, because training starts from a vocabulary containing all one-byte (ASCII) characters, we can be sure to tokenize any SMILES string without encountering an unknown token (for compatibility reasons we add an unknown token anyway).

### Data preparation
As training data we take a random slice of 1M SMILES from the 30M dataset we scraped from the ZINC15 database. The SMILES are already deduplicated and canonical.

In [11]:
import numpy as np
from pathlib import Path

# Seed NumPy's global RNG once so the random sampling in this cell is repeatable.
np.random.seed(42)

# File that is sampled from, and the directory the sampled slices are written to
# (both relative to the notebook's working directory).
data_path = "../data/datasets/ZINC15/30M_slice/30M.smi"
slice_save_dir = "../tokenizer/training_data"

def random_slice(size, data_path, slice_save_path):
    """Write a random sample of lines from ``data_path`` to ``slice_save_path``.

    The input file is read completely into memory, ``size`` distinct lines are
    drawn with NumPy's global RNG (seed it beforehand for repeatable output) and
    written to the output file, each one followed by a single space.

    Parameters
    ----------
    size : int
        Number of lines to draw, without replacement.
    data_path : str or path-like
        Text file to sample from; every line is one record.
    slice_save_path : str or path-like
        Output file. Missing parent directories are created.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when ``size`` is larger than the
        number of lines in the input file.
    """
    with open(data_path, 'r') as f:
        data = np.array(f.read().splitlines())

    # Sample only after the input handle is released; nothing below needs it.
    choice = np.random.choice(data, size, replace=False)

    # Make sure the target directory exists so open() does not fail on a fresh checkout.
    Path(slice_save_path).parent.mkdir(parents=True, exist_ok=True)

    with open(slice_save_path, 'w') as f:
        # Records are separated by a space (not a newline); the trailing space is kept.
        f.writelines(item + " " for item in choice)
    
# Draw the 1M slice first and the 1K slice second; both consume the global RNG
# state in this order, so the order of the pairs must stay as it is.
for n_samples, file_name in ((1000000, "/1M.txt"), (1000, "/1K.txt")):
    random_slice(n_samples, data_path, slice_save_dir + file_name)

### BBPE tokenizer training

In [1]:
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from pathlib import Path
import glob

class BPE_token(object):
    """Byte-level BPE tokenizer wrapper: build, train and save.

    The wrapped ``tokenizers.Tokenizer`` is available as ``self.tokenizer``. It
    uses a BPE model, NFKC normalization and byte-level pre-tokenization and
    decoding.
    """

    # Special tokens handed to the trainer, kept in their original order.
    # NOTE(review): reordering would presumably change the ids assigned to them.
    SPECIAL_TOKENS = (
        "<eos>",
        "<unk>",
        "<pad>",
        "<bos>",
        "<neims>",
        "<nist>",
        "<rassp>",
        "<trafo>",
        "<source1>",
        "<source2>",
        "<source3>",
    )

    def __init__(self, vocab_size=100000, min_frequency=10):
        """Create an untrained tokenizer.

        Parameters
        ----------
        vocab_size : int
            Value passed to ``BpeTrainer`` as ``vocab_size``.
        min_frequency : int
            Value passed to ``BpeTrainer`` as ``min_frequency``.
        """
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()   # normalization of unicode characters (technicality)
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, path):
        """Train the wrapped tokenizer on one or more text files.

        Parameters
        ----------
        path : str, path-like, or iterable of those
            Training file(s). A single path is wrapped into a list, because
            ``Tokenizer.train`` expects a list of file names.
        """
        if isinstance(path, (str, os.PathLike)):
            files = [path]
        else:
            files = list(path)
        # The trainer works with plain strings, so convert any path-like entries.
        files = [os.fspath(item) for item in files]

        trainer = BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            show_progress=True,
            # Start from the full byte-level alphabet so every input byte is representable.
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=list(self.SPECIAL_TOKENS),
        )
        self.tokenizer.train(files, trainer)

    def save_tokenizer(self, location, prefix=None):
        """Save the trained model files into the directory ``location``.

        The directory is created when missing. ``prefix`` is forwarded
        unchanged to ``self.tokenizer.model.save``.
        """
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(location, exist_ok=True)
        self.tokenizer.model.save(location, prefix)

In [5]:
from tokenizers import Tokenizer

# Training corpus and the min_frequency settings to train one tokenizer each for.
path = ["../tokenizer/training_data/1M.txt"]
mfs = [10000000]

for min_frequency in mfs:
    # Fresh, untrained tokenizer per setting; vocab_size stays at the class default.
    tokenizer = BPE_token(min_frequency=min_frequency)
    tokenizer.bpe_train(path)

    # Folder that receives both the bare model files and the full tokenizer file.
    save_path = '../tokenizer/bbpe_tokenizer/'

    # Bare model files; no prefix is given, so every setting uses the same call arguments.
    tokenizer.save_tokenizer(save_path)

    # Complete serialized tokenizer, one file per min_frequency.
    model_file = save_path + f"bart_bbpe_tokenizer_1M_mf{min_frequency}.model"
    tokenizer.tokenizer.save(model_file)






In [6]:
from tokenizers import Tokenizer

# Folder holding the serialized tokenizers; the trailing "/" is part of the value.
save_path = '../tokenizer/bbpe_tokenizer/'
all_mfs = [3, 10, 30, 50, 100, 500, 1000, 3000, 5000, 6000, 10000, 50000, 10000000]

for min_frequency in all_mfs:
    # Load the saved tokenizer. save_path already ends with "/", so no extra
    # separator is inserted here (the previous code built a ".../bbpe_tokenizer//bart_..." path).
    tokenizer = Tokenizer.from_file(save_path + f"bart_bbpe_tokenizer_1M_mf{min_frequency}.model")

    # One summary block per tokenizer: its name, the settings and the resulting vocabulary size.
    print(f"bart_bbpe_tokenizer_1M_mf{min_frequency}\n" +
          "- max vocab_size 100000\n" + 
          f"- min_frequency {min_frequency}\n" +
          f"- final vocab size {len(tokenizer.get_vocab())} (including 11 special tokens)\n")

bart_bbpe_tokenizer_1M_mf3
- max vocab_size 100000
- min_frequency 3
- final vocab size 1827 (including 11 special tokens)

bart_bbpe_tokenizer_1M_mf10
- max vocab_size 100000
- min_frequency 10
- final vocab size 1286 (including 11 special tokens)

bart_bbpe_tokenizer_1M_mf30
- max vocab_size 100000
- min_frequency 30
- final vocab size 985 (including 11 special tokens)

bart_bbpe_tokenizer_1M_mf50
- max vocab_size 100000
- min_frequency 50
- final vocab size 887 (including 11 special tokens)

bart_bbpe_tokenizer_1M_mf100
- max vocab_size 100000
- min_frequency 100
- final vocab size 780 (including 11 special tokens)

bart_bbpe_tokenizer_1M_mf500
- max vocab_size 100000
- min_frequency 500
- final vocab size 583 (including 11 special tokens)

bart_bbpe_tokenizer_1M_mf1000
- max vocab_size 100000
- min_frequency 1000
- final vocab size 523 (including 11 special tokens)

bart_bbpe_tokenizer_1M_mf3000
- max vocab_size 100000
- min_frequency 3000
- final vocab size 427 (including 11 speci

ValueError: invalid literal for int() with base 10: '1e7'

### Try the BBPE out

In [19]:
from tokenizers import Tokenizer


# Load the serialized tokenizer whose file name carries the "mf3" suffix.
# NOTE(review): this path starts with "./tokenizer/", while the training and
# reporting cells use "../tokenizer/" -- confirm the intended working directory.
# Disabled alternative kept from an earlier attempt (separate vocab / merges files):
# vocab = "./tokenizer/bbpe_tokenizer/vocab.json"
# merges = "./tokenizer/bbpe_tokenizer//merges.txt"
tok = "./tokenizer/bbpe_tokenizer/bart_bbpe_tokenizer_1M_mf3.model"
tokenizer = Tokenizer.from_file(tok)
# Disabled experiment: registering additional special tokens on the loaded tokenizer.
# special_tokens_dict = {"bos_token": "<bos>", "unk_token": "<unk>", "eos_token": "<eos>", "sep_token": "<sep>"}
# special_tokens_dict = ["<sep>"]


# num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Encode a sample SMILES string; the result is kept in `encoded` for later inspection.
# NOTE(review): this literal is a prefix of the string printed for df.smiles[0]
# further down (which ends in "[C@H]2C1") -- confirm the shortening is intended.
encoded = tokenizer.encode("Cc1nn(C)c(C)c1S(=O)(=O)N1CCN2C(=O)CN(C)C(=O)")


In [None]:
# NOTE(review): `df` is not defined in any visible cell; it must already exist in the kernel.
df

In [60]:
# Show the `tokens` attribute of the encoding stored in `encoded`.
print(encoded.tokens)

['Ġ', 'CS', '(=', 'O', ')(=', 'O', ')', 'NCC', '(=', 'O', ')', 'N', '[', 'C', '@', 'H', ']', '1', 'COCC', '[', 'C', '@@', 'H', ']', '1', 'Oc', '1', 'ccc', '(', 'C', '(', 'N', ')=', 'O', ')', 'cc', '1']


In [None]:
# List the tokenizer's vocabulary in sorted order (presumably the token strings -- TODO confirm).
sorted(tokenizer.get_vocab())

In [None]:
# Look up the ids of the "<bos>" and "<eos>" special tokens.
bt = tokenizer.token_to_id("<bos>")
et = tokenizer.token_to_id("<eos>")  # currently unused: the "+ [et]" tail below is commented out
# Prepend the <bos> id to the token ids of df.smiles[0].
# NOTE(review): the disabled tail refers to `pt`, which is not defined in any visible cell.
tok_smiles = [bt] + tokenizer.encode(df.smiles[0]).ids #+ [et] + (200-2-len(tokenizer.encode(df.smiles[0]))) * [pt]  

In [41]:
# Print the SMILES string, then display the token ids the tokenizer produces for it.
print(df.smiles[0])
tokenizer.encode(df.smiles[0]).ids

Cc1nn(C)c(C)c1S(=O)(=O)N1CCN2C(=O)CN(C)C(=O)[C@H]2C1


[224,
 278,
 20,
 282,
 11,
 38,
 12,
 70,
 11,
 38,
 12,
 70,
 20,
 54,
 260,
 50,
 270,
 50,
 12,
 49,
 20,
 267,
 21,
 38,
 260,
 50,
 12,
 266,
 11,
 38,
 12,
 38,
 260,
 50,
 263,
 38,
 35,
 43,
 64,
 21,
 38,
 20]

In [64]:
# `pd` is not imported in any visible cell, so import it here to keep this cell
# runnable on a fresh kernel.
import pandas as pd

# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
pd.read_pickle("data/trial_set/1K_bbpe_bart_prepared_data_train.pkl")

Unnamed: 0,destereo_smiles,input_ids,decoder_input_ids,encoder_attention_mask,decoder_attention_mask,labels,position_ids
629,COCCN1C(=O)C(=O)N(C1=O)CC(=O)c1c(N)n(C)c(=O)n(...,"[15, 28, 29, 30, 31, 32, 33, 39, 40, 41, 42, 4...","[3, 224, 325, 20, 38, 260, 50, 12, 38, 260, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 325, 20, 38, 260, 50, 12, 38, 260, 50...","[3, 1, 2, 5, 5, 4, 4, 5, 6, 7, 8, 6, 7, 9, 6, ..."
338,O=C(NC1CCS(=O)(=O)C1)CCC(=O)NC1CCS(=O)(=O)C1,"[17, 18, 26, 27, 28, 29, 30, 31, 32, 33, 34, 3...","[3, 224, 50, 32, 38, 11, 272, 20, 290, 260, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 50, 32, 38, 11, 272, 20, 290, 260, 50...","[1, 1, 3, 6, 6, 7, 4, 5, 2, 1, 1, 2, 4, 7, 6, ..."
620,CN(C(=O)c1c(C)nc2n1CCN(C2)C(=O)c1cc(=O)n(c(=O)...,"[30, 33, 34, 38, 39, 40, 41, 42, 43, 44, 45, 4...","[3, 224, 266, 11, 38, 260, 50, 12, 70, 20, 70,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 266, 11, 38, 260, 50, 12, 70, 20, 70,...","[2, 4, 0, 4, 7, 7, 7, 9, 7, 8, 6, 4, 4, 6, 6, ..."
396,Cn1ncc(c1)S(=O)(=O)N1CCN(CC1)S(=O)(=O)c1cnn(c1)C,"[33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 4...","[3, 224, 275, 20, 310, 11, 70, 20, 12, 54, 260...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 275, 20, 310, 11, 70, 20, 12, 54, 260...","[2, 3, 3, 1, 5, 6, 6, 6, 8, 6, 5, 3, 4, 4, 4, ..."
251,O=C(C1CNCC(C1)C(=O)N1CCOCC1)NCCc1nncn1C,"[33, 36, 39, 40, 41, 42, 43, 44, 45, 51, 52, 5...","[3, 224, 50, 32, 38, 11, 38, 20, 266, 261, 11,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 50, 32, 38, 11, 38, 20, 266, 261, 11,...","[4, 1, 8, 7, 9, 9, 8, 9, 2, 6, 8, 8, 8, 9, 9, ..."
...,...,...,...,...,...,...,...
308,OCCOCCNC(=O)C1CC(C(C1)O)NC(=O)c1nsnc1C,"[19, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 4...","[3, 224, 372, 291, 260, 50, 12, 38, 20, 261, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 372, 291, 260, 50, 12, 38, 20, 261, 1...","[3, 4, 3, 3, 5, 3, 4, 7, 6, 9, 9, 9, 9, 9, 7, ..."
674,COc1ccc(cc1C(=O)NC1COC2C1OCC2O)S(=O)(=O)N,"[26, 31, 33, 36, 38, 39, 40, 41, 42, 43, 44, 4...","[3, 224, 331, 20, 274, 11, 264, 20, 38, 260, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 331, 20, 274, 11, 264, 20, 38, 260, 5...","[0, 7, 2, 4, 1, 7, 4, 8, 8, 9, 8, 8, 7, 7, 6, ..."
631,O=C(Cn1ncn(c1=O)C)OCCCOC(=O)Cn1ncn(c1=O)C,"[29, 30, 31, 39, 40, 41, 42, 43, 44, 45, 52, 5...","[3, 224, 50, 32, 38, 11, 275, 20, 303, 11, 70,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 50, 32, 38, 11, 275, 20, 303, 11, 70,...","[3, 1, 3, 4, 6, 4, 8, 5, 6, 0, 2, 6, 6, 8, 8, ..."
526,OC(=O)CN(C(=O)C)CC1OCCN(C1)C(=O)CN1CSCC1=O,"[30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 4...","[3, 224, 280, 260, 50, 12, 266, 11, 38, 260, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 224, 280, 260, 50, 12, 266, 11, 38, 260, 5...","[3, 0, 0, 3, 2, 1, 4, 1, 6, 6, 8, 9, 9, 9, 8, ..."
