# BBPE SMILES Tokenizer
The goal of this notebook is to prepare training data and train a suitable SMILES tokenizer. Inspired by generative NLP approaches, we train a byte-level BPE (BBPE) tokenizer. The main advantage of this tokenization technique is that, since training starts from a vocabulary containing all one-byte ASCII characters, we can be sure to tokenize any SMILES string without encountering an unknown token (for compatibility reasons we add the unknown token anyway, though).

## Data preparation
As training data, we take a random slice of 1M SMILES from the 30M dataset we scraped from the ZINC20 database. The SMILES are already deduplicated and canonical.

In [2]:
import numpy as np
from pathlib import Path

# Location of the full corpus and the folder that receives the sampled slice.
data_path = "../data/zinc/30M/30M.smi"
slice_save_dir = "training_data"

# Create the output folder up front; a pre-existing folder is not an error.
Path(slice_save_dir).mkdir(exist_ok=True, parents=True)

# Seed NumPy's global RNG so the random slice drawn below is reproducible.
np.random.seed(42)

def random_slice(size, data_path, slice_save_path):
    """Write a random sample of lines from ``data_path`` to ``slice_save_path``.

    The sample is drawn without replacement using NumPy's global RNG, so call
    ``np.random.seed`` beforehand for a reproducible slice. Every sampled line
    is written followed by a single space; the output contains no newlines.

    Args:
        size: Number of lines to sample.
        data_path: Text file to sample from; it is read into memory in full.
        slice_save_path: File that receives the sampled lines (overwritten).

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``size`` exceeds the
            number of lines in ``data_path``.
    """
    with open(data_path, 'r') as f:
        lines = f.read().splitlines()

    # Sample row indices rather than the strings themselves. Converting the
    # whole corpus to a NumPy unicode array stores every row at the width of
    # the longest line, which is very memory hungry for tens of millions of
    # rows. For a given seed the legacy RNG draws the same permutation for
    # `choice(n, ...)` and `choice(array_of_len_n, ...)`, so the selected
    # lines and their order are unchanged.
    picked = np.random.choice(len(lines), size, replace=False)

    with open(slice_save_path, 'w') as f:
        f.writelines(lines[i] + " " for i in picked)
    
# Draw the 1M-line training slice from the full corpus.
random_slice(
    size=1_000_000,
    data_path=data_path,
    slice_save_path=slice_save_dir + "/1M.txt",
)

## BBPE tokenizer training

In [3]:
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from pathlib import Path
import glob

class BPE_token(object):
    """Small wrapper around a ``tokenizers`` BPE tokenizer with byte-level pre-tokenization.

    The wrapped ``Tokenizer`` is exposed as ``self.tokenizer`` so callers can
    use the full ``tokenizers`` API (e.g. ``self.tokenizer.save(...)``).
    """

    def __init__(self, vocab_size=100000, min_frequency=10):
        """Create an untrained tokenizer.

        Args:
            vocab_size: Value forwarded to ``BpeTrainer(vocab_size=...)``.
            min_frequency: Value forwarded to ``BpeTrainer(min_frequency=...)``.
        """
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()   # normalization of unicode characters (technicality)
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, path):
        """Train the tokenizer on one or more text files.

        Training starts from ``ByteLevel.alphabet()`` as the initial alphabet
        and registers the special tokens listed below.

        Args:
            path: List of file paths to train on. A single path (``str`` or
                ``os.PathLike``) is also accepted and wrapped into a list.
        """
        if isinstance(path, (str, os.PathLike)):
            path = [os.fspath(path)]

        trainer = BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            show_progress=True,
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=[
                "<eos>",
                "<unk>",
                "<pad>",
                "<bos>",
                "<neims>",
                "<nist>",
                "<rassp>",
                "<trafo>",
                "<source1>",
                "<source2>",
                "<source3>",
            ],
        )
        self.tokenizer.train(path, trainer)

    def save_tokenizer(self, location, prefix=None):
        """Save the underlying BPE model files into the directory ``location``.

        Args:
            location: Target directory; created (including parents) if it does
                not exist. An empty string means the current working directory.
            prefix: Optional prefix forwarded unchanged to ``model.save``.
        """
        # `os.makedirs("")` raises FileNotFoundError, so an empty location is
        # mapped to "." explicitly. `exist_ok=True` replaces the racy
        # exists-then-create check.
        location = os.fspath(location) or "."
        os.makedirs(location, exist_ok=True)
        self.tokenizer.model.save(location, prefix)

In [7]:
from tokenizers import Tokenizer

train_data_path = slice_save_dir + "/1M.txt"
mfs = [10, 100, 10000, 10000000]

# Output directory for the trained tokenizers. The original empty string made
# `save_path + "/..."` resolve to the filesystem root and made
# `os.makedirs('')` inside `save_tokenizer` fail; "." targets the current
# working directory, which is where the later cells load tokenizers from.
save_path = "."

for min_frequency in mfs:
    tokenizer = BPE_token(min_frequency=min_frequency)

    # train the tokenizer model
    tokenizer.bpe_train([train_data_path])

    # Save the bare BPE model files into `save_path`.
    # NOTE(review): no prefix is passed, so every iteration writes to the same
    # model file names and only the last run's files survive — confirm that
    # this is intended.
    tokenizer.save_tokenizer(save_path)

    # Save the complete tokenizer (normalizer, pre-tokenizer, model, decoder)
    # as one file per min_frequency setting.
    # NOTE(review): other cells in this notebook load
    # `tokenizer_mf{10,100,10K,10M}.model`, which differs from the file names
    # written here — confirm which naming is intended.
    tokenizer.tokenizer.save(os.path.join(save_path, f"test_tokenizer_mf{min_frequency}.model"))















## Check the stats

In [3]:
from tokenizers import Tokenizer

save_path = ''
all_mfs = ['10', '100', '10K', '10M']

for min_frequency in all_mfs:
    # Load the serialized tokenizer for this min_frequency label.
    tokenizer = Tokenizer.from_file(save_path + f"tokenizer_mf{min_frequency}.model")
    final_vocab_size = len(tokenizer.get_vocab())

    # Assemble the report first, then emit it with a single print call.
    report_lines = [
        f"tokenizer_1M_mf{min_frequency}\n",
        "- max vocab_size 100000\n",
        f"- min_frequency {min_frequency}\n",
        f"- final vocab size {final_vocab_size} (including 11 special tokens)\n",
    ]
    print("".join(report_lines))

tokenizer_1M_mf10
- max vocab_size 100000
- min_frequency 10
- final vocab size 1286 (including 11 special tokens)

tokenizer_1M_mf100
- max vocab_size 100000
- min_frequency 100
- final vocab size 780 (including 11 special tokens)

tokenizer_1M_mf10K
- max vocab_size 100000
- min_frequency 10K
- final vocab size 367 (including 11 special tokens)

tokenizer_1M_mf10M
- max vocab_size 100000
- min_frequency 10M
- final vocab size 267 (including 11 special tokens)



## Try the BBPE out

In [12]:
from tokenizers import Tokenizer


# Load a previously saved tokenizer from its serialized file.
tok = "tokenizer_mf10.model"
tokenizer = Tokenizer.from_file(tok)

# Encode a sample SMILES string.
encoded = tokenizer.encode("Cc1nn(C)c(C)c1S(=O)(=O)N1CCN2C(=O)CN(C)C(=O)")

# Print tokens and ids as two separate statements. Writing
# `print(...), print(...)` builds a `(None, None)` tuple that Jupyter echoes
# as the cell result.
print(encoded.tokens)
print(encoded.ids)

['ĠCc', '1', 'nn', '(', 'C', ')', 'c', '(', 'C', ')', 'c', '1', 'S', '(=', 'O', ')(=', 'O', ')', 'N', '1', 'CCN', '2', 'C', '(=', 'O', ')', 'CN', '(', 'C', ')', 'C', '(=', 'O', ')']
[318, 27, 289, 18, 45, 19, 77, 18, 45, 19, 77, 27, 61, 269, 57, 294, 57, 19, 56, 27, 279, 28, 45, 269, 57, 19, 280, 18, 45, 19, 45, 269, 57, 19]


(None, None)