In [None]:
from collections import Counter

from tqdm import tqdm

In [None]:
textfile = 'shakespeare.txt'
# Decode with an explicit encoding so the corpus reads the same on every
# platform; without it, open() falls back to the locale's default encoding.
# NOTE(review): assumes the corpus file is UTF-8 (plain ASCII also works) — confirm.
with open(textfile, 'r', encoding='utf-8') as f:
    fulltext = f.read()

# Small one-sentence sample string.
shorttext = 'The quick brown fox jumped over the lazy dog.'

In [None]:
def tokenize(vocab, word):
    """Greedily split ``word`` into the longest matching tokens from ``vocab``.

    At each position, the longest non-empty prefix of the remaining text that
    is found in ``vocab`` becomes the next token.

    Args:
        vocab: Collection of token strings; only membership tests (``in``)
            are performed on it.
        word: The string to tokenize.

    Returns:
        A list of tokens, in order, whose concatenation equals ``word``.
        An empty ``word`` yields an empty list.

    Raises:
        ValueError: If no non-empty prefix of the remaining text is in
            ``vocab`` (for example, ``word`` contains a character the vocab
            does not cover). Previously this case looped forever.
    """
    toks = []
    pos = 0
    end_of_word = len(word)

    while pos < end_of_word:
        # Longest candidate first. The range stops before ``pos`` so the empty
        # string is never tried: matching it would consume nothing and the
        # loop would never advance.
        for end in range(end_of_word, pos, -1):
            piece = word[pos:end]
            if piece in vocab:
                toks.append(piece)
                pos = end
                break
        else:
            raise ValueError(
                "Cannot tokenize {!r}: no vocab entry matches at position {} "
                "(starting with {!r})".format(word, pos, word[pos])
            )
    return toks

In [None]:
# Quick check of tokenize(): the vocab is every character of the sample
# sentence plus the two-character tokens 've' and 'ro'.
blah = "Eligendi et vero blanditiis a debitis."
tokenize(set(blah).union({'ve', 'ro'}), "vero")

In [None]:
def train(text, vocab_limit, tokens_per_round=4):
    """Grow a subword vocabulary from ``text`` by merging adjacent token pairs.

    The vocab starts as the set of characters in ``text``. Each round:

    1. Every word (``text`` split on single spaces, so newlines stay inside
       words) is tokenized with the current vocab, and each pair of adjacent
       tokens, concatenated, becomes a candidate token.
    2. The words are tokenized again with all candidates available, counting
       how often each candidate is actually used.
    3. The most-used candidates are added to the vocab.

    Args:
        text: Training corpus as a single string.
        vocab_limit: Target vocabulary size. No token is added once the vocab
            holds this many entries.
        tokens_per_round: Maximum number of candidates added per round. The
            default of 4 trades some greediness for speed.

    Returns:
        The vocabulary as a set of strings. It never grows past
        ``vocab_limit``; it may be smaller if the text runs out of mergeable
        pairs, or larger only if the character set alone already exceeds
        the limit.

    Raises:
        ValueError: If ``tokens_per_round`` is less than 1.
    """
    if tokens_per_round < 1:
        raise ValueError("tokens_per_round must be at least 1")

    vocab = set(text)
    # Tokenize each distinct word once and weight it by its frequency rather
    # than re-tokenizing every repeated occurrence.
    word_counts = Counter(text.split(' '))

    while len(vocab) < vocab_limit:
        # First, tokenize the data with the existing vocab, then generate all
        # adjacent pairs as proposed new tokens.
        candidates = set()
        for w in word_counts:
            toks = tokenize(vocab, w)
            candidates.update(a + b for a, b in zip(toks, toks[1:]))

        if not candidates:
            # Every word is already a single token, so nothing can be merged.
            break

        # Next, retokenize with the proposed pairs added to the vocab, counting
        # the number of occurrences of each proposed token. The trial vocab is
        # built once per round, not once per word.
        trial_vocab = vocab | candidates
        stats = dict.fromkeys(candidates, 0)
        for w, occurrences in word_counts.items():
            for t in tokenize(trial_vocab, w):
                if t in stats:
                    stats[t] += occurrences

        # Most frequent first; ties are broken by the token text so the result
        # does not depend on set iteration order.
        ranked = sorted(stats.items(), key=lambda item: (-item[1], item[0]))

        # Add several tokens per round to speed things along, but never more
        # than the remaining room in the vocab or the candidates available.
        room = min(tokens_per_round, vocab_limit - len(vocab))
        for tok, count in ranked[:room]:
            vocab.add(tok)
            print("New token >> {} << which occurred {} times".format(tok, count))

        print("Vocab size ", len(vocab))

    # The loop ends when the vocab reaches the target size or no merges remain.
    return vocab


In [None]:
# Learn the vocabulary from the full corpus, with a target size of 256.
trained_vocab = train(text=fulltext, vocab_limit=256)

In [None]:
# Try the trained vocabulary on a sample sentence.
tokenize(
    vocab=trained_vocab,
    word="We are accounted poor citizens, the patricians good.",
)

In [None]:
# Number the tokens in sorted order. Iteration order of a set of strings can
# change between interpreter runs (string hashing is randomized), which would
# give the same vocab different ids each time the notebook is re-run.
id_to_str = dict(enumerate(sorted(trained_vocab)))
# Inverse lookup: token string -> integer id.
str_to_id = {tok: idx for idx, tok in id_to_str.items()}

In [None]:
import pickle as pkl

# Save both lookup tables in a single pickle file.
with open('tokenizer.pkl', 'wb') as pkl_file:
    pkl.dump(
        {
            'id2str': id_to_str,
            'str2id': str_to_id,
        },
        pkl_file,
    )