In [None]:
!pip install datasets transformers[sentencepiece]

In [2]:
from math import log
from more_itertools import locate
import re
from collections import defaultdict
from transformers import AutoTokenizer

In [3]:
# Two-sentence toy corpus; every frequency computed below is derived from it.
corpus = [
    "A lambda function is a small anonymous function",
    "A lambda function can take any number of arguments, but can only have one expression",
]

In [None]:
# Load the pretrained "xlnet-base-cased" tokenizer. The visible cells only use
# its backend pre-tokenizer (pre_tokenize_str) to split raw text into words.
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

In [None]:
# Count how often each pre-tokenized word occurs in the corpus.
word_freqs = defaultdict(int)
pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
for sentence in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is needed.
    for word, _offsets in pre_tokenizer.pre_tokenize_str(sentence):
        word_freqs[word] += 1
word_freqs

In [6]:
# Frequency of every single character and of every substring of length >= 2,
# each weighted by the frequency of the word it occurs in.
char_freqs = defaultdict(int)
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    word_len = len(word)
    for start, char in enumerate(word):
        char_freqs[char] += freq
        # Substrings beginning at `start` that span at least two characters
        for end in range(start + 2, word_len + 1):
            subwords_freqs[word[start:end]] += freq

# Most frequent subwords first (the sort is stable, so ties keep insertion order)
sorted_subwords = sorted(
    subwords_freqs.items(),
    key=lambda item: item[1],
    reverse=True,
)

In [7]:
init_vocab_size = 200
# Seed vocabulary: every single character plus the most frequent subwords, up
# to init_vocab_size entries in total. The count is clamped at zero because a
# negative slice bound would drop subwords from the *end* of the list instead
# of selecting none when the alphabet alone exceeds init_vocab_size.
num_subwords = max(init_vocab_size - len(char_freqs), 0)
token_freqs = dict(char_freqs)
token_freqs.update(sorted_subwords[:num_subwords])

In [8]:
# Unigram model: each token is scored with the negative log of its relative
# frequency, so a lower score means a more likely token.
total_sum = sum(token_freqs.values())
model = {
    token: -log(count / total_sum)
    for token, count in token_freqs.items()
}

In [11]:
# Markers wrapped around every piece of a segmentation string.
OPEN_BRACKET = "{"
CLOSED_BRACKET = "}"
EMPTY_STRING = ""

def create_tokens(s, i=0, out=EMPTY_STRING, permuts=None):
    """Enumerate every way of cutting ``s[i:]`` into contiguous pieces.

    Each segmentation is encoded as one string in which every piece is wrapped
    in ``OPEN_BRACKET``/``CLOSED_BRACKET``, e.g. ``"{ab}{c}"``. Segmentations
    whose next piece is longer are produced first.

    Args:
        s: The string to segment.
        i: Index in ``s`` at which segmentation starts.
        out: Encoded prefix built so far (used by the recursion).
        permuts: List that results are appended to. A fresh list is created
            when omitted, so separate calls never share results.

    Returns:
        The list ``permuts`` with one encoded string per segmentation.
    """
    # A mutable default would be created once and shared by all calls.
    if permuts is None:
        permuts = []
    if i == len(s):
        # The whole string has been consumed: ``out`` is a full segmentation.
        permuts.append(out)
        return permuts
    # Try the longest next piece first, then progressively shorter ones.
    for j in reversed(range(i, len(s))):
        substr = OPEN_BRACKET + s[i:j + 1] + CLOSED_BRACKET
        create_tokens(s, j + 1, out + substr, permuts)
    return permuts

def find_in_vocab(word_vec, vocab):
    """Tell whether every entry of ``word_vec`` is a member of ``vocab``.

    Args:
        word_vec: Sequence of candidate tokens.
        vocab: Container supporting the ``in`` operator.

    Returns:
        ``True`` if all entries are in ``vocab`` (also for an empty
        ``word_vec``), ``False`` as soon as one entry is missing.
    """
    return all(piece in vocab for piece in word_vec)

def _iter_vocab_segmentations(s, model, start=0):
    """Yield each split of ``s[start:]`` into pieces that are all keys of ``model``.

    Splits are yielded as lists of substrings, longest next piece first. A
    piece missing from ``model`` is skipped immediately, so no segmentation
    containing it is ever built.
    """
    if start == len(s):
        yield []
        return
    for end in range(len(s), start, -1):
        piece = s[start:end]
        if piece not in model:
            continue
        for rest in _iter_vocab_segmentations(s, model, end):
            yield [piece] + rest


def gen_token(s, model):
    """Find the segmentation of ``s`` with the lowest total score under ``model``.

    Every segmentation of ``s`` whose pieces are all keys of ``model`` is
    scored by summing the pieces' ``model`` values. The lowest sum wins; on a
    tie the segmentation enumerated first (longer leading pieces) is kept.
    Pieces are handled as plain substrings, so ``s`` may contain any
    characters, including ``{``, ``}`` and spaces.

    Note: the search enumerates segmentations explicitly, so its cost can grow
    exponentially with ``len(s)`` for large vocabularies.

    Args:
        s: The word to segment.
        model: Mapping from token to its score (lower is better).

    Returns:
        A ``(tokens, score)`` tuple. ``tokens`` is the list of pieces and
        ``score`` the sum of their ``model`` values. When no segmentation is
        possible the result is ``(["</unkwn>"], float("inf"))``, so callers can
        always unpack two values. An empty ``s`` yields ``([], 0.0)``.
    """
    best_tokens = None
    best_score = None
    for tokens in _iter_vocab_segmentations(s, model):
        # Start at zero so the result is exactly the sum of the token scores.
        score = 0.0
        for token in tokens:
            score += model[token]
        # Strict comparison keeps the first of several equally good candidates.
        if best_score is None or score < best_score:
            best_tokens, best_score = tokens, score
    if best_tokens is None:
        return ["</unkwn>"], float("inf")
    return best_tokens, best_score

In [None]:
# Show the raw gen_token result for two sample words.
for sample_word in ("Hopefully", "This"):
    print(gen_token(sample_word, model))

In [13]:
def compute_loss(model):
    """Return the corpus loss of ``model``.

    Each word in the global ``word_freqs`` is segmented with ``gen_token``;
    the loss is the sum of the per-word scores weighted by word frequency.
    """
    total = 0
    for word in word_freqs:
        _tokens, word_loss = gen_token(word, model)
        total = total + word_freqs[word] * word_loss
    return total

In [14]:
import copy
def compute_scores(model):
    """Measure how much the corpus loss changes when each token is removed.

    Single-character tokens are skipped and therefore never appear in the
    result.

    Args:
        model: Mapping from token to its score.

    Returns:
        Dict mapping every multi-character token to
        ``compute_loss(model without that token) - compute_loss(model)``.
    """
    scores = {}
    model_loss = compute_loss(model)
    for token in model:
        if len(token) == 1:
            continue
        # The model is a flat token -> number mapping, so a filtered shallow
        # copy is enough; no deep copy is needed for every candidate.
        model_without_token = {
            other: value for other, value in model.items() if other != token
        }
        scores[token] = compute_loss(model_without_token) - model_loss
    return scores

In [16]:
percent_to_remove = 0.1
target_vocab_size = 100
# Repeatedly drop the tokens whose removal changes the corpus loss the least
# until the vocabulary is no larger than target_vocab_size.
while len(model) > target_vocab_size:
    scores = compute_scores(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])
    # Only multi-character tokens are scored, so never ask for more removals
    # than there are candidates; stop when nothing can be removed, otherwise
    # the loop would either raise IndexError or spin forever.
    num_to_remove = min(int(len(model) * percent_to_remove), len(sorted_scores))
    if num_to_remove == 0:
        break
    # Remove percent_to_remove tokens with the lowest scores.
    for token, _score in sorted_scores[:num_to_remove]:
        _ = token_freqs.pop(token)

    # Rebuild the model from the remaining counts.
    total_sum = sum([freq for token, freq in token_freqs.items()])
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

In [None]:
def tokenize(text, model):
    """Split ``text`` into tokens using the unigram ``model``.

    The text is pre-tokenized into words with the global ``tokenizer``; each
    word is then segmented with ``gen_token`` and the pieces are concatenated
    in order.

    Args:
        text: Raw input string.
        model: Mapping from token to its score, as expected by ``gen_token``.

    Returns:
        A flat list of token strings.
    """
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    tokens = []
    for word, _offset in words_with_offsets:
        pieces = gen_token(word, model)[0]
        if isinstance(pieces, str):
            # gen_token may report an unknown word as a bare one-element list,
            # in which case [0] is the marker string itself rather than a list
            # of pieces; wrap it so it is appended as a single token.
            pieces = [pieces]
        # extend() avoids the quadratic copying of sum(list_of_lists, []).
        tokens.extend(pieces)
    return tokens


# Try the trained model on a sentence that is not part of the training corpus.
tokenize("This is the Hugging Face course.", model)