In [None]:
import re
import tensorflow as tf
from collections import defaultdict, Counter

In [None]:
# Get Shakespeares work from Andrej Karpathy's website
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filepath = tf.keras.utils.get_file('shakespeare.txt', url)

with open(filepath) as f:
  shakespeare_text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Split into words
words = shakespeare_text.split()

In [None]:
# Extract individual words with their frequencies
def get_word_counts(text, return_sorted=True):
  words = re.findall(r'\b\w+\b', text.lower())

  # Word counts (unsorted)
  word_counts = Counter(words)

  if return_sorted:
    # Sort the word counts by frequency (value) in descending order
    word_counts = dict(sorted(word_counts.items(), key=lambda item: item[1], reverse=True))

  return word_counts

In [None]:
corpus = get_word_counts(shakespeare_text)

In [None]:
len(corpus)

11456

In [None]:
# Print the first few items of the dictionary
dict(list(corpus.items())[:20])

{'the': 6287,
 'and': 5690,
 'i': 5111,
 'to': 4934,
 'of': 3760,
 'you': 3211,
 'my': 3120,
 'a': 3018,
 'that': 2664,
 'in': 2403,
 'is': 2118,
 'not': 2015,
 'for': 1926,
 's': 1859,
 'with': 1813,
 'it': 1773,
 'me': 1769,
 'be': 1710,
 'your': 1686,
 'he': 1606}

In [None]:
class BPETokenizerLayer(tf.keras.layers.Layer):
  def __init__(self, pairs=50, **kwargs):
    super().__init__(**kwargs)
    self.pairs = pairs
    self.merges = {}

  # Training the tokenizer by going over each word from the corpus
  def train(self, corpus):
    vocab = {" ".join(word) + " </w>": count for word, count in corpus.items()}

    while len(self.merges) < self.pairs+1:
      pairs = self.get_stats(vocab)  # Count adjacent pairs
      if not pairs:
        break  # No more pairs to merge

      best_pair = max(pairs, key=pairs.get)  # Most frequent pair

      #print(best_pair)
      self.merges[best_pair] = "".join(best_pair)  # Merge pair
      vocab = self.merge_vocab(best_pair, vocab)  # Update vocabulary

  # Compute the character pairs
  def get_stats(self, vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
      symbols = word.split()
      for i in range(len(symbols) - 1):
        pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

  # Add the character pairs to the vocabulary
  def merge_vocab(self, pair, vocab):
    new_vocab = {}
    bigram = re.escape(" ".join(pair))
    pattern = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")

    for word in vocab:
      new_word = pattern.sub("".join(pair), word)
      new_vocab[new_word] = vocab[word]

    return new_vocab

  # Tokenization using the learned vocabulary
  def tokenize(self, text):
    word = list(text) + ["</w>"]
    while len(word) > 1:
      pairs = [(word[i], word[i + 1]) for i in range(len(word) - 1)]
      merge_candidates = [p for p in pairs if p in self.merges]

      if not merge_candidates:
        break  # No more merges possible

      best_pair = min(merge_candidates, key=lambda p: self.merges[p])
      new_word = []
      i = 0
      while i < len(word):
        if i < len(word) - 1 and (word[i], word[i + 1]) == best_pair:
          new_word.append("".join(best_pair))
          i += 2
        else:
          new_word.append(word[i])
          i += 1
      word = new_word

    return word

  # Make sure this works for a batch of strings
  def call(self, inputs):
    return tf.py_function(self._vectorized_tokenize, [inputs], tf.string)

  def _vectorized_tokenize(self, inputs):
    tokenized_texts = [" ".join(self.tokenize(text.numpy().decode())) for text in inputs]
    return tf.convert_to_tensor(tokenized_texts)

In [None]:
# Learn the byte pair encoding
bpe_layer = BPETokenizerLayer(pairs=1000)
bpe_layer.train(corpus)

In [None]:
# Test Keras Layer
sample_texts = tf.constant(["""To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer""".lower()])

tokenized_texts = bpe_layer(sample_texts)

# Convert to string
tokenized_texts = tf.strings.reduce_join(tokenized_texts, separator=' ').numpy().decode()

# This is the tokenize output
print("Tokenized Output:", tokenized_texts)

# This is the number of tokens in the outputted text
print("Number of Tokens:", len(tokenized_texts.split()))

Tokenized Output: to   be ,   or   no t   to   be ,   th at   is   t he   qu es ti on : 
 w he t her   ' t is   no bl er   in   t he   min d   to   su f f er </w>
Number of Tokens: 40
