# Tokenization
This notebook builds two subword tokenizers using TensorFlow's `text.BertTokenizer`. Based on the [Subword Tokenizer Tutorial](https://www.tensorflow.org/text/guide/subwords_tokenizer#setup) from TensorFlow.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import os
import pathlib
import pandas as pd
import tensorflow as tf
import tensorflow_text as text
from tensorflow_text.python.ops import bert_tokenizer
from tensorflow_text.tools.wordpiece_vocab import wordpiece_tokenizer_learner_lib as learner
import config

## Load dataset
Load the source text from the concatenated [concatenated works of Shakespeare](https://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt).

In [3]:
with open(config.RAW_DATA_PATH, 'r') as file:
    shakespeare_plays = file.read()

In [4]:
sample = shakespeare_plays[:147]
print(sample)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?


## Vocabulary
Generate the vocabulary.

In [5]:
tokenizer = bert_tokenizer.BasicTokenizer(**config.BERT_TOKENIZER_PARAMS)
words_dataset = tokenizer.tokenize(shakespeare_plays)
word_counts = learner.count_words(words_dataset)
vocab = learner.learn(word_counts, config.VOCAB_SIZE, config.RESERVED_TOKENS, **config.LEARN_PARAMS)

In [6]:
print(vocab[:10])
print(vocab[100:110])
print(vocab[-10:])

with open(config.VOCAB_PATH, "w") as f:
    for token in vocab:
        print(token, file=f)

['[PAD]', '[UNK]', '[START]', '[END]', '!', '$', '&', "'", ',', '-']
['well', 'was', 'which', 'there', 'how', 'am', 'then', '##ed', '##ing', 'man']
['##.', '##3', '##:', '##;', '##?', '##[', '##]', '##j', '##q', '##v']


## Tokenizer
Build and test the tokenizer.

In [7]:
tokenizer = text.BertTokenizer(config.VOCAB_PATH, **config.BERT_TOKENIZER_PARAMS)

In [13]:
tokens = tokenizer.tokenize(sample).merge_dims(-2, -1)
print(tokens)

<tf.RaggedTensor [[140, 606, 12, 196, 76, 1417, 178, 539, 8, 170, 53, 147, 10, 72, 12, 147,
  8, 147, 10, 140, 606, 12, 47, 80, 72, 1917, 361, 45, 269, 115, 45, 4344,
  14]]>


In [15]:
txt_tokens = tf.gather(vocab, tokens)
txt_tokens = tf.strings.reduce_join(txt_tokens, separator=" ", axis=-1).numpy()[0].decode("utf-8")
print(txt_tokens)

first citizen : before we proceed any further , hear me speak . all : speak , speak . first citizen : you are all resolved rather to die than to famish ?


## Customization and export
Define a custom tokenizer class that can be exported and used in the GPT, including functionality for cleaning up output after detokenization.

In [20]:
def cleanup_text(reserved_tokens, token_txt):
    # Drop the reserved tokens, except for "[UNK]".
    bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
    bad_token_re = "|".join(bad_tokens)

    bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
    result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

    # Join them into strings.
    result = tf.strings.reduce_join(result, separator=" ", axis=-1)

    return result


class CustomTokenizer(tf.Module):
    def __init__(self, config):
        self.tokenizer = text.BertTokenizer(
            config.VOCAB_PATH, **config.BERT_TOKENIZER_PARAMS
        )
        self._reserved_tokens = config.RESERVED_TOKENS
        self._vocab_path = tf.saved_model.Asset(config.VOCAB_PATH)

        vocab = pathlib.Path(config.VOCAB_PATH).read_text().splitlines()
        self.vocab = tf.Variable(vocab)

        ## Create the signatures for export:

        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string)
        )

        # Include `detokenize` and `lookup` signatures for:
        #   * `Tensors` with shapes [tokens] and [batch, tokens]
        #   * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64)
        )
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64)
        )

    @tf.function
    def tokenize(self, strings):
        enc = self.tokenizer.tokenize(strings)
        enc = enc.merge_dims(-2, -1)
        return enc

    @tf.function
    def detokenize(self, tokenized):
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)

In [24]:
tokenizer = CustomTokenizer(config)
tf.saved_model.save(tokenizer, config.TOKENIZER_PATH)
reloaded_tokenizer = tf.saved_model.load(config.TOKENIZER_PATH)

INFO:tensorflow:Assets written to: tokenizer/assets


INFO:tensorflow:Assets written to: tokenizer/assets


In [26]:
tokens = reloaded_tokenizer.tokenize(["Hello TensorFlow!"])
tokens.numpy()

array([[ 647,  650,  736,   63,  866, 2003, 4975,    4]])

In [27]:
round_trip = reloaded_tokenizer.detokenize(tokens)
print(round_trip.numpy()[0].decode("utf-8"))

hello tensorflow !


## Save tokenized dataset
TODO: Split tokenized dataset into batches and then separate into train and validation sets. Store the resulting tokenized datasets on disk.

In [9]:
dataset = dataset.shuffle(config.BUFFER_SIZE).take(config.N_SENTENCES)
n_samples = min(config.N_SENTENCES, n_samples)
val_size = int(n_samples * config.VALIDATION_SHARE)
val_dataset = dataset.take(val_size)
train_dataset = dataset.skip(val_size)

NameError: name 'dataset' is not defined

In [None]:
val_dataset.save(config.VAL_DATA_PATH)
train_dataset.save(config.TRAIN_DATA_PATH)

In [None]:
for eng, nor in train_dataset.take(1):
    print(f"English: {eng.numpy().decode('utf-8')}")
    print(f"Norwegian: {nor.numpy().decode('utf-8')}")