# Tokenization
This notebook builds two subword tokenizers using TensorFlow's `text.BertTokenizer`. It is based on the [Subword Tokenizer Tutorial](https://www.tensorflow.org/text/guide/subwords_tokenizer#setup) from TensorFlow.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import pathlib
import pandas as pd
import tensorflow as tf
import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset as bert_vocab,
)
import config

## Dataset
Load the source dataset from `config.RAW_DATA_PATH` (a tab-separated file of English–Spanish sentence pairs), split it into a training set and a validation set, and save both to disk.

In [3]:
# Load the tab-separated sentence pairs; only the first two columns are kept
# and named "eng" and "spa".
df = pd.read_csv(
    config.RAW_DATA_PATH, sep="\t", names=["eng", "spa"], usecols=[0, 1]
)
n_samples = df.shape[0]
dataset = tf.data.Dataset.from_tensor_slices(df)
# `take` and `skip` below each iterate the shuffled dataset independently.
# With the default `reshuffle_each_iteration=True` every iteration sees a new
# order, so the two splits would overlap (validation examples leaking into the
# training set). Freezing the order keeps the splits disjoint.
dataset = dataset.shuffle(
    config.BUFFER_SIZE,
    # Optional: define `config.SEED` to make the split reproducible across runs.
    seed=getattr(config, "SEED", None),
    reshuffle_each_iteration=False,
)
val_size = int(n_samples * config.VALIDATION_SHARE)
val_dataset = dataset.take(val_size)
train_dataset = dataset.skip(val_size)

In [4]:
# Write both splits to disk: validation first, then training.
for split, destination in (
    (val_dataset, config.VAL_DATA_PATH),
    (train_dataset, config.TRAIN_DATA_PATH),
):
    split.save(destination)

In [5]:
# Show one decoded sentence pair from the training split.
for eng, spa in train_dataset.take(1):
    english, spanish = (t.numpy().decode("utf-8") for t in (eng, spa))
    print(f"English: {english}")
    print(f"Spanish: {spanish}")

English: I like your phone.
Spanish: Me gusta tu teléfono.


## Vocabulary
Generate the vocabularies: one for English and one for Spanish.

In [6]:
# Each element holds one (English, Spanish) pair; split it into one
# single-language dataset per column.
train_eng = train_dataset.map(lambda pair: pair[0])
train_spa = train_dataset.map(lambda pair: pair[1])

In [7]:
def write_vocab_file(filepath, vocab):
    """Write a vocabulary to ``filepath``, one token per line.

    The file is always encoded as UTF-8 rather than the platform's default
    encoding: the vocabulary contains non-ASCII tokens (accented letters,
    currency symbols, ...) that a locale-dependent encoding may be unable to
    represent or would encode differently from one machine to the next.

    Args:
        filepath: Destination path; an existing file is overwritten.
        vocab: Iterable of tokens, written in iteration order.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        for token in vocab:
            print(token, file=f)

In [8]:
# Settings shared by both languages' vocabulary builds.
bert_vocab_args = {
    "vocab_size": config.VOCAB_SIZE,
    "reserved_tokens": config.RESERVED_TOKENS,
    "bert_tokenizer_params": config.BERT_TOKENIZER_PARAMS,
    "learn_params": {},
}

# Learn one vocabulary per language (English first, then Spanish) from the
# training sentences, fed in batches of 1000 with a prefetch depth of 2.
eng_vocab, spa_vocab = (
    bert_vocab.bert_vocab_from_dataset(
        sentences.batch(1000).prefetch(2), **bert_vocab_args
    )
    for sentences in (train_eng, train_spa)
)

In [9]:
# Peek at four windows of the English vocabulary: the first ten entries, two
# interior ranges, and the last ten entries.
for window in (
    slice(None, 10),
    slice(100, 110),
    slice(1000, 1010),
    slice(-10, None),
):
    print(eng_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', "'", '(']
['how', 'her', 'time', 'has', 'as', 'did', 'will', 'very', 'here', 'didn']
['ability', 'drank', 'law', 'plays', 'airport', 'begin', 'difference', 'learning', 'past', 'pick']
['##:', '##;', '##?', '##j', '##q', '##v', '##z', '##°', '##’', '##€']


In [10]:
# Both vocabulary files are stored directly under the data directory.
eng_vocab_path, spa_vocab_path = (
    config.DATA_DIR + f"/{lang}_vocab.txt" for lang in ("eng", "spa")
)
write_vocab_file(eng_vocab_path, eng_vocab)
write_vocab_file(spa_vocab_path, spa_vocab)

## Tokenizer
Build the tokenizers from the saved vocabulary files and test them on a few example sentences.

In [11]:
# One BertTokenizer per language, each backed by its own vocabulary file and
# sharing the same tokenizer parameters.
eng_tokenizer, spa_tokenizer = (
    text.BertTokenizer(vocab_file, **config.BERT_TOKENIZER_PARAMS)
    for vocab_file in (eng_vocab_path, spa_vocab_path)
)

In [12]:
# Grab one batch of three English sentences and print them decoded.
# `eng_examples` stays bound after the loop and is reused by later cells.
for eng_examples in train_eng.batch(3).take(1):
    for raw_sentence in eng_examples.numpy():
        print(raw_sentence.decode("utf-8"))

That guy has a screw loose!
What's on TV?
He's a bit drunk.


In [13]:
# tokenize() yields (batch, word, word-piece); collapsing the last two axes
# leaves one flat id sequence per sentence -> (batch, tokens).
token_batch = eng_tokenizer.tokenize(eng_examples).merge_dims(-2, -1)

for token_ids in token_batch.to_list():
    print(token_ids)

[65, 1034, 103, 29, 47, 2092, 1447, 40, 2318, 1017, 4]
[77, 8, 47, 86, 460, 28]
[64, 8, 47, 29, 524, 797, 14]


In [14]:
# Map every token id back to its vocabulary string.
txt_tokens = tf.gather(eng_vocab, token_batch)
# Print one space-separated line of word pieces per sentence.
for line in tf.strings.reduce_join(txt_tokens, separator=" ", axis=-1).numpy():
    print(line.decode("utf-8"))

that guy has a s ##c ##rew l ##oo ##se !
what ' s on tv ?
he ' s a bit drunk .


In [15]:
# detokenize() turns the token ids back into words; print each sentence as one
# space-separated line.
words = eng_tokenizer.detokenize(token_batch)
for sentence in tf.strings.reduce_join(words, separator=" ", axis=-1).numpy():
    print(sentence.decode("utf-8"))

that guy has a screw loose !
what ' s on tv ?
he ' s a bit drunk .


## Customization and export
Define a custom tokenizer class that can be exported and used in the Transformer, including functionality for adding [START]/[END] tokens and cleaning up output after detokenization.

In [16]:
# Ids of the sentence-boundary markers, taken as each marker's position in the
# reserved-token list.
START, END = (
    tf.argmax(tf.constant(config.RESERVED_TOKENS) == marker)
    for marker in ("[START]", "[END]")
)


def add_start_end(ragged):
    """Wrap every row of ``ragged`` in the START and END token ids.

    Args:
        ragged: Ragged batch of token ids, one row per sentence.

    Returns:
        The same batch with a START id prepended and an END id appended to
        each row.
    """
    n_rows = ragged.bounding_shape()[0]
    # One marker per row, shaped as a single column so it concatenates along
    # the token axis.
    column_shape = [n_rows, 1]
    return tf.concat(
        [tf.fill(column_shape, START), ragged, tf.fill(column_shape, END)],
        axis=1,
    )


def cleanup_text(reserved_tokens, token_txt):
    """Remove reserved tokens from detokenized text and join it into strings.

    Args:
        reserved_tokens: Reserved token strings. Every one of them except
            "[UNK]" is removed from the output.
        token_txt: String tensor of words, with the words of one sentence
            along the last axis.

    Returns:
        String tensor in which the remaining words along the last axis are
        joined with single spaces.
    """
    # Alternation that matches any reserved token other than "[UNK]"; each
    # token is escaped so it is matched literally.
    droppable_re = "|".join(
        re.escape(token) for token in reserved_tokens if token != "[UNK]"
    )

    # Keep only the cells that are not entirely a reserved token.
    is_reserved = tf.strings.regex_full_match(token_txt, droppable_re)
    kept = tf.ragged.boolean_mask(token_txt, ~is_reserved)

    return tf.strings.reduce_join(kept, separator=" ", axis=-1)


class CustomTokenizer(tf.Module):
    """Exportable wrapper around `text.BertTokenizer` for a single language.

    On top of plain tokenization it adds the [START]/[END] markers when
    tokenizing, drops reserved tokens when detokenizing, and exposes the
    vocabulary for id-to-token lookups. All public methods are `tf.function`s
    that are traced in `__init__` so they are available after
    `tf.saved_model.save` / `tf.saved_model.load`.
    """

    def __init__(self, config, vocab_path):
        """Build the tokenizer and trace the signatures to export.

        Args:
            config: Object exposing `BERT_TOKENIZER_PARAMS` (keyword arguments
                forwarded to `text.BertTokenizer`) and `RESERVED_TOKENS`.
            vocab_path: Path to the vocabulary file, one token per line.
        """
        # Initialise tf.Module's own state (name, name scope) before any
        # attributes are attached.
        super().__init__()

        self.tokenizer = text.BertTokenizer(
            vocab_path, **config.BERT_TOKENIZER_PARAMS
        )
        self._reserved_tokens = config.RESERVED_TOKENS
        # Track the vocabulary file as an asset so it is copied into the export.
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        # Read the vocabulary explicitly as UTF-8: it contains non-ASCII tokens
        # and must not depend on the platform's default encoding.
        vocab = (
            pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
        )
        self.vocab = tf.Variable(vocab)

        ## Create the signatures for export:

        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string)
        )

        # Include `detokenize` and `lookup` signatures for:
        #   * dense `Tensors` with shape [batch, tokens]
        #   * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64)
        )
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64)
        )

        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64)
        )
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64)
        )

        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def tokenize(self, strings):
        """Tokenize a batch of strings into token ids wrapped in START/END.

        Args:
            strings: String tensor with one sentence per element.

        Returns:
            Token ids with one row per sentence; each row starts with the
            START id and ends with the END id.
        """
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2, -1)
        enc = add_start_end(enc)
        return enc

    @tf.function
    def detokenize(self, tokenized):
        """Convert token ids back into text, one string per row.

        Reserved tokens other than "[UNK]" are removed and the remaining
        words are joined with single spaces.
        """
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)

    @tf.function
    def lookup(self, token_ids):
        """Return the vocabulary string for every id in `token_ids`."""
        return tf.gather(self.vocab, token_ids)

    @tf.function
    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return tf.shape(self.vocab)[0]

    @tf.function
    def get_vocab_path(self):
        """Return the vocabulary file asset tracked by this module."""
        return self._vocab_path

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens as a string tensor."""
        return tf.constant(self._reserved_tokens)

In [17]:
# Same pipeline as before, now with the [START]/[END] ids added to every row
# before detokenizing.
token_batch = add_start_end(
    eng_tokenizer.tokenize(eng_examples).merge_dims(-2, -1)
)
words = eng_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'[START]', b'that', b'guy', b'has', b'a', b'screw', b'loose', b'!',
  b'[END]']                                                           ,
 [b'[START]', b'what', b"'", b's', b'on', b'tv', b'?', b'[END]'],
 [b'[START]', b'he', b"'", b's', b'a', b'bit', b'drunk', b'.', b'[END]']]>

In [18]:
# Drop the reserved tokens (all but "[UNK]") from the detokenized words and
# join each row into a single space-separated string.
cleanup_text(config.RESERVED_TOKENS, words).numpy()

array([b'that guy has a screw loose !', b"what ' s on tv ?",
       b"he ' s a bit drunk ."], dtype=object)

In [19]:
# Collect both language tokenizers as attributes of one plain tf.Module; this
# single object is what gets passed to `tf.saved_model.save` afterwards.
tokenizers = tf.Module()
tokenizers.eng = CustomTokenizer(config, eng_vocab_path)
tokenizers.spa = CustomTokenizer(config, spa_vocab_path)

In [20]:
# Export the tokenizer bundle as a SavedModel at config.TOKENIZER_PATH.
tf.saved_model.save(tokenizers, config.TOKENIZER_PATH)

INFO:tensorflow:Assets written to: tokenizer/assets


In [21]:
# Load the export back from disk and query the English vocabulary size as a
# quick check that the reloaded module is usable.
reloaded_tokenizers = tf.saved_model.load(config.TOKENIZER_PATH)
reloaded_tokenizers.eng.get_vocab_size().numpy()

2909

In [22]:
# Tokenize a one-sentence batch with the reloaded English tokenizer; the ids
# include the START and END markers added by `tokenize`.
tokens = reloaded_tokenizers.eng.tokenize(["Hello TensorFlow!"])
tokens.numpy()

array([[   2, 2231,  427,   78,  510,  962, 2002,    4,    3]])

In [23]:
# Look up the vocabulary string behind every token id.
text_tokens = reloaded_tokenizers.eng.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'hello', b'ten', b'##s', b'##or', b'##f', b'##low', b'!',
  b'[END]']]>

In [24]:
# Detokenize the ids back into cleaned-up text and print the only sentence in
# the batch.
round_trip = reloaded_tokenizers.eng.detokenize(tokens)
print(round_trip[0].numpy().decode("utf-8"))

hello tensorflow !
