# Tokenization
This notebook builds two subword tokenizers using TensorFlow's `text.BertTokenizer`. Based on the [Subword Tokenizer Tutorial](https://www.tensorflow.org/text/guide/subwords_tokenizer#setup) from TensorFlow.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import pathlib
import pandas as pd
import tensorflow as tf
import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset as bert_vocab,
)
from tensorflow_text.python.ops.normalize_ops import case_fold_utf8
import config

## Dataset
Load the source dataset from XXX (a tab-separated file read from `config.RAW_DATA_PATH`) and remove sentence pairs in which a sentence is too long or contains characters outside the Latin-1 (ISO 8859-1) range. Select the desired number of sentences from the processed dataset, as training on the full dataset would take too long. Split the selection into a training and a validation set and save both to disk.

In [3]:
%%time
# Read every field as a plain string and switch off NaN inference. With the
# defaults, empty fields and sentences such as "NA", "null" or "n/a" are parsed
# as NaN, and a later `.astype(str)` turns them into the literal sentence "nan".
df = pd.read_csv(
    config.RAW_DATA_PATH,
    sep="\t",
    names=["eng", "nor"],
    dtype=str,
    keep_default_na=False,
    on_bad_lines="skip",
).fillna("")  # safety net: any value still missing becomes an empty string
# A pair with an empty side is not a usable translation example, so drop it.
df = df[(df != "").all(axis=1)]

CPU times: user 40.9 s, sys: 2.19 s, total: 43.1 s
Wall time: 43.1 s


In [4]:
# Matches any single character whose code point lies above U+00FF, i.e. outside
# the Latin-1 range. `filter_sentence` uses it to reject sentences.
non_latin1_regex = re.compile(r"[^\x00-\xFF]")


def filter_sentence(row):
    """Decide whether a row of sentences is kept.

    Args:
        row: Iterable of sentence strings (one DataFrame row when called
            through `df.apply(..., axis=1)`).

    Returns:
        False if any sentence is longer than `config.SENTENCE_MAX_LEN`
        characters or contains a character matched by `non_latin1_regex`;
        True otherwise.
    """
    return not any(
        len(sentence) > config.SENTENCE_MAX_LEN
        or non_latin1_regex.search(sentence) is not None
        for sentence in row
    )


# Evaluate the filter row by row; axis=1 passes each row to `filter_sentence`.
mask = df.apply(filter_sentence, axis=1)
# Keep only the accepted rows. Note that `df` is rebound to the filtered copy,
# while `mask` still has one entry per row of the unfiltered frame.
df = df[mask].copy()

In [5]:
# `mask` holds one boolean per row of the unfiltered frame; True means "kept".
n_sentences = len(mask)
n_samples = mask.sum()
filtered_pct = (1 - n_samples / n_sentences) * 100
print(f"{filtered_pct:.2f}% of {n_sentences:,} sentences filtered out.")

10.03% of 18,605,836 sentences filtered out.


In [6]:
dataset = tf.data.Dataset.from_tensor_slices(df)
# `reshuffle_each_iteration=False` pins the shuffled order. With the default
# (True) the pipeline is reshuffled every time it is iterated, so the `take`
# and `skip` below would each see a different order and the validation and
# training splits could end up sharing sentences.
dataset = dataset.shuffle(
    config.BUFFER_SIZE, reshuffle_each_iteration=False
).take(config.N_SENTENCES)
# Fewer pairs than requested may have survived the filtering step.
n_samples = min(config.N_SENTENCES, n_samples)
val_size = int(n_samples * config.VALIDATION_SHARE)
# The first `val_size` pairs form the validation split, the rest is training.
val_dataset = dataset.take(val_size)
train_dataset = dataset.skip(val_size)

In [7]:
# Write both splits to the locations configured in `config`.
val_dataset.save(config.VAL_DATA_PATH)
train_dataset.save(config.TRAIN_DATA_PATH)

In [8]:
# Print one training pair as a quick sanity check.
for eng, nor in train_dataset.take(1):
    for language, sentence in (("English", eng), ("Norwegian", nor)):
        print(f"{language}: {sentence.numpy().decode('utf-8')}")

English: Betrally Sportsbook website is excellent in functionality content.
Norwegian: Betrally Sportsbook nettstedet er utmerket i funksjonalitet innhold.


## Vocabulary
Generate the vocabularies, one for English and one for Norwegian, from a case-folded (lower-cased) subset of the training data.

In [11]:
# Learn the vocabularies from at most `config.N_SAMPLES_TOKENIZER` training pairs.
vocab_sample = train_dataset.take(config.N_SAMPLES_TOKENIZER)
# Each element is one row of the DataFrame: index 0 is the "eng" column and
# index 1 the "nor" column. Both are case folded before the vocabulary is built.
sample_eng = vocab_sample.map(lambda x: case_fold_utf8(x[0]))
sample_nor = vocab_sample.map(lambda x: case_fold_utf8(x[1]))

In [12]:
def write_vocab_file(filepath, vocab):
    """Write a vocabulary to a text file, one token per line.

    The file is always written as UTF-8 with LF line endings, so the bytes on
    disk do not depend on the platform's default encoding or newline
    convention. The vocabulary contains non-ASCII tokens, which some default
    encodings cannot represent.

    Args:
        filepath: Destination path; an existing file is overwritten.
        vocab: Iterable of tokens, written in iteration order.
    """
    with open(filepath, "w", encoding="utf-8", newline="\n") as f:
        for token in vocab:
            print(token, file=f)

In [13]:
%%time
# Settings shared by both vocabularies; all values come from `config`.
bert_vocab_args = {
    "vocab_size": config.VOCAB_SIZE,
    "reserved_tokens": config.RESERVED_TOKENS,
    "bert_tokenizer_params": config.BERT_TOKENIZER_PARAMS,
}
# Build the English vocabulary first, then the Norwegian one, each from its
# case-folded sample fed in batches of 1000 sentences.
eng_vocab, nor_vocab = (
    bert_vocab.bert_vocab_from_dataset(
        sample.batch(1000).prefetch(2), **bert_vocab_args
    )
    for sample in (sample_eng, sample_nor)
)

CPU times: user 3min 57s, sys: 3.95 s, total: 4min 1s
Wall time: 3min 57s


In [14]:
# Peek at four windows of the English vocabulary: start, two interior ranges, end.
for window in (
    slice(None, 10),
    slice(100, 110),
    slice(1000, 1010),
    slice(-10, None),
):
    print(eng_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['ò', 'ó', 'ô', 'ö', 'ø', 'ù', 'ú', 'ü', 'ý', 'þ']
['format', 'meet', 'plan', 'problem', 'return', 'solutions', 'class', 'etc', 'future', 'grand']
['##ö', '##ù', '##ú', '##ü', '##ý', '##þ', '##́', '##̈', '##μ', '##⁄']


In [15]:
# Vocabulary files live directly inside the configured data directory.
eng_vocab_path = f"{config.DATA_DIR}/eng_vocab.txt"
nor_vocab_path = f"{config.DATA_DIR}/nor_vocab.txt"

In [16]:
# Persist each vocabulary to its own file, English first.
for vocab_file, vocab_tokens in (
    (eng_vocab_path, eng_vocab),
    (nor_vocab_path, nor_vocab),
):
    write_vocab_file(vocab_file, vocab_tokens)

## Tokenizer
Build and test the tokenizers.

In [17]:
# One tokenizer per language, each backed by its own vocabulary file and
# configured with the same parameters.
eng_tokenizer, nor_tokenizer = (
    text.BertTokenizer(vocab_file, **config.BERT_TOKENIZER_PARAMS)
    for vocab_file in (eng_vocab_path, nor_vocab_path)
)

In [18]:
# Take a single batch of three case-folded English sentences and print them.
# The loop variable `eng_examples` stays bound afterwards and is reused by the
# following cells.
for eng_examples in sample_eng.batch(3).take(1):
    for ex in eng_examples:
        print(ex.numpy().decode("utf-8"))

we can help you to find the perfect mercato san severino hotel room for almost any occaision.
i feel kinda dizzy!
well apparently it does, hence the need for the definition of the term kip.


In [19]:
# Tokenizing yields (batch, word, word-piece); merging the last two axes
# flattens each sentence to a single list of token ids -> (batch, tokens).
token_batch = eng_tokenizer.tokenize(eng_examples).merge_dims(-2, -1)

for token_ids in token_batch.to_list():
    print(token_ids)

[139, 133, 203, 119, 116, 151, 114, 298, 416, 5706, 6285, 556, 60, 3475, 2445, 129, 176, 120, 550, 172, 3124, 17]
[50, 914, 1257, 154, 1763, 499, 499, 186, 4]
[206, 539, 2257, 4606, 136, 414, 15, 5073, 114, 240, 120, 114, 4186, 117, 114, 1447, 52, 3153, 17]


In [20]:
# Replace every token id by its vocabulary entry.
txt_tokens = tf.gather(eng_vocab, token_batch)
# Glue the word pieces of each sentence together with single spaces.
joined_tokens = tf.strings.reduce_join(txt_tokens, separator=" ", axis=-1)
for joined in joined_tokens:
    print(joined.numpy().decode("utf-8"))

we can help you to find the perfect me ##rc ##ato san s ##ever ##ino hotel room for almost any occaision .
i feel kind ##a di ##z ##z ##y !
well app ##are ##ntly it does , hence the need for the definition of the term k ##ip .


In [21]:
# `detokenize` turns the token ids back into words, one list per sentence.
words = eng_tokenizer.detokenize(token_batch)
detokenized = tf.strings.reduce_join(words, separator=" ", axis=-1)
for sentence_bytes in detokenized.numpy():
    print(sentence_bytes.decode("utf-8"))

we can help you to find the perfect mercato san severino hotel room for almost any occaision .
i feel kinda dizzy !
well apparently it does , hence the need for the definition of the term kip .


## Customization and export
Define a custom tokenizer class that can be exported and used in the Transformer, including functionality for adding `[START]`/`[END]` tokens and cleaning up output after detokenization.

In [22]:
# Positions of "[START]" and "[END]" within config.RESERVED_TOKENS, found as the
# argmax of the element-wise equality mask. `add_start_end` inserts these
# values as token ids.
START = tf.argmax(tf.constant(config.RESERVED_TOKENS) == "[START]")
END = tf.argmax(tf.constant(config.RESERVED_TOKENS) == "[END]")


def add_start_end(ragged):
    """Surround every row of token ids with the START and END ids.

    Args:
        ragged: Batch of token ids with one row per sentence.

    Returns:
        The batch with one START column prepended and one END column appended
        along axis 1.
    """
    n_rows = ragged.bounding_shape()[0]
    column_shape = [n_rows, 1]
    return tf.concat(
        [tf.fill(column_shape, START), ragged, tf.fill(column_shape, END)],
        axis=1,
    )


def cleanup_text(reserved_tokens, token_txt):
    """Remove reserved tokens from detokenized text and join it into sentences.

    Args:
        reserved_tokens: Token strings to strip; "[UNK]" is always kept.
        token_txt: String tensor of words, joined along its last axis.

    Returns:
        String tensor in which the remaining words of each row are joined with
        single spaces.
    """
    # Build one alternation pattern out of the escaped tokens that must go.
    droppable_re = "|".join(
        re.escape(token) for token in reserved_tokens if token != "[UNK]"
    )

    # A cell is kept unless it matches one of the reserved tokens in full.
    keep_cells = ~tf.strings.regex_full_match(token_txt, droppable_re)
    kept_words = tf.ragged.boolean_mask(token_txt, keep_cells)

    return tf.strings.reduce_join(kept_words, separator=" ", axis=-1)


class CustomTokenizer(tf.Module):
    """Exportable wrapper around `text.BertTokenizer` for a single language.

    On top of the plain tokenizer it case-folds the input, surrounds every
    tokenized sentence with the START/END ids and strips the reserved tokens
    (except "[UNK]") when detokenizing. The concrete functions needed for
    export are traced in `__init__`.
    """

    def __init__(self, config, vocab_path):
        """Build the tokenizer and trace the signatures used for export.

        Args:
            config: Object exposing `BERT_TOKENIZER_PARAMS` (keyword arguments
                for `text.BertTokenizer`) and `RESERVED_TOKENS`.
            vocab_path: Path to a vocabulary file with one token per line.
        """
        # Initialise tf.Module's own state (e.g. the module name) before any
        # attribute is attached.
        super().__init__()
        self.tokenizer = text.BertTokenizer(
            vocab_path, **config.BERT_TOKENIZER_PARAMS
        )
        self._reserved_tokens = config.RESERVED_TOKENS
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        # The vocabulary may contain non-ASCII tokens, so decode the file as
        # UTF-8 explicitly instead of relying on the platform's default encoding.
        vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
        self.vocab = tf.Variable(vocab)

        ## Create the signatures for export:

        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string)
        )

        # Include `detokenize` and `lookup` signatures for:
        #   * `Tensors` with shape [batch, tokens]
        #   * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64)
        )
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64)
        )

        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64)
        )
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64)
        )

        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def tokenize(self, strings):
        """Convert a batch of strings into token ids.

        Args:
            strings: Batch of strings; they are case folded before tokenizing.

        Returns:
            Token ids per sentence, with the START id prepended and the END id
            appended to every row.
        """
        strings = case_fold_utf8(strings)
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2, -1)
        enc = add_start_end(enc)
        return enc

    @tf.function
    def detokenize(self, tokenized):
        """Convert token ids back into one cleaned-up string per row.

        Args:
            tokenized: Token ids with shape [batch, tokens].

        Returns:
            One string per row, with the words joined by spaces and the
            reserved tokens other than "[UNK]" removed.
        """
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)

    @tf.function
    def lookup(self, token_ids):
        """Return the vocabulary entry for every token id."""
        return tf.gather(self.vocab, token_ids)

    @tf.function
    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return tf.shape(self.vocab)[0]

    @tf.function
    def get_vocab_path(self):
        """Return the vocabulary file tracked as a SavedModel asset."""
        return self._vocab_path

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens as a string tensor."""
        return tf.constant(self._reserved_tokens)

In [23]:
# Re-tokenize the example batch, this time surrounded by the START/END ids, and
# detokenize it to show that the markers come back as reserved-token words.
token_batch = eng_tokenizer.tokenize(eng_examples).merge_dims(-2, -1)
token_batch = add_start_end(token_batch)
words = eng_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'[START]', b'we', b'can', b'help', b'you', b'to', b'find', b'the',
  b'perfect', b'mercato', b'san', b'severino', b'hotel', b'room', b'for',
  b'almost', b'any', b'occaision', b'.', b'[END]']                       ,
 [b'[START]', b'i', b'feel', b'kinda', b'dizzy', b'!', b'[END]'],
 [b'[START]', b'well', b'apparently', b'it', b'does', b',', b'hence',
  b'the', b'need', b'for', b'the', b'definition', b'of', b'the', b'term',
  b'kip', b'.', b'[END]']                                                ]>

In [24]:
# Strip the reserved tokens from the detokenized words and join each row.
cleanup_text(config.RESERVED_TOKENS, words).numpy()

array([b'we can help you to find the perfect mercato san severino hotel room for almost any occaision .',
       b'i feel kinda dizzy !',
       b'well apparently it does , hence the need for the definition of the term kip .'],
      dtype=object)

In [25]:
# Bundle both language tokenizers as attributes of one module so that they can
# be saved together.
tokenizers = tf.Module()
tokenizers.eng = CustomTokenizer(config, eng_vocab_path)
tokenizers.nor = CustomTokenizer(config, nor_vocab_path)

In [26]:
# Export the tokenizer bundle as a SavedModel at the configured location.
tf.saved_model.save(tokenizers, config.TOKENIZER_PATH)

INFO:tensorflow:Assets written to: tokenizer/assets


In [27]:
# Load the export back from disk and query the English vocabulary size to
# confirm that the saved module is usable.
reloaded_tokenizers = tf.saved_model.load(config.TOKENIZER_PATH)
reloaded_tokenizers.eng.get_vocab_size().numpy()

7585

In [28]:
# Tokenize a mixed-case sentence with the reloaded English tokenizer.
tokens = reloaded_tokenizers.eng.tokenize(["Hello TensorFlow!"])
tokens.numpy()

array([[   2, 4864,  181, 2617, 4993, 5446, 4977,    4,    3]])

In [29]:
# Show the vocabulary entry behind every token id.
text_tokens = reloaded_tokenizers.eng.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'hell', b'##o', b'ten', b'##so', b'##rf', b'##low', b'!',
  b'[END]']]>

In [30]:
# Detokenize the ids again to inspect the round trip of the example sentence.
round_trip = reloaded_tokenizers.eng.detokenize(tokens)
print(round_trip.numpy()[0].decode("utf-8"))

hello tensorflow !
