# Tokenization
This notebook builds two subword tokenizers, one for English and one for Spanish, using TensorFlow Text's `text.BertTokenizer`.

In [67]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing code so that edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
import pandas as pd
import tensorflow as tf
import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import config

## Dataset
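Load the English–Spanish sentence pairs from the tab-separated file at `config.TRAIN_PATH` (TODO: cite the original data source), then shuffle them and split them into a training set and a validation set.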

In [98]:
# Read the tab-separated sentence pairs; only the first two columns are kept
# and they are named "eng" and "spa".
df = pd.read_csv(config.TRAIN_PATH, sep="\t", names=["eng", "spa"], usecols=[0, 1])
n_samples = df.shape[0]

# One dataset element per row: a string tensor holding [english, spanish].
# NOTE: tf.data.Dataset has no `reshape` method, so the former `.reshape(())`
# call is removed (it raises AttributeError on a fresh kernel).
dataset = tf.data.Dataset.from_tensor_slices(df)

# Shuffle once and freeze that order. With the default
# reshuffle_each_iteration=True the order changes on every iteration, so the
# take()/skip() split below would select different rows each time and
# validation examples would leak into the training set.
dataset = dataset.shuffle(config.BUFFER_SIZE, reshuffle_each_iteration=False)

# The first `val_size` shuffled rows form the validation set; the rest is training data.
val_size = int(n_samples * config.VALIDATION_SHARE)
val_dataset = dataset.take(val_size)
train_dataset = dataset.skip(val_size)

In [99]:
# Sanity check: decode and display a single training pair.
for eng, spa in train_dataset.take(1):
    print(
        f"English: {eng.numpy().decode('utf-8')}",
        f"Spanish: {spa.numpy().decode('utf-8')}",
        sep="\n",
    )

English: Keep track of everything that looks promising.
Spanish: Lleve un registro de todo lo que parezca prometedor.


## Vocabulary
Generate two WordPiece vocabularies from the training split: one for English and one for Spanish.

In [100]:
# Each element is an [english, spanish] pair; split it into one
# single-language dataset per column.
train_eng = train_dataset.map(lambda pair: pair[0])  # English sentences
train_spa = train_dataset.map(lambda pair: pair[1])  # Spanish sentences

In [101]:
def write_vocab_file(filepath, vocab):
  """Write a vocabulary to a text file, one token per line.

  Args:
    filepath: Destination path. An existing file is overwritten.
    vocab: Iterable of tokens; they are written in iteration order.
  """
  # Force UTF-8 instead of the platform default encoding: the vocabulary
  # contains non-ASCII tokens (e.g. '##€'), which some default encodings
  # cannot represent or would encode differently from machine to machine.
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

In [102]:
# Settings shared by the English and the Spanish vocabulary builds.
bert_vocab_args = {
    "vocab_size": config.VOCAB_SIZE,
    "reserved_tokens": config.RESERVED_TOKENS,
    "bert_tokenizer_params": config.BERT_TOKENIZER_PARAMS,
    "learn_params": {},
}

# The vocabulary builder consumes the sentences in batches of 1000,
# with up to 2 batches prefetched.
eng_batches = train_eng.batch(1000).prefetch(2)
spa_batches = train_spa.batch(1000).prefetch(2)

eng_vocab = bert_vocab.bert_vocab_from_dataset(eng_batches, **bert_vocab_args)
spa_vocab = bert_vocab.bert_vocab_from_dataset(spa_batches, **bert_vocab_args)

In [103]:
# Spot-check the English vocabulary at four positions: the first ten entries,
# two ten-entry windows further in, and the last ten entries.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(eng_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', "'", '(']
['how', 'her', 'time', 'has', 'as', 'did', 'very', 'about', 'all', 'will']
['eats', 'hiding', 'inside', 'offer', 'shopping', 'supposed', 'teach', 'boss', 'church', 'deal']
['##:', '##;', '##?', '##j', '##q', '##v', '##z', '##°', '##’', '##€']


In [104]:
# Persist both vocabularies under config.DATA_DIR; the paths are reused
# when the tokenizers are built.
eng_vocab_path = config.DATA_DIR + '/eng_vocab.txt'
spa_vocab_path = config.DATA_DIR + '/spa_vocab.txt'
for vocab_path, vocab in ((eng_vocab_path, eng_vocab), (spa_vocab_path, spa_vocab)):
    write_vocab_file(vocab_path, vocab)

## Tokenizer
Build one `text.BertTokenizer` per language from the vocabulary files written above.

In [105]:
# One tokenizer per language: each reads its own vocabulary file, and both
# share the keyword arguments in config.BERT_TOKENIZER_PARAMS.
eng_tokenizer = text.BertTokenizer(
    eng_vocab_path,
    **config.BERT_TOKENIZER_PARAMS,
)
spa_tokenizer = text.BertTokenizer(
    spa_vocab_path,
    **config.BERT_TOKENIZER_PARAMS,
)

2023-11-26 16:18:44.917909: I tensorflow/core/kernels/lookup_util.cc:414] Table trying to initialize from file data/eng_vocab.txt is already initialized.
2023-11-26 16:18:44.926815: I tensorflow/core/kernels/lookup_util.cc:414] Table trying to initialize from file data/spa_vocab.txt is already initialized.


In [108]:
# Display the raw tensor for a single batch of three sentence pairs.
sample_batches = train_dataset.batch(3).take(1)
for x in sample_batches:
    print(x)

tf.Tensor(
[[b'Nothing is missing.' b'No falta nada.']
 [b"They're inseparable." b'Ellas son inseparables.']
 [b'Leave me.' b'D\xc3\xa9jame.']], shape=(3, 2), dtype=string)
