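# Dataset Tokenization

Tokenizes the `ted_hrlr_translate/pt_to_en` dataset with a custom BPE tokenizer, saves the tokenized train and validation splits to disk, and reloads them to inspect one example.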

## Imports

In [3]:
import sys
import os

import tensorflow_datasets as tfds
import tensorflow as tf

# Put the parent directory of the current working directory on the import
# path (once) so that the `src` package can be imported from this notebook.
root_path = os.path.abspath(os.pardir)
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)

import src.utils.byte_pair_encoding_tokenizer as bpe 

## Load dataset

In [16]:
# Load the Portuguese-to-English TED talks dataset together with its metadata.
# The splits are iterated as (pt, en) pairs by the tokenization code below.
dataset, info = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)
train_dataset = dataset['train']
val_dataset = dataset['validation']

## Prepare the tokenizer

In [17]:
# NOTE(review): the arguments are presumably the reserved special tokens and
# the location of the trained BPE vocabulary - confirm against
# CustomBPETokenizer's signature.
tokenizer = bpe.CustomBPETokenizer(
    ["[PAD]", "[UNK]", "[START]", "[END]"],
    "bpe_tokenizers/ted_hrlr_translate_pt_to_en",
)

## Tokenize and save dataset 

In [52]:
def tokenize_dataset(dataset, tokenizer, log=True, N=None):
    """Tokenize the (pt, en) pairs of ``dataset`` one example at a time.

    Args:
        dataset: Iterable of ``(pt, en)`` pairs whose items expose a
            ``.numpy()`` method returning UTF-8 encoded bytes.
        tokenizer: Object with a ``tokenize(list_of_str)`` method. It is called
            with a single-element list and the first element of its result is
            kept as the token sequence of that example.
        log: When truthy, print a progress line every 100 examples and a
            notice when the ``N`` limit stops the loop early.
        N: Optional limit on the number of examples to tokenize. Must be a
            positive number; ``None`` (default) processes the whole dataset.

    Returns:
        A ``(pt_tokenized, en_tokenized)`` tuple, each being the result of
        ``tf.ragged.stack`` over the per-example token sequences, in the
        order the examples were read.

    Raises:
        ValueError: If ``N`` is a bool or is smaller than 1.
    """
    # A bool compares equal to 0/1 and a non-positive limit can never be
    # reached, so both used to be accepted silently (e.g. when `log` and `N`
    # were swapped positionally). Reject them up front instead.
    if N is not None and (isinstance(N, bool) or N < 1):
        raise ValueError(f"N must be a positive integer or None, got {N!r}")

    pt_tokenized_list = []
    en_tokenized_list = []

    count = 0
    for pt, en in dataset:
        # Convert the raw byte payloads to Python strings.
        pt_str = pt.numpy().decode('utf-8')
        en_str = en.numpy().decode('utf-8')

        # The tokenizer works on batches; feed a batch of one and unwrap it.
        pt_tokenized_list.append(tokenizer.tokenize([pt_str])[0])
        en_tokenized_list.append(tokenizer.tokenize([en_str])[0])

        count += 1
        if log and count % 100 == 0:
            print(f"Tokenized first {count} examples...")
        if N is not None and count >= N:
            if log:
                print(f"Early exit. Tokenized first {count} examples.")
            break

    return tf.ragged.stack(pt_tokenized_list), tf.ragged.stack(en_tokenized_list)

def tokenize_and_save_dataset(dataset, tokenizer, path, log=True, N=None):
    """Tokenize ``dataset`` and save the paired token sequences at ``path``.

    Args:
        dataset: Passed through to ``tokenize_dataset``.
        tokenizer: Passed through to ``tokenize_dataset``.
        path: Destination handed to ``tf.data.Dataset.save``.
        log: Passed through to ``tokenize_dataset``.
        N: Passed through to ``tokenize_dataset``.

    Returns:
        None. The zipped (pt, en) dataset is written to ``path``.
    """
    pt_tokenized, en_tokenized = tokenize_dataset(dataset, tokenizer, log, N)

    # Build one sliced dataset per language (pt first, then en) and pair them
    # element-wise before writing to disk.
    per_language = tuple(
        tf.data.Dataset.from_tensor_slices(tf.concat(tokens, axis=0))
        for tokens in (pt_tokenized, en_tokenized)
    )
    tf.data.Dataset.zip(per_language).save(path)

In [45]:
# Locations the tokenized splits are saved to and later loaded from.
TRAIN_DATASET_PATH = "./tokenized_data/train"
VAL_DATASET_PATH = "./tokenized_data/val"

In [47]:
# Tokenize the training split and save it at TRAIN_DATASET_PATH.
# The previous call passed an undefined MAX_LENGTH as an extra positional
# argument, which no longer matches
# tokenize_and_save_dataset(dataset, tokenizer, path, log=True, N=None).
tokenize_and_save_dataset(train_dataset, tokenizer, TRAIN_DATASET_PATH, log=True)

Tokenized first 100 examples...
Tokenized first 200 examples...
Tokenized first 300 examples...
Tokenized first 400 examples...
Tokenized first 500 examples...
Tokenized first 600 examples...
Tokenized first 700 examples...
Tokenized first 800 examples...
Tokenized first 900 examples...
Tokenized first 1000 examples...
Tokenized first 1100 examples...
Tokenized first 1200 examples...
Tokenized first 1300 examples...
Tokenized first 1400 examples...
Tokenized first 1500 examples...
Tokenized first 1600 examples...
Tokenized first 1700 examples...
Tokenized first 1800 examples...
Tokenized first 1900 examples...
Tokenized first 2000 examples...
Tokenized first 2100 examples...
Tokenized first 2200 examples...
Tokenized first 2300 examples...
Tokenized first 2400 examples...
Tokenized first 2500 examples...
Tokenized first 2600 examples...
Tokenized first 2700 examples...
Tokenized first 2800 examples...
Tokenized first 2900 examples...
Tokenized first 3000 examples...
Tokenized first 310

In [36]:
# Tokenize the validation split and save it at VAL_DATASET_PATH.
# The previous call passed an undefined MAX_LENGTH as an extra positional
# argument, which no longer matches
# tokenize_and_save_dataset(dataset, tokenizer, path, log=True, N=None).
tokenize_and_save_dataset(val_dataset, tokenizer, VAL_DATASET_PATH, log=True)

Tokenized first 100 examples...
Tokenized first 200 examples...
Tokenized first 300 examples...
Tokenized first 400 examples...
Tokenized first 500 examples...
Tokenized first 600 examples...
Tokenized first 700 examples...
Tokenized first 800 examples...
Tokenized first 900 examples...
Tokenized first 1000 examples...
Tokenized first 1100 examples...


## Load the new dataset

In [48]:
# Read back the dataset stored at TRAIN_DATASET_PATH.
loaded_train_dataset = tf.data.Dataset.load(TRAIN_DATASET_PATH)

In [49]:
# Read back the dataset stored at VAL_DATASET_PATH.
loaded_val_dataset = tf.data.Dataset.load(VAL_DATASET_PATH)

## Print example

In [50]:
# Show the first (pt, en) pair of the reloaded training split, one per line.
for pt, en in loaded_train_dataset.take(1):
    print(pt, en, sep="\n")

tf.Tensor(
[   2   44  553 1021  285  120 3874  122 2698  285  120 2429 5629 1016
  252 7567  122  169  211  120  342  355  544  376  100   16    3], shape=(27,), dtype=int32)
tf.Tensor(
[   2  198  537  209 5685 4527 3361  122  209  695  919 1769  150  420
 7451  736  178 5529  122  673  186  342  355  544  407  100   16    3], shape=(28,), dtype=int32)


In [51]:
# Show the first (pt, en) pair of the reloaded validation split, one per line.
for pt, en in loaded_val_dataset.take(1):
    print(pt, en, sep="\n")

tf.Tensor([   2   59 1881  180  490 5563   44  180 2364 4364  343 2084  234    3], shape=(14,), dtype=int32)
tf.Tensor([   2   43  181  316 3672 5135  182 4124   58  234    3], shape=(11,), dtype=int32)
