# Training the Transformer

## Imports

In [3]:
import sys
import os

# Put the directory one level above the working directory on the import path
# (once only); the `src.*` imports further down presumably rely on this.
root_path = os.path.abspath(os.pardir)
if root_path not in sys.path:
    sys.path.append(root_path)

import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf

import tensorflow_text

## Data pipeline

### Load tokenized dataset

In [4]:
# Locations of the saved, already-tokenized splits, relative to this notebook.
TRAIN_DATASET_PATH = "./../tokenized_data/train"
VAL_DATASET_PATH = "./../tokenized_data/val"

# Restore both splits from disk, training split first.
train_dataset, val_dataset = (
    tf.data.Dataset.load(split_path)
    for split_path in (TRAIN_DATASET_PATH, VAL_DATASET_PATH)
)

In [5]:
# Sanity check: show the first (pt, en) token-id pair of the training split.
for pt, en in train_dataset.take(1):
    print(pt, en, sep="\n")

tf.Tensor(
[   2   44  553 1021  285  120 3874  122 2698  285  120 2429 5629 1016
  252 7567  122  169  211  120  342  355  544  376  100   16    3], shape=(27,), dtype=int32)
tf.Tensor(
[   2  198  537  209 5685 4527 3361  122  209  695  919 1769  150  420
 7451  736  178 5529  122  673  186  342  355  544  407  100   16    3], shape=(28,), dtype=int32)


In [6]:
# Sanity check: show the first (pt, en) token-id pair of the validation split.
for pt, en in val_dataset.take(1):
    print(pt, en, sep="\n")

tf.Tensor([   2   59 1881  180  490 5563   44  180 2364 4364  343 2084  234    3], shape=(14,), dtype=int32)
tf.Tensor([   2   43  181  316 3672 5135  182 4124   58  234    3], shape=(11,), dtype=int32)


### Prepare the train and validation datasets

In [7]:
# Upper bound on the number of tokens kept per sequence.
MAX_TOKENS = 128


def prepare_batch(pt, en):
    """Convert a batch of token sequences into `((pt, en_inputs), en_labels)`.

    Both arguments are sliced along their second axis and then densified with
    `.to_tensor()`, so they are presumably `tf.RaggedTensor` batches of token
    ids -- confirm against the dataset's element spec.

    Args:
        pt: Batch of source sequences; truncated to `MAX_TOKENS` tokens.
        en: Batch of target sequences; truncated to `MAX_TOKENS + 1` tokens so
            that inputs and labels each hold at most `MAX_TOKENS` tokens after
            the one-token shift below.

    Returns:
        A `((pt, en_inputs), en_labels)` tuple where `en_labels` is the target
        shifted one position to the left relative to `en_inputs`.
    """
    source = pt[:, :MAX_TOKENS].to_tensor()

    target = en[:, :MAX_TOKENS + 1]
    # Inputs lose the last token of every row (the [END] token), labels lose
    # the first one (the [START] token).
    decoder_inputs = target[:, :-1].to_tensor()
    decoder_labels = target[:, 1:].to_tensor()

    return (source, decoder_inputs), decoder_labels

In [8]:
# Default shuffle-buffer size and batch size used by make_batches.
BUFFER_SIZE = 10000
BATCH_SIZE = 64


def make_batches(ds, shuffle=True, buffer_size=None, batch_size=None):
    """Build the batched, prefetched input pipeline for a dataset.

    Called as `make_batches(ds)` this behaves exactly as before: shuffle with
    `BUFFER_SIZE`, batch with `BATCH_SIZE`, map `prepare_batch` over the
    batches in parallel, then prefetch.

    Args:
        ds: Dataset whose batched elements `prepare_batch` can consume.
        shuffle: Whether to shuffle before batching. Pass `False` for
            pipelines where the order does not need to be randomized
            (e.g. evaluation data).
        buffer_size: Shuffle buffer size. `None` means the module-level
            `BUFFER_SIZE`, looked up at call time.
        batch_size: Number of examples per batch. `None` means the
            module-level `BATCH_SIZE`, looked up at call time.

    Returns:
        The transformed dataset yielding whatever `prepare_batch` returns.
    """
    # Resolve defaults lazily so later edits to the constants are honoured,
    # as they were when the function read the globals directly.
    if buffer_size is None:
        buffer_size = BUFFER_SIZE
    if batch_size is None:
        batch_size = BATCH_SIZE

    if shuffle:
        ds = ds.shuffle(buffer_size)

    return (
        ds
        .batch(batch_size)
        .map(prepare_batch, num_parallel_calls=tf.data.AUTOTUNE)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )

In [9]:
# Run both splits through the same batching pipeline, training split first.
train_batches, val_batches = map(make_batches, (train_dataset, val_dataset))

## Initialize the model

In [11]:
# Hyperparameters (all forwarded to the Transformer constructor below)
d_model = 128        # also drives the learning-rate schedule
num_heads = 8
dff = 512            # passed to the model as `d_ff`
num_layers = 4
dropout_rate = 0.1   # passed to the model as `dropout`

In [12]:
from src.models.transformer import Transformer
import src.utils.byte_pair_encoding_tokenizer as bpe


# Tokenizer arguments: the special tokens and the tokenizer location.
special_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
tokenizer_path = "../bpe_tokenizers/ted_hrlr_translate_pt_to_en"
tokenizer = bpe.CustomBPETokenizer(special_tokens, tokenizer_path)

# The same tokenizer supplies both the input and the target vocabulary size.
# NOTE(review): `vocab_size=0` is passed through unchanged; presumably the
# model ignores it when the input/target sizes are given -- confirm against
# the Transformer constructor.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    d_ff=dff,
    dropout=dropout_rate,
    vocab_size=0,
    input_vocab_size=tokenizer.get_vocab_size(),
    target_vocab_size=tokenizer.get_vocab_size(),
)

## Training 

### Compile the model

In [13]:
from src.models.learning_rate_schedule import CustomLearningRateSchedule
from src.utils.masked_loss import masked_loss
from src.utils.masked_accuracy import masked_accuracy

# Learning-rate schedule parameterized by the model width and a 4000-step
# warmup; it is handed to Adam in place of a fixed rate.
learning_rate = CustomLearningRateSchedule(
    d_model=d_model,
    warmup_steps=4000,
)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Train with the project's masked loss and report masked accuracy.
transformer.compile(
    optimizer=optimizer,
    loss=masked_loss,
    metrics=[masked_accuracy],
)

### Train the model

In [14]:
# Show which GPUs TensorFlow can see before training starts.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [15]:
# Pull a single (inputs, labels) batch out of the training pipeline.
batch_iterator = iter(train_batches)
example_batch_inputs, example_batch_labels = next(batch_iterator)

In [18]:
# Run the untrained model once on the sample batch; the result's shape is
# inspected in the next cell. NOTE(review): presumably this also serves to
# build the model's weights before `fit` -- confirm.
example_preds = transformer.predict(example_batch_inputs)



In [19]:
# Report the shape of the predictions made on the sample batch.
prediction_shape = example_preds.shape
print("Shape of predictions:", prediction_shape)

Shape of predictions: (64, 83, 8000)


In [20]:
# Train for three epochs, validating on the validation batches. The call is
# left as the cell's last expression so its return value is displayed.
transformer.fit(
    train_batches,
    validation_data=val_batches,
    epochs=3,
)

Epoch 1/3


2023-11-08 15:49:40.441173: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f1a30280de0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-08 15:49:40.441200: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1050 Ti with Max-Q Design, Compute Capability 6.1
2023-11-08 15:49:40.446768: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-08 15:49:40.536901: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f1af84cf250>

## Test the model

In [32]:
from src.utils.translator import Translator

# Wrap the tokenizer and the trained model in the project's Translator; the
# resulting object is called like a function by print_example below.
translator = Translator(tokenizer, transformer)

def print_example(input, target):
    """Translate `input` and print it next to the reference translation.

    Args:
        input: Source sentence, passed to the module-level `translator`.
            The name shadows the built-in `input`; it is kept so existing
            keyword callers keep working.
        target: Reference translation; only printed, never compared.

    Returns:
        None. Three lines are written to standard output.
    """
    # 128 is forwarded as the translator's second argument.
    # NOTE(review): presumably the maximum output length -- confirm against
    # Translator.
    output = translator(input, 128)

    print(f"Input sentence: {input}")
    print(f"Target sentence: {target}")
    # `output.numpy()` yields bytes, decoded as UTF-8 for display.
    print(f"Translated sentence: {output.numpy().decode('utf-8')}")

In [40]:
# A short Portuguese sentence together with its reference English translation.
input_sentence = 'esta é uma frase curta de exemplo'
target = 'this is a short example sentence'

# Translate the sentence and print it alongside the reference.
print_example(input=input_sentence, target=target)

Input sentence: esta é uma frase curta de exemplo
Target sentence: this is a short example sentence
Translated setnece: this is a little bit of example of example .
