In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import tensorflow.keras.callbacks as callbacks
import tensorflow_datasets as tfds
import tensorflow_addons as tfa

import numpy as np
import matplotlib.pyplot as plt
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)
logical_gpus = tf.config.experimental.list_logical_devices('GPU')
print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
print('tensorflow version', tf.version.VERSION)

1 Physical GPUs, 1 Logical GPUs
tensorflow version 2.3.2


In [2]:
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']
print(train_examples)
print(val_examples)

<DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.string)>
<DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.string)>


In [3]:
pt_text = []
en_text = []

for pt_example, en_example in train_examples:
  pt_text.append(pt_example.numpy().decode('utf-8'))
  en_text.append(en_example.numpy().decode('utf-8'))
  
print(pt_text[:3])
print(en_text[:3])

['e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .', 'mas e se estes fatores fossem ativos ?', 'mas eles não tinham a curiosidade de me testar .']
['and when you improve searchability , you actually take away the one advantage of print , which is serendipity .', 'but what if it were active ?', "but they did n't test for curiosity ."]


In [4]:
def train_tokenizer(text, vocab_size):
  tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
  trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["[PAD]", "[UNK]", "[START]", "[END]"])
  tokenizer.pre_tokenizer = Whitespace()
  tokenizer.train_from_iterator(text, trainer)
  return tokenizer

pt_tokenizer = train_tokenizer(pt_text, 9000)
en_tokenizer = train_tokenizer(en_text, 9000)
pt_tokenizer.enable_padding()
en_tokenizer.enable_padding()
print('pt vocabs: ', pt_tokenizer.get_vocab_size())
print('en vocabs: ', en_tokenizer.get_vocab_size())

pt vocabs:  9000
en vocabs:  9000


In [5]:
BUFFER_SIZE = 20000
BATCH_SIZE = 32

def encode_text(pt, en):
  pt_text = ['[START] ' + p.decode('utf-8') + ' [END]' for p in pt.numpy()]
  en_text = ['[START] ' + e.decode('utf-8') + ' [END]' for e in en.numpy()]
  pt_outputs = pt_tokenizer.encode_batch(pt_text)
  en_outputs = en_tokenizer.encode_batch(en_text)
  pt_ids = [p.ids for p in pt_outputs]
  en_ids = [e.ids for e in en_outputs]
  return pt_ids, en_ids

def tokenization(pt, en):
  encoded = tf.py_function(func=encode_text, inp=[pt, en], Tout=[tf.int32, tf.int32])
  decoder_input = encoded[1][:, :-1]
  decoder_output = encoded[1][:, 1:]
  return (encoded[0], decoder_input), decoder_output


train_ds = (train_examples
            .cache()
            .shuffle(BUFFER_SIZE)
            .batch(BATCH_SIZE)
            .map(tokenization)
            .prefetch(tf.data.experimental.AUTOTUNE))


for (encoder_inputs, decoder_inputs), decoder_outputs in train_ds.take(1):
  print(encoder_inputs)
  print(decoder_inputs)
  print(decoder_outputs)

tf.Tensor(
[[   2 1873 1676 ...    0    0    0]
 [   2 2668   54 ...    0    0    0]
 [   2  186  695 ...    0    0    0]
 ...
 [   2   44  203 ...    0    0    0]
 [   2  425   14 ...    0    0    0]
 [   2  302  472 ...    0    0    0]], shape=(32, 70), dtype=int32)
tf.Tensor(
[[   2 2894  106 ...    0    0    0]
 [   2  126  289 ...    0    0    0]
 [   2  164  157 ...    0    0    0]
 ...
 [   2  126  223 ...    0    0    0]
 [   2   99  320 ...    0    0    0]
 [   2  102  154 ...    0    0    0]], shape=(32, 71), dtype=int32)
tf.Tensor(
[[2894  106  257 ...    0    0    0]
 [ 126  289    9 ...    0    0    0]
 [ 164  157  777 ...    0    0    0]
 ...
 [ 126  223  146 ...    0    0    0]
 [  99  320   90 ...    0    0    0]
 [ 102  154  126 ...    0    0    0]], shape=(32, 71), dtype=int32)


In [6]:
max_length = 0

for (encoder_inputs, decoder_inputs), decoder_outputs in train_ds:
  max_length = max(max_length, encoder_inputs.shape[1])
  max_length = max(max_length, decoder_inputs.shape[1])
  max_length = max(max_length, decoder_outputs.shape[1])

print('max length: ', max_length)

max length:  245


In [7]:
def positional_encoding(position, size):
  pos = tf.range(position)[:, tf.newaxis]
  i = tf.range(size)[tf.newaxis, :]
  ii = tf.cast((i / 2) * 2, tf.float32)
  angle_rads = 1 / tf.pow(10000, ii / size)
  angle_rads = tf.cast(pos, tf.float32) * angle_rads

  # apply sin to even indices in the array; 2i
  sin = tf.sin(angle_rads[:, 0::2])
  # apply cos to odd indices in the array; 2i+1
  cos = tf.cos(angle_rads[:, 1::2])
  pos_encoding = tf.concat([sin, cos], axis=-1)
  pos_encoding = pos_encoding[np.newaxis, ...]
  return tf.cast(pos_encoding, dtype=tf.float32)

pos_en = positional_encoding(100, 128)
print(pos_en)

tf.Tensor(
[[[ 0.          0.          0.         ...  1.          1.
    1.        ]
  [ 0.841471    0.7617204   0.68156135 ...  1.          1.
    1.        ]
  [ 0.9092974   0.98704624  0.99748    ...  0.99999994  0.99999994
    1.        ]
  ...
  [ 0.37960774  0.7341857  -0.4645332  ...  0.9999034   0.9999276
    0.9999457 ]
  [-0.57338184 -0.0414884  -0.94348747 ...  0.9999014   0.99992603
    0.99994457]
  [-0.99920684 -0.78794664 -0.9162827  ...  0.9998994   0.99992454
    0.99994344]]], shape=(1, 100, 128), dtype=float32)


In [8]:
def create_padding_mask(x, key_size):
  seq = tf.cast(tf.logical_not(tf.equal(x, 0)), tf.float32)
  seq = seq[:, tf.newaxis, :]
  return tf.transpose(tf.tile(seq, [1, key_size, 1]), [0, 2, 1])

def create_look_ahead_mask(x_length, y_length):
  mask = tf.linalg.band_part(tf.ones((x_length, y_length)), -1, 0)
  return mask

def create_masks(x, y):
  x_length = tf.shape(x)[1]
  y_length = tf.shape(y)[1]
  encoder_padding_mask = create_padding_mask(x, x_length)
  decoder_padding_mask1 = create_padding_mask(y, y_length)
  decoder_padding_mask2 = create_padding_mask(y, x_length)
  look_ahead_mask = create_look_ahead_mask(y_length, y_length)
  combined_mask = tf.minimum(decoder_padding_mask1, look_ahead_mask)
  return encoder_padding_mask, combined_mask, decoder_padding_mask2


for (encoder_inputs, decoder_inputs), decoder_outputs in train_ds.take(1):
  encoder_padding_mask, combined_mask, decoder_padding_mask = create_masks(encoder_inputs, decoder_inputs)
  print('encoder_padding mask: ', encoder_padding_mask.shape)
  print('look ahead mask: ', combined_mask.shape)
  print('decoder_padding mask:', decoder_padding_mask.shape)
  print('look ahead mask: ', combined_mask)

encoder_padding mask:  (32, 53, 53)
look ahead mask:  (32, 56, 56)
decoder_padding mask: (32, 56, 53)
look ahead mask:  tf.Tensor(
[[[1. 0. 0. ... 0. 0. 0.]
  [1. 1. 0. ... 0. 0. 0.]
  [1. 1. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 1. 0. ... 0. 0. 0.]
  [1. 1. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 1. 0. ... 0. 0. 0.]
  [1. 1. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 1. 0. ... 0. 0. 0.]
  [1. 1. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 1. 0. ... 0. 0. 0.]
  [1. 1. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 1. 0. ... 0

In [9]:
EMBEDDING_SIZE = 128
MAX_LENGTH = 250
NUM_HEADS = 8
NUM_LAYERS = 4
DENSE_OUTPUT = 512
DROPOUT_RATE = 0.1

def embedding(x, vocab_size):
  length = tf.shape(x)[1]
  position = tf.range(start=0, limit=length, delta=1)
  em = layers.Embedding(vocab_size, EMBEDDING_SIZE)(x)
  pos_em = layers.Embedding(MAX_LENGTH, EMBEDDING_SIZE)(position)
  return em + pos_em

def encoder(x, mask):
  m = tfa.layers.MultiHeadAttention(EMBEDDING_SIZE, NUM_HEADS)([x, x, x], mask=mask)
  d = layers.Dropout(DROPOUT_RATE)(m)
  n = layers.LayerNormalization(epsilon=1e-6)(d + x)
  x = layers.Dense(DENSE_OUTPUT, activation='relu')(n)
  x = layers.Dense(EMBEDDING_SIZE)(x)
  d = layers.Dropout(DROPOUT_RATE)(x)
  x = layers.LayerNormalization(epsilon=1e-6)(d + n)
  return x

def decoder(k, v, q, look_ahead_mask, padding_mask):
  m = tfa.layers.MultiHeadAttention(EMBEDDING_SIZE, NUM_HEADS)([q, q, q], mask=look_ahead_mask)
  d = layers.Dropout(DROPOUT_RATE)(m)
  n = layers.LayerNormalization(epsilon=1e-6)(d + q)
  
  m = tfa.layers.MultiHeadAttention(EMBEDDING_SIZE, NUM_HEADS)([n, k, v], mask=padding_mask)
  d = layers.Dropout(DROPOUT_RATE)(m)
  n = layers.LayerNormalization(epsilon=1e-6)(d + n)
  
  x = layers.Dense(DENSE_OUTPUT, activation='relu')(n)
  x = layers.Dense(EMBEDDING_SIZE)(x)
  d = layers.Dropout(DROPOUT_RATE)(x)
  x = layers.LayerNormalization(epsilon=1e-6)(d + n)
  return x


def create_model():
  encoder_inputs = layers.Input(shape=[None], dtype=tf.int32)
  decoder_inputs = layers.Input(shape=[None], dtype=tf.int32)
  encoder_mask, look_ahead_mask, decoder_padding_mask = create_masks(encoder_inputs, decoder_inputs)
  e = embedding(encoder_inputs, pt_tokenizer.get_vocab_size())
  d = embedding(decoder_inputs, en_tokenizer.get_vocab_size())
  for _ in range(NUM_LAYERS):
    e = encoder(e, encoder_mask)
  for _ in range(NUM_LAYERS):
    d = decoder(e, e, d, look_ahead_mask, decoder_padding_mask)
  outputs = layers.Dense(en_tokenizer.get_vocab_size(), activation='softmax')(d)
  model = keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[outputs])
  return model


keras.backend.clear_session()
model = create_model()
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
tf_op_layer_Equal (TensorFlowOp [(None, None)]       0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_Shape_2 (TensorFlow [(2,)]               0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_LogicalNot (TensorF [(None, None)]       0           tf_op_layer_Equal[0][0]          
_______________________________________________________________________________________

In [10]:
def loss_fn(y_true, y_pred):
  mask = tf.logical_not(tf.equal(y_true, 0))
  mask = tf.cast(mask, dtype=tf.float32)
  loss = keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
  return tf.reduce_sum(loss * mask) / tf.reduce_sum(mask)

def accuracy_fn(real, pred):
  accuracies = tf.equal(tf.cast(real, tf.int64), tf.argmax(pred, axis=2))
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  accuracies = tf.math.logical_and(mask, accuracies)

  accuracies = tf.cast(accuracies, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)


for (encoder_inputs, decoder_inputs), decoder_outputs in train_ds.take(1):
  print('encoder_inputs: ', encoder_inputs.shape)
  print('decoder_inputs: ', decoder_inputs.shape)
  inputs = (tf.convert_to_tensor(encoder_inputs), tf.convert_to_tensor(decoder_inputs))
  prediction = model.predict(inputs)
  print('prediction: ', prediction.shape)
  decoder_pred = model(inputs)
  print('prediction tensor: ', decoder_pred.shape)
  print('decoder_outputs: ', decoder_outputs.shape)
  print('softmax result: ', tf.argmax(decoder_pred, axis=-1))
  loss = loss_fn(decoder_outputs, decoder_pred)
  print(decoder_pred)
  print('loss: ', loss)
  acc = accuracy_fn(decoder_outputs, decoder_pred)
  print('accuracy: ', acc)

encoder_inputs:  (32, 97)
decoder_inputs:  (32, 101)
prediction:  (32, 101, 9000)
prediction tensor:  (32, 101, 9000)
decoder_outputs:  (32, 101)
softmax result:  tf.Tensor(
[[8052 4154 4154 ... 7792 6967 8639]
 [1981 7393 4154 ... 3718 1550 2657]
 [1981 4154 4154 ... 3718 1550 2657]
 ...
 [1981 4154 4154 ... 3718 1550 2657]
 [1981 4154 4154 ... 3718 1550 2657]
 [8888 7393 4154 ... 3718 1550 2657]], shape=(32, 101), dtype=int64)
tf.Tensor(
[[[1.17449265e-04 8.14972445e-05 2.03551215e-04 ... 1.74050801e-04
   7.88674079e-05 9.99307013e-05]
  [1.19461562e-04 9.45702195e-05 1.53504763e-04 ... 1.51525310e-04
   7.99787304e-05 1.06491425e-04]
  [1.39074415e-04 9.08720613e-05 1.74228247e-04 ... 1.39343159e-04
   8.47972769e-05 1.20650722e-04]
  ...
  [1.36129733e-04 1.03933045e-04 1.43667363e-04 ... 1.33304682e-04
   9.47814115e-05 1.46123100e-04]
  [1.26654253e-04 8.41241781e-05 1.44574980e-04 ... 1.35737471e-04
   8.14384839e-05 1.32193993e-04]
  [1.25492661e-04 8.03852672e-05 1.03422055e-

In [11]:
model.compile(optimizer=keras.optimizers.Adam(1e-4),
              loss=loss_fn, metrics=[accuracy_fn])

In [12]:
def scheduler(epoch, lr):
  if epoch == 0:
    return 1e-4
  if epoch == 10:
    return 3e-5
  return lr

EPOCHS = 20

tensorboard_callback = callbacks.TensorBoard(log_dir='translation_logs')
schedule_callback = callbacks.LearningRateScheduler(scheduler, verbose=True)

history = model.fit(train_ds, epochs=EPOCHS, callbacks=[schedule_callback, tensorboard_callback])


Epoch 00001: LearningRateScheduler reducing learning rate to 0.0001.
Epoch 1/20
Instructions for updating:
use `tf.profiler.experimental.stop` instead.


Instructions for updating:
use `tf.profiler.experimental.stop` instead.



Epoch 00002: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 2/20

Epoch 00003: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 3/20

Epoch 00004: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 4/20

Epoch 00005: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 5/20

Epoch 00006: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 6/20

Epoch 00007: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 7/20

Epoch 00008: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 8/20

Epoch 00009: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 9/20

Epoch 00010: LearningRateScheduler reducing learning rate to 9.999999747378752e-05.
Epoch 10/20

Epoch 00011: LearningRateScheduler reducing learning rate to 3e-05.
Epoch 11/20

Epoch 00012: LearningRateScheduler reducing learning 

In [14]:
def translate(pt):
  pt_encoded = pt_tokenizer.encode('[START] ' + pt + ' [END]')
  encoder_inputs = [pt_encoded.ids]
  decoder_inputs = [en_tokenizer.encode('[START]').ids]
  stop_token = en_tokenizer.encode('[END]').ids[0]
  print(encoder_inputs)
  print(decoder_inputs)
  for i in range(MAX_LENGTH-1):
    encoder_input_tensor = tf.convert_to_tensor(encoder_inputs)
    decoder_input_tensor = tf.convert_to_tensor(decoder_inputs)
    p = model((encoder_input_tensor, decoder_input_tensor), training=False)
    token = np.argmax(p[0, -1, :])
    decoder_inputs[0].append(token)
    if token == stop_token:
      break
  print(decoder_inputs[0])
  return en_tokenizer.decode(decoder_inputs[0])


pt = 'e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .'
en = translate(pt)
print(en)

[[2, 44, 284, 458, 193, 40, 1864, 14, 5841, 40, 1233, 4658, 136, 4001, 14, 130, 86, 40, 226, 147, 160, 321, 206, 16, 3]]
[[2]]
[2, 99, 255, 117, 634, 292, 93, 90, 868, 13, 117, 428, 90, 419, 3891, 106, 90, 6582, 13, 331, 97, 90, 158, 120, 92, 163, 1659, 15, 3]
and when you better look at the search , you take the only advantage of the impression , which is the un be re di vor .


In [15]:
model.save('translation.h5')

In [16]:
import tensorflowjs as tfjs

tfjs.converters.save_keras_model(model, 'translation.tfjs')

  return h5py.File(h5file)


In [21]:
import json

with open('translation.tfjs/vocab.json', 'w') as f:
  vocabs = {
    'en': en_tokenizer.get_vocab(),
    'pt': pt_tokenizer.get_vocab(),
  }
  json.dump(vocabs, f)