## NMT with Transformer model

In [None]:
import numpy as np
import tensorflow as tf
from pathlib import Path
import matplotlib.pyplot as plt

In [2]:
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets", extract=True)

In [3]:
text = (Path(path).with_name("spa-eng")/"spa.txt").read_text()

In [4]:
Path(path)

PosixPath('/tmp/.keras/datasets/spa-eng.zip')

In [5]:
text = text.replace('¡','').replace('¿','')

In [6]:
text[:10]

'Go.\tVe.\nGo'

In [7]:
# Roughly 1.5M words
len(text)/5

1580384.4

In [8]:
# Translation pairs are separated by tabs
pairs = [line.split('\t') for line in text.splitlines()]

In [9]:
pairs[:10]

[['Go.', 'Ve.'],
 ['Go.', 'Vete.'],
 ['Go.', 'Vaya.'],
 ['Go.', 'Váyase.'],
 ['Hi.', 'Hola.'],
 ['Run!', 'Corre!'],
 ['Run.', 'Corred.'],
 ['Who?', 'Quién?'],
 ['Fire!', 'Fuego!'],
 ['Fire!', 'Incendio!']]

In [10]:
# Inplace shuffling the whole dataset
np.random.shuffle(pairs)

In [11]:
sentences_en, sentences_es = zip(*pairs)

In [12]:
# Embedding based on the first 1000 words
vocab_size = 1000

# Max sentence length counted in tokes, so whole words here
max_length = 50

# Ebedding dimension
embed_size = 128

## Text tokenization

In [None]:
text_vec_layer_en = tf.keras.layers.TextVectorization(vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tf.keras.layers.TextVectorization(vocab_size, output_sequence_length=max_length)

In [14]:
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f'startofseq {s} endofseq' for s in sentences_es])

In [15]:
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [16]:
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [17]:
sentences_en[99999:100_000]

("Tom held a knife to Mary's throat.",)

## Dataset split

In [18]:
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])

# Data shifter by one token for teacher forcing
X_train_dec = tf.constant([f'startofseq {s}' for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f'startofseq {s}' for s in sentences_es[100_000:]])

# The last predicted token must indicate sentence end
Y_train = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[:100_000]])
Y_valid = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[100_000:]])

In [19]:
X_train_dec[-2:]

<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'startofseq Es \xc3\xa9l japon\xc3\xa9s?',
       b'startofseq Tom le puso un cuchillo en la garganta a Mary.'],
      dtype=object)>

In [20]:
Y_train = tf.cast(Y_train, tf.float32)
Y_valid = tf.cast(Y_valid, tf.float32)

In [21]:
Y_train

<tf.Tensor: shape=(100000, 50), dtype=float32, numpy=
array([[  1.,   6.,   9., ...,   0.,   0.,   0.],
       [ 28.,  86.,   1., ...,   0.,   0.,   0.],
       [ 16.,  25.,  28., ...,   0.,   0.,   0.],
       ...,
       [ 16.,  25.,  28., ...,   0.,   0.,   0.],
       [ 12.,  44., 593., ...,   0.,   0.,   0.],
       [  8.,  26., 299., ...,   0.,   0.,   0.]], dtype=float32)>

In [22]:
Y_valid

<tf.Tensor: shape=(18964, 50), dtype=float32, numpy=
array([[ 27., 665.,  18., ...,   0.,   0.,   0.],
       [ 37.,   1.,   3., ...,   0.,   0.,   0.],
       [ 14.,  72., 940., ...,   0.,   0.,   0.],
       ...,
       [ 25., 103.,   8., ...,   0.,   0.,   0.],
       [ 20.,  15.,   1., ...,   0.,   0.,   0.],
       [  1.,  43., 831., ...,   0.,   0.,   0.]], dtype=float32)>

## Embedding and tokenizing inputs

In [23]:
# Encoder and decoder inputs
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [24]:
# Encoder and decoder inputs tokenization
# At this point tokenizers are already adapted above
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

# Casting to float32 for consistency
encoder_input_ids = tf.cast(encoder_input_ids, tf.float32)
decoder_input_ids = tf.cast(decoder_input_ids, tf.float32)

In [25]:
# Encoder and decoder tokenized inputs embedding in embed_size dimensional space
# Maskings zeros ignores contribution from padding zeros to the loss
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

In [26]:
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [27]:
batch_max_len_dec = tf.shape(decoder_embeddings)[1]

### Learnable positional encoding example

In [28]:
#pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)

#batch_max_len_enc = tf.shape(encoder_embeddings)[1]
#encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))

#batch_max_len_dec = tf.shape(decoder_embeddings)[1]
#decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))

## Fixed positional encoing

### Meshgrid allows fast vectorized evaluations on grids

In [29]:
aux = np.meshgrid(np.arange(4), 3.14 * np.arange(3))
aux

[array([[0, 1, 2, 3],
        [0, 1, 2, 3],
        [0, 1, 2, 3]]),
 array([[0.  , 0.  , 0.  , 0.  ],
        [3.14, 3.14, 3.14, 3.14],
        [6.28, 6.28, 6.28, 6.28]])]

In [30]:
np.array(2 ** aux[0] + 3 ** aux[1]).shape

(3, 4)

In [31]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        assert embed_size % 2 == 0, "embed_size must be even"
        p, i = np.meshgrid(np.arange(max_length),
                           2 * np.arange(embed_size // 2))
        pos_emb = np.empty((1, max_length, embed_size))
        pos_emb[0, :, ::2] = np.sin(p / 10_000 ** (i / embed_size)).T
        # Here we use the same values for i as above, since for odd embedding posiitons
        # we use (i-1) as the exponent value, which evaluates to the value i of the even case above
        pos_emb[0, :, 1::2] = np.cos(p / 10_000 ** (i / embed_size)).T
        self.pos_encodings = tf.constant(pos_emb.astype(self.dtype))
        self.supports_masking = True

    def call(self, inputs):
        batch_max_length = tf.shape(inputs)[1]
        return inputs + self.pos_encodings[:, :batch_max_length]
        

In [32]:
pos_embed_layer = PositionalEncoding(max_length, embed_size)

In [33]:
pos_embed_layer

<__main__.PositionalEncoding at 0x7fde3c544d00>

In [34]:
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)

In [35]:
encoder_in

<KerasTensor: shape=(None, 50, 128) dtype=float32 (created by layer 'positional_encoding')>

In [36]:
N = 2
num_heads = 8
dropout_rate = 0.1
n_unit = 128

In [37]:
encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]

In [38]:
encoder_pad_mask

<KerasTensor: shape=(None, 1, 50) dtype=bool (created by layer 'tf.__operators__.getitem_1')>

In [39]:
# Input data for the first encoder skip connection
Z = encoder_in

In [40]:
for _ in range(N):
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    Z = tf.keras.layers.Dense(n_unit, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.Dropout(dropout_rate)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    
    

In [41]:
Z

<KerasTensor: shape=(None, 50, 128) dtype=float32 (created by layer 'layer_normalization_3')>

## Decoder

In [42]:
decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]

causal_mask = tf.linalg.band_part(
    tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), -1, 0)

In [43]:
encoder_outputs = Z
Z = decoder_in

In [44]:
Z

<KerasTensor: shape=(None, 50, 128) dtype=float32 (created by layer 'positional_encoding')>

In [45]:
for _ in range(N):
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    # Cross-Attenion: Query from decoder, Key and Value from Encoder
    Z = attn_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    Z = tf.keras.layers.Dense(n_unit, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

In [46]:
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(Z)

## The model

In [47]:
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])

In [48]:
model.compile(loss="sparse_categorical_crossentropy", 
              optimizer="nadam", 
              metrics=["accuracy"],)
             #jit_compile=True)

In [55]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 text_vectorization_1 (Text  (None, 50)                   0         ['input_2[0][0]']             
 Vectorization)                                                                                   
                                                                                                  
 text_vectorization (TextVe  (None, 50)                   0         ['input_1[0][0]']         

In [60]:
pred=model.predict((X_valid[:2], X_valid_dec[:2]))



## Training

In [None]:
model.fit((X_train, X_train_dec), Y_train, 
          epochs=2, 
          validation_data=((X_valid, X_valid_dec), Y_valid))

In [None]:
def translate(sentence_en):
    translation = ""
    for word_idx in range(max_length):
        X = np.array([sentence_en])
        X_dec = np.array(["startofseq " + translation])
        y_proba = model.predict((X, X_dec))[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = text_vec_layer_es.get_vocabulary()[predicted_word_id]
        if predicted_word == 'endofseq':
            break
        translation += " " + predicted_word
    return translation.strip()

In [None]:
translate("I like soccer")

In [53]:
Y_train.shape

TensorShape([100000, 50])

In [64]:
 tf.losses.sparse_categorical_crossentropy(Y_train[:2], pred)

<tf.Tensor: shape=(2, 50), dtype=float32, numpy=
array([[7.2149835, 6.326355 , 6.5161777, 7.1277547, 6.1781445, 7.4830623,
        7.329293 , 6.813641 , 6.8545556, 6.78256  , 6.5896783, 6.4026375,
        6.3167057, 6.3564377, 6.6299744, 6.8215413, 6.781695 , 6.657982 ,
        6.5429077, 6.4739647, 6.7684364, 7.109344 , 7.1886196, 6.969151 ,
        6.8565392, 6.845389 , 6.9395986, 7.0488443, 7.0647254, 6.8975677,
        6.6062846, 6.380931 , 6.335762 , 6.5047264, 6.8638735, 7.0779147,
        7.061319 , 6.88671  , 6.7692747, 6.7698383, 6.745297 , 6.7895265,
        6.888773 , 7.0289087, 7.122264 , 7.1737127, 7.1263647, 6.975675 ,
        6.7918477, 6.71065  ],
       [5.9045415, 6.223027 , 7.0044456, 7.1401405, 7.2623553, 7.027407 ,
        6.9622273, 6.9365125, 6.854495 , 6.7824445, 6.5897593, 6.4019904,
        6.3160357, 6.3564224, 6.6300983, 6.8217583, 6.7822847, 6.65859  ,
        6.5435157, 6.4742656, 6.768617 , 7.109516 , 7.1889405, 6.969713 ,
        6.8568373, 6.845047 , 6.

In [65]:
Y_train[:2].shape

TensorShape([2, 50])

In [66]:
pred.shape

(2, 50, 1000)