## Neural Machine Translation with LSTM

In [None]:
import numpy as np
import tensorflow as tf
from pathlib import Path

In [2]:
# Fetch the Spanish-English sentence-pair archive. `extract=True` asks Keras to
# unpack the zip, and `cache_dir="datasets"` is the cache location given to get_file.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets", extract=True)

In [3]:
# Read the whole corpus into memory. The encoding is given explicitly because
# Path.read_text() otherwise falls back to the platform's locale encoding, which
# can mis-decode the accented Spanish characters on systems where it is not UTF-8.
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

In [4]:
# Delete the inverted exclamation and question marks from the corpus.
text = text.translate({ord('¡'): None, ord('¿'): None})

In [5]:
text[:10]

'Go.\tVe.\nGo'

In [6]:
# Roughly 1.5M words
len(text)/5

1580384.4

In [7]:
# Break every corpus line into its tab-separated fields (English first, Spanish second).
pairs = list(map(lambda raw_line: raw_line.split('\t'), text.splitlines()))

In [8]:
pairs[:10]

[['Go.', 'Ve.'],
 ['Go.', 'Vete.'],
 ['Go.', 'Vaya.'],
 ['Go.', 'Váyase.'],
 ['Hi.', 'Hola.'],
 ['Run!', 'Corre!'],
 ['Run.', 'Corred.'],
 ['Who?', 'Quién?'],
 ['Fire!', 'Fuego!'],
 ['Fire!', 'Incendio!']]

In [9]:
# Seed NumPy's global RNG first so the in-place shuffle, and therefore the
# train/validation split taken from the shuffled pairs, is the same on every run.
np.random.seed(42)
np.random.shuffle(pairs)

In [10]:
# Transpose the list of [english, spanish] pairs into two parallel tuples.
sentences_en, sentences_es = zip(*pairs)

In [11]:
# Size shared by both TextVectorization layers, both Embedding layers and the
# final Dense output layer.
vocab_size = 1000
# Used as output_sequence_length of the vectorizers and as the step limit in translate().
max_length = 50

In [None]:
# One vectorizer per language, configured identically: same vocabulary budget
# and same fixed output sequence length.
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)

In [13]:
# Build the English vocabulary from the raw English sentences.
text_vec_layer_en.adapt(sentences_en)

# The Spanish vocabulary is built from sentences wrapped in the start/end
# markers so that both markers become vocabulary entries.
es_with_markers = [f'startofseq {s} endofseq' for s in sentences_es]
text_vec_layer_es.adapt(es_with_markers)

In [14]:
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [15]:
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [16]:
# Index at which the shuffled corpus is cut: everything before it is training
# data, everything from it onwards is validation data.
n_train = 100_000
en_train, en_valid = sentences_en[:n_train], sentences_en[n_train:]
es_train, es_valid = sentences_es[:n_train], sentences_es[n_train:]

# Encoder inputs: the raw English strings.
X_train = tf.constant(en_train)
X_valid = tf.constant(en_valid)

# Decoder inputs: the Spanish sentence prefixed with the start marker.
X_train_dec = tf.constant([f'startofseq {s}' for s in es_train])
X_valid_dec = tf.constant([f'startofseq {s}' for s in es_valid])

# Targets: the Spanish sentence followed by the end marker, vectorized to token ids.
Y_train = text_vec_layer_es([f'{s} endofseq' for s in es_train])
Y_valid = text_vec_layer_es([f'{s} endofseq' for s in es_valid])

In [17]:
# Re-type the target token ids as float32 tensors.
# NOTE(review): sparse_categorical_crossentropy is normally fed integer class
# ids — confirm this float cast is actually needed.
Y_train = tf.cast(Y_train, tf.float32)
Y_valid = tf.cast(Y_valid, tf.float32)

In [18]:
Y_train

<tf.Tensor: shape=(100000, 50), dtype=float32, numpy=
array([[549., 229., 206., ...,   0.,   0.,   0.],
       [ 16.,  25.,   7., ...,   0.,   0.,   0.],
       [  7., 617.,  52., ...,   0.,   0.,   0.],
       ...,
       [  7., 426.,   1., ...,   0.,   0.,   0.],
       [  7.,   1.,   1., ...,   0.,   0.,   0.],
       [ 37.,   1.,   6., ...,   0.,   0.,   0.]], dtype=float32)>

In [19]:
Y_valid

<tf.Tensor: shape=(18964, 50), dtype=float32, numpy=
array([[100.,  22., 714., ...,   0.,   0.,   0.],
       [  1.,   6.,   1., ...,   0.,   0.,   0.],
       [  8., 104.,   1., ...,   0.,   0.,   0.],
       ...,
       [ 20.,  15., 368., ...,   0.,   0.,   0.],
       [ 20.,  60., 488., ...,   0.,   0.,   0.],
       [ 80.,   1.,  19., ...,   0.,   0.,   0.]], dtype=float32)>

# Defining the model with the functional API, because it is not sequential

In [20]:
# Width of each embedding vector (second argument of both Embedding layers).
embed_size = 128

In [21]:
# Symbolic model inputs: one scalar string per example (shape=[]) for the
# encoder side and one for the decoder side.
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [22]:
# Run the raw strings through the adapted vectorizers to get token-id sequences.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

# Cast the ids to float32 before they reach the embedding layers.
# NOTE(review): embedding lookups are normally given integer ids — confirm this
# cast is required.
encoder_input_ids = tf.cast(encoder_input_ids, tf.float32)
decoder_input_ids = tf.cast(decoder_input_ids, tf.float32)

In [23]:
# Separate embedding tables for the two languages. mask_zero=True treats id 0
# (the '' entry at index 0 of both vocabularies) as a masked position.
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

In [24]:
# Look up an embed_size-wide vector for every token id on each side.
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [25]:
# return_state=True makes the LSTM return its final states in addition to its output.
encoder = tf.keras.layers.LSTM(512, return_state=True)

# In Python a, *b = [1, 2, 3, 4] => a=1, b=[2, 3, 4]
# so encoder_state collects every value returned after the first one.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [26]:
encoder_outputs

<KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'lstm')>

In [27]:
encoder_state

[<KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'lstm')>,
 <KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'lstm')>]

In [28]:
# The decoder emits an output at every time step (return_sequences=True) and is
# initialised with the states captured from the encoder.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [29]:
decoder_outputs

<KerasTensor: shape=(None, 50, 512) dtype=float32 (created by layer 'lstm_1')>

In [30]:
# Softmax classifier with vocab_size units, applied to the decoder outputs.
output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax', dtype=tf.float32)

In [31]:
output_layer

<keras.src.layers.core.dense.Dense at 0x7fd1ae572460>

In [32]:
# Softmax scores over the vocab_size classes for each decoder time step.
Y_proba = output_layer(decoder_outputs)

In [33]:
# Two string inputs (English sentence, Spanish decoder input) -> token probabilities.
# Explicit device placement via tf.device("/GPU:0") is left disabled.
model = tf.keras.models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])

In [36]:
# jit_compile=True is left disabled.
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [None]:
# Train for 10 epochs on (English, decoder-input) -> target ids, evaluating on
# the validation split after each epoch.
model.fit((X_train, X_train_dec), Y_train, validation_data=((X_valid, X_valid_dec), Y_valid), epochs=10)

In [None]:
def translate(sentence_en):
    """Greedily translate an English sentence with the trained model.

    The decoder input starts as "startofseq " and grows by one predicted word
    per step: at step ``word_idx`` the most probable token at output position
    ``word_idx`` is looked up in the Spanish vocabulary and appended. Decoding
    stops as soon as "endofseq" is predicted, or after ``max_length`` steps.

    Args:
        sentence_en: The English sentence to translate, as a plain string.

    Returns:
        The predicted words separated by spaces, without the
        "startofseq"/"endofseq" markers and with surrounding whitespace stripped.
    """
    # Neither of these changes between decoding steps, so build them once
    # instead of once per predicted word.
    vocabulary_es = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])

    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])
        # verbose=0 stops predict() from printing a progress bar for every word.
        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]
        predicted_word = vocabulary_es[np.argmax(y_proba)]
        if predicted_word == 'endofseq':
            break
        translation += " " + predicted_word
    return translation.strip()

In [None]:
translate("I like soccer")