## Text translation with bidirectional RNN with Attention

In [None]:
import numpy as np
import tensorflow as tf
from pathlib import Path

In [2]:
# Download the English-Spanish sentence-pair archive and extract it
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file(
    fname="spa-eng.zip",
    origin=url,
    cache_dir="datasets",
    extract=True,
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [3]:
!ls

Ch16-Char-RNN.ipynb		     Ch16-Sentiment_Masking_TBProjector.ipynb
Ch16_NLP_Sentient_TFHUB_Model.ipynb  logs
Ch16-NMT-BiDir-Attention.ipynb	     my_tfhub_dir
Ch16-NMT.ipynb			     shakespear_model


In [4]:
# The corpus is UTF-8 encoded (accented Spanish characters, inverted punctuation);
# decode it explicitly so the result does not depend on the platform's default encoding
text = (Path(path).with_name("spa-eng")/"spa.txt").read_text(encoding="utf-8")

In [5]:
Path(path)

PosixPath('/tmp/.keras/datasets/spa-eng.zip')

In [6]:
# Drop the Spanish inverted punctuation marks from the whole corpus
for inverted_mark in ('¡', '¿'):
    text = text.replace(inverted_mark, '')

In [7]:
text[:10]

'Go.\tVe.\nGo'

In [8]:
# Roughly 1.5M words
len(text)/5

1580384.4

In [9]:
# Every corpus line holds tab-separated fields; split each line into a list of them
pairs = [sentence_pair.split('\t') for sentence_pair in text.splitlines()]

In [10]:
pairs[:10]

[['Go.', 'Ve.'],
 ['Go.', 'Vete.'],
 ['Go.', 'Vaya.'],
 ['Go.', 'Váyase.'],
 ['Hi.', 'Hola.'],
 ['Run!', 'Corre!'],
 ['Run.', 'Corred.'],
 ['Who?', 'Quién?'],
 ['Fire!', 'Fuego!'],
 ['Fire!', 'Incendio!']]

In [11]:
# Seed the generator so the shuffle, and therefore the train/validation split
# taken from it later, is the same after every kernel restart
np.random.seed(42)
# Shuffles the list of pairs in place
np.random.shuffle(pairs)

In [12]:
# Transpose the list of pairs into two parallel tuples: the first field of
# every pair and the second field of every pair
sentences_en, sentences_es = zip(*pairs)

In [13]:
# Vocabulary size used for each text-vectorization layer and each embedding layer
vocab_size = 1000

# Maximum sentence length, counted in tokens (whole words here)
max_length = 50

In [None]:
# One vectorization layer per language, both configured identically:
# a vocabulary capped at `vocab_size` and sequences of `max_length` token ids
vectorizer_options = dict(max_tokens=vocab_size, output_sequence_length=max_length)
text_vec_layer_en = tf.keras.layers.TextVectorization(**vectorizer_options)
text_vec_layer_es = tf.keras.layers.TextVectorization(**vectorizer_options)

In [15]:
# Build the English vocabulary from the raw English sentences
text_vec_layer_en.adapt(sentences_en)

# The Spanish vocabulary must also contain the start/end markers, so wrap
# every sentence in them before adapting
sentences_es_marked = [f'startofseq {sentence} endofseq' for sentence in sentences_es]
text_vec_layer_es.adapt(sentences_es_marked)

In [16]:
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [17]:
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [18]:
sentences_en[99999:100_000]

('When the curtain went up, the stage was dark.',)

In [19]:
# The first `train_size` sentence pairs are used for training, the rest for validation
train_size = 100_000
train_en, valid_en = sentences_en[:train_size], sentences_en[train_size:]
train_es, valid_es = sentences_es[:train_size], sentences_es[train_size:]

# Encoder inputs: the raw English sentences
X_train = tf.constant(train_en)
X_valid = tf.constant(valid_en)

# Decoder inputs: Spanish sentences shifted by one token for teacher forcing
X_train_dec = tf.constant([f'startofseq {s}' for s in train_es])
X_valid_dec = tf.constant([f'startofseq {s}' for s in valid_es])

# Targets: Spanish token ids; the last predicted token must mark the sentence end
Y_train = text_vec_layer_es([f'{s} endofseq' for s in train_es])
Y_valid = text_vec_layer_es([f'{s} endofseq' for s in valid_es])

In [20]:
X_train_dec[-2:]

<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'startofseq Estoy seguro de que me dir\xc3\xa1s lo que necesito saber.',
       b'startofseq Al levantarse el tel\xc3\xb3n, la escena estaba oscura.'],
      dtype=object)>

In [21]:
# Convert the target token ids to float32.
# NOTE(review): the model is compiled with 'sparse_categorical_crossentropy',
# whose targets are class ids — confirm this float cast is actually required.
Y_train = tf.cast(Y_train, tf.float32)
Y_valid = tf.cast(Y_valid, tf.float32)

In [22]:
Y_train

<tf.Tensor: shape=(100000, 50), dtype=float32, numpy=
array([[ 32.,   1.,   1., ...,   0.,   0.,   0.],
       [  7., 384., 178., ...,   0.,   0.,   0.],
       [  8., 202., 120., ...,   0.,   0.,   0.],
       ...,
       [  7., 177.,   1., ...,   0.,   0.,   0.],
       [ 37., 209.,   4., ...,   0.,   0.,   0.],
       [ 34.,   1.,  10., ...,   0.,   0.,   0.]], dtype=float32)>

In [23]:
Y_valid

<tf.Tensor: shape=(18964, 50), dtype=float32, numpy=
array([[ 14.,  61.,   9., ...,   0.,   0.,   0.],
       [  8.,   7.,   1., ...,   0.,   0.,   0.],
       [ 20., 108.,   1., ...,   0.,   0.,   0.],
       ...,
       [  4., 110.,  91., ...,   0.,   0.,   0.],
       [ 20.,  15.,   1., ...,   0.,   0.,   0.],
       [ 16.,  25.,  10., ...,   0.,   0.,   0.]], dtype=float32)>

## Defining the model with the functional API, because the architecture is not sequential

In [25]:
# Dimension of the word-embedding vectors
embed_size = 128

In [26]:
# Symbolic model inputs: each sample is a single raw (untokenized) string,
# one input for the encoder and one for the decoder
encoder_inputs = tf.keras.layers.Input(shape=(), dtype="string")
decoder_inputs = tf.keras.layers.Input(shape=(), dtype="string")

In [27]:
# Tokenize the raw encoder and decoder strings into sequences of token ids.
# The ids are deliberately kept as integers: they are only used as lookup
# indices by the embedding layers, so converting them to float32 would just
# force the embedding layer to cast them back to integers.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

In [28]:
# Embedding layers mapping every token id to an `embed_size`-dimensional vector.
# mask_zero=True treats id 0 (the padding token) as masked, so padded positions
# can be ignored by the layers downstream.
embedding_options = dict(input_dim=vocab_size, output_dim=embed_size, mask_zero=True)
encoder_embedding_layer = tf.keras.layers.Embedding(**embedding_options)
decoder_embedding_layer = tf.keras.layers.Embedding(**embedding_options)

In [29]:
# Look up an embedding vector for every token id of the encoder and decoder inputs
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

## Encoder: Bidirectional layer wrapping LSTM

In [30]:
# 256 units rather than 512: the bidirectional wrapper runs two LSTMs
encoder_lstm = tf.keras.layers.LSTM(
    units=256,
    return_sequences=True,  # attention needs the encoder output at every step
    return_state=True,      # the internal LSTM states are reused further on
)
encoder = tf.keras.layers.Bidirectional(encoder_lstm)

# Star-unpacking (a, *b = [1, 2, 3, 4] => a=1, b=[2, 3, 4]): the first returned
# tensor is the output sequence, everything after it is collected as the states
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

## A bidirectional LSTM layer returns four internal states: a hidden state and a cell (carry) state for each of its two LSTMs (forward and backward)

In [31]:
# 50 tokens embedded in 128 dimensions get mapped to 50 tokens in 512 dimensions
encoder_outputs

<KerasTensor: shape=(None, 50, 512) dtype=float32 (created by layer 'bidirectional')>

In [33]:
# Each LSTM yields a [short-term, long-term] state pair; with two LSTMs in the
# bidirectional layer, states of the same kind are concatenated
short_term_states = encoder_state[::2]   # entries 0 and 2
long_term_states = encoder_state[1::2]   # entries 1 and 3
encoder_state = [tf.concat(short_term_states, axis=-1),
                 tf.concat(long_term_states, axis=-1)]

In [34]:
encoder_state

[<KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'tf.concat')>,
 <KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'tf.concat_1')>]

## Decoder

In [35]:
# 512 units, so the decoder state has the same width as the concatenated encoder
# state it is initialised with; the full output sequence is kept for attention
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [36]:
decoder_outputs

<KerasTensor: shape=(None, 50, 512) dtype=float32 (created by layer 'lstm_1')>

## Attention layer

In [37]:
# Attention between decoder and encoder outputs. Per the Keras Attention API the
# input list is [query, value]: decoder outputs are the queries, encoder outputs
# the values.
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])

In [38]:
attention_layer

<keras.src.layers.attention.attention.Attention at 0x7fbe9d649880>

In [39]:
attention_outputs

<KerasTensor: shape=(None, 50, 512) dtype=float32 (created by layer 'attention')>

## Output

In [40]:
# Final classifier: a softmax over `vocab_size` classes, one per vocabulary token
output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax', dtype=tf.float32)

In [41]:
output_layer

<keras.src.layers.core.dense.Dense at 0x7fbe5e59f820>

In [42]:
# Token probabilities computed from the attention outputs
Y_proba = output_layer(attention_outputs)

In [43]:
# Assemble the model: raw English and Spanish strings in, token probabilities out
model = tf.keras.models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])

In [52]:
# Sparse categorical cross-entropy: the targets are token ids, not one-hot vectors
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [53]:
# Train with teacher forcing: the model is fed the English sentences together
# with the 'startofseq'-prefixed Spanish sentences, and is fitted against the
# 'endofseq'-terminated Spanish token ids
model.fit((X_train, X_train_dec), Y_train, validation_data=((X_valid, X_valid_dec), Y_valid), epochs=10) 

Epoch 1/10


2023-10-25 20:20:49.226954: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce RTX 3060 Laptop GPU" frequency: 1425 num_cores: 30 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "11080" } environment { key: "cudnn" value: "8600" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 3145728 shared_memory_size_per_multiprocessor: 102400 memory_size: 4341760000 bandwidth: 336048000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


  10/3125 [..............................] - ETA: 8:54 - loss: 0.6564 - accuracy: 0.8202

KeyboardInterrupt: 

In [46]:
def translate(sentence_en):
    """Greedily translate an English sentence into Spanish, one token at a time.

    At every step the model receives the English sentence together with the
    Spanish translation produced so far (prefixed with "startofseq"). The most
    probable token at the current position is appended to the translation,
    until "endofseq" is predicted or `max_length` tokens have been generated.

    Args:
        sentence_en: The English sentence to translate, as a plain string.

    Returns:
        The translation as a string of space-separated Spanish vocabulary tokens.
    """
    # Loop-invariant work is done once: the id -> token table and the encoder input
    vocabulary_es = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])

    translation = ""
    for word_idx in range(max_length):
        # Decoder input: everything translated so far, after the start marker
        X_dec = np.array(["startofseq " + translation])
        # verbose=0 avoids printing one progress bar per generated token
        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary_es[predicted_word_id]
        if predicted_word == 'endofseq':
            break
        translation += " " + predicted_word
    return translation.strip()

In [47]:
translate("I like soccer and also going to the beach")

2023-10-25 20:18:31.957082: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce RTX 3060 Laptop GPU" frequency: 1425 num_cores: 30 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "11080" } environment { key: "cudnn" value: "8600" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 3145728 shared_memory_size_per_multiprocessor: 102400 memory_size: 4341760000 bandwidth: 336048000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




'me gusta el fútbol y también a la playa'