In [1]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
!unzip -q spa-eng.zip

--2023-12-28 17:30:01--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.201.91, 142.250.184.27, 142.250.185.27, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.201.91|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2638744 (2.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2023-12-28 17:30:01 (3.46 MB/s) - ‘spa-eng.zip’ saved [2638744/2638744]



In [3]:
text_file = "./spa-eng/spa.txt"
with open(text_file, "r") as f:
    lines = f.read().split("\n")[:-1]
text_pairs = []

for line in lines:
    english, spanish = line.split("\t")
    spanish = "[start] " + spanish + " [end]"
    text_pairs.append((english, spanish))

In [4]:
import random
print(random.choice(text_pairs))

("A painter's eyes are his most important tools.", '[start] Los ojos de un pintor son sus herramientas más importantes. [end]')


In [14]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2* num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

In [10]:
import tensorflow as tf
import string
import re

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")

In [11]:
from tensorflow.keras import layers

vocab_size = 15000
sequence_length = 20

source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length
)

target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [13]:
batch_size = 64

def format_dataset(eng, spa):
    eng = source_vectorization(eng)
    spa = target_vectorization(spa)
    return ({
        "english": eng,
        "spanish": spa[:, :-1]
    }, spa[:, 1:])
        
def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(buffer_size=16).cache()

In [15]:
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)
test_ds = make_dataset(test_pairs)

In [16]:
inputs, targets = next(iter(train_ds))
print(inputs['english'].shape)
print(inputs['spanish'].shape)
print(targets.shape)

(64, 20)
(64, 20)
(64, 20)


2023-12-28 17:45:31.221148: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [17]:
from tensorflow import keras

embed_dim = 256
latent_dim = 1024

source = keras.Input(shape=(None,), dtype='int64', name='english')
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)
encoded_source = layers.Bidirectional(layers.GRU(latent_dim), merge_mode="sum")(x)

past_target = keras.Input(shape=(None,), dtype='int64', name='spanish')
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoded_source)
x = layers.Dropout(0.5)(x)
target_next_step = layers.Dense(vocab_size, activation='softmax')(x)
seq2seq_rnn = keras.Model([source, past_target], target_next_step)

2023-12-28 17:51:36.875995: W external/local_xla/xla/stream_executor/gpu/asm_compiler.cc:225] Falling back to the CUDA driver for PTX compilation; ptxas does not support CC 8.9
2023-12-28 17:51:36.876008: W external/local_xla/xla/stream_executor/gpu/asm_compiler.cc:228] Used ptxas at ptxas
2023-12-28 17:51:36.876050: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.


In [18]:
seq2seq_rnn.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
seq2seq_rnn.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 spanish (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            3840000   ['english[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            3840000   ['spanish[0][0]']             
                                                                                              

In [19]:
callbacks = [keras.callbacks.ModelCheckpoint('seq2seq_rnn.keras', save_best_only=True)]
seq2seq_rnn.fit(train_ds, epochs=15, validation_data=val_ds, callbacks=callbacks)

Epoch 1/15


2023-12-28 17:53:34.478277: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	for Tuple type infernce function 0
	while inferring type of node 'cond_35/output/_22'
2023-12-28 17:53:34.597636: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.
2023-12-28 17:53:34.657466: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to co

Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7f53bc60b210>

In [21]:
import numpy as np

spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookop = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sequence):
    tokenized_input_sequence = source_vectorization([input_sequence])
    decoded_sequence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sequence = target_vectorization([decoded_sequence])
        next_token_prediction = seq2seq_rnn.predict([tokenized_input_sequence, tokenized_target_sequence], verbose=0)
        sampled_token_index = np.argmax(next_token_prediction[0, i, :])
        sampled_token = spa_index_lookop[sampled_token_index]
        decoded_sequence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sequence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(max_decoded_sentence_length):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
Tom planted some flower seeds in his garden.
[start] tom plantó un poco de [UNK] en su jardín [end]
-
There are many words with meanings I don't know.
[start] hay muchas palabras con no lo que sé [end]
-
My father has never gotten sick in his life.
[start] mi padre nunca ha estado en su vida [end]
-
Has Tom gone insane?
[start] ha tom se ha vuelto loco [end]
-
Tom jumped off a cliff.
[start] tom [UNK] un [UNK] [end]
-
It's even very cold in May.
[start] está muy cansado así ahora [end]
-
We heard her cry.
[start] oímos a su hijo [end]
-
The doctor advised him not to smoke.
[start] el doctor le aconsejó que no se [UNK] [end]
-
He can't have said such a stupid thing.
[start] Él no puede haber dicho una cosa muy [end]
-
This is a real breakthrough.
[start] esto es un [UNK] [end]
-
I'm here to help.
[start] estoy aquí para ayudar [end]
-
That was the idea.
[start] ese era la idea [end]
-
Isn't that a little harsh?
[start] no es un poco de [UNK] [end]
-
She wants to play golf with him.
[s

In [23]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embedding = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        self.position_embedding = layers.Embedding(input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        
    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embedding(inputs)
        embedded_positions = self.position_embedding(positions)
        embedding = embedded_tokens + embedded_positions
        return embedding
    
    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
    
    def get_config(self):
        config = super().get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim
        })
        return config

In [24]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dense_dim = dense_dim
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_projection = keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim)])
        self.layer_norm1 = layers.LayerNormalization()
        self.layer_norm2 = layers.LayerNormalization()
        
    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        projection_input = self.layer_norm1(inputs + attention_output) 
        projection_output = self.dense_projection(projection_input)
        return self.layer_norm2(projection_output + projection_input)
    
    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'dense_dim': self.dense_dim,
        })
        return config

In [27]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_projection = keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim)])
        self.layer_norm1 = layers.LayerNormalization()
        self.layer_norm2 = layers.LayerNormalization()
        self.layer_norm3 = layers.LayerNormalization()
        self.supports_masking = True
        
    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'dense_dim': self.dense_dim,
            'num_heads': self.num_heads
        })
        return config
    
    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, seq_length = input_shape[0], input_shape[1]
        i = tf.range(seq_length)[:, tf.newaxis]
        j = tf.range(seq_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype="int32")], axis=0)
        return tf.tile(mask, mult)
    
    def call(self, inputs, enconder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        attention_output_1 = self.attention1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask 
        )
        attention_output_1 = self.layer_norm1(inputs + attention_output_1)
        attention_output_2 = self.attention2(
            query=attention_output_1, value=enconder_outputs, key=enconder_outputs, attention_mask=padding_mask
        )
        attention_output_2 = self.layer_norm2(attention_output_1 + attention_output_2)
        projection_output = self.dense_projection(attention_output_2)
        return self.layer_norm3(projection_output + attention_output_2)

In [29]:
embed_dim = 256
num_heads = 8
dense_dim = 2048

encoder_inputs = keras.Input(shape=(None, ), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs =TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None, ), dtype="int64", name="spanish")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[decoder_outputs])

In [30]:
transformer.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [31]:
transformer.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 spanish (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 positional_embedding_4 (Po  (None, None, 256)            3845120   ['english[0][0]']             
 sitionalEmbedding)                                                                               
                                                                                                  
 positional_embedding_5 (Po  (None, None, 256)            3845120   ['spanish[0][0]']       

In [32]:
transformer.fit(train_ds, validation_data=val_ds, epochs=30)

Epoch 1/30


2023-12-28 20:01:42.698098: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f5331962fd0>

In [36]:
spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sequence):
    tokenized_input_sequence = source_vectorization([input_sequence])
    decoded_sequence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sequence = target_vectorization([decoded_sequence])[:, :-1]
        next_token_prediction = transformer.predict([tokenized_input_sequence, tokenized_target_sequence], verbose=0)
        sampled_token_index = np.argmax(next_token_prediction[0, i, :])
        sampled_token = spa_index_lookop[sampled_token_index]
        decoded_sequence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sequence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
I went to the park yesterday.
[start] ayer fui al parque [end]
-
I've never been more proud of you.
[start] nunca he estado más orgulloso de ti [end]
-
You'll understand it right away.
[start] lo [UNK] de verdad [end]
-
Get your mind out of the gutter!
[start] [UNK] de la cabeza [end]
-
I need to go somewhere and think.
[start] necesito ir a lugar en alguna parte [end]
-
There is an apple on the table.
[start] hay una manzana sobre la mesa [end]
-
I have some presents for you.
[start] tengo libros para comprar ustedes [end]
-
He didn't get caught.
[start] Él no levantó [end]
-
It's raining very hard.
[start] está lloviendo muy difícil [end]
-
I don't know her and I don't think I want to.
[start] no sé que no y yo no quiero pensar [end]
-
That sounds scary.
[start] eso me parece la miedo [end]
-
You are not old enough to go swimming by yourself.
[start] tú no eres lo suficientemente mayor para ir a menudo [end]
-
He failed to catch the 8:30 train.
[start] hizo [UNK] para el tren de la

In [45]:
embed_dim = 256
num_heads = 8
dense_dim = 2048

encoder_inputs = keras.Input(shape=(None, ), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None, ), dtype="int64", name="spanish")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[decoder_outputs])

In [46]:
transformer.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 positional_embedding_10 (P  (None, None, 256)            3845120   ['english[0][0]']             
 ositionalEmbedding)                                                                              
                                                                                                  
 transformer_encoder_13 (Tr  (None, None, 256)            3155456   ['positional_embedding_10[0][0
 ansformerEncoder)                                                  ]']                           
                                                                                            

In [47]:
transformer.compile(optimizer='rmsprop', loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [48]:
transformer.fit(train_ds, validation_data=val_ds, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f50541f7890>

In [49]:
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
That boy is intelligent.
[start] ese chico es inteligente [end]
-
I'd like to hear you talk more about that.
[start] me gustaría oír algo más [end]
-
Are you Tom's brother?
[start] es usted el hermano de tom [end]
-
She is beautiful.
[start] es guapa [end]
-
What happened to you two?
[start] qué te pasó a ti dos [end]
-
Tom should be jailed.
[start] tom debería ser [UNK] [end]
-
The police are looking for suspects.
[start] la policía está buscando a los [UNK] [end]
-
I find myself in a rather delicate situation.
[start] yo encontré un [UNK] en una situación eso [end]
-
What's so strange about that?
[start] qué es tan extraño [end]
-
We're having a meeting at 2:30.
[start] estamos tener una reunión a las dos y media [end]
-
Can I call my friend in Japan?
[start] puede llamar a mi amigo en japón [end]
-
It can happen to anybody.
[start] le puede pasar a cualquiera [end]
-
Leaving the children alone was sheer thoughtlessness.
[start] dejar a los niños solo [UNK] [UNK] [end]
-
My father 