# Machine translation using an Encoder-Decoder architecture

In [9]:
# Download the English-Spanish sentence-pair archive and unpack it quietly (-q).
# The data-preparation cell below reads spa-eng/spa.txt, which is expected to come from this archive.
!wget https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
!unzip -q spa-eng.zip

--2023-05-16 09:43:13--  https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.180.144, 142.250.180.176, 142.251.209.16, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.180.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2638744 (2.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2023-05-16 09:43:14 (2.40 MB/s) - ‘spa-eng.zip’ saved [2638744/2638744]



## Data preparation

In [1]:
text_file = "spa-eng/spa.txt"
# The corpus contains accented Spanish text, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open(text_file, encoding="utf-8") as f:
    # Drop the last element of the split (presumably the empty string left by a
    # trailing newline -- confirm against the file).
    lines = f.read().split("\n")[:-1]
text_pairs = []

In [2]:
# Rebuild the list from scratch instead of appending to a list created in another
# cell, so re-running this cell cannot duplicate every pair.
# Each line holds "<english>\t<spanish>"; the Spanish side is wrapped in
# "[start]" / "[end]" markers for the decoder.
text_pairs = [
    (english, "[start] " + spanish + " [end]")
    for english, spanish in (line.split("\t") for line in lines)
]

In [3]:
import random
# Sanity check: display one (english, spanish) pair picked at random.
sample_pair = random.choice(text_pairs)
print(sample_pair)

('He married my sister.', '[start] Él se casó con mi hermana. [end]')


In [4]:
import random

# The vectorizer vocabularies are adapted on the training split and the trained
# model is later reloaded from disk, so the split has to be the same after a
# kernel restart. A dedicated, seeded generator makes "Restart & Run All"
# reproducible without touching the global random state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(text_pairs)

# 15% validation, 15% reserved for test, the remainder for training.
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
# Only the first 3000 pairs of the reserved test portion are kept.
test_start = num_train_samples + num_val_samples
test_pairs = text_pairs[test_start:test_start + 3000]

In [5]:
import tensorflow as tf
import keras.layers as layers
import keras
import string
import re

# Characters removed from the Spanish text: ASCII punctuation plus "¿", except the
# square brackets, which must survive so "[start]" and "[end]" stay intact.
strip_chars = "".join(
    char for char in string.punctuation + "¿" if char not in "[]"
)

def custom_standardization(input_string):
    """Lowercase `input_string` and delete every character listed in `strip_chars`.

    Used as the `standardize` callable of the target `TextVectorization` layer.
    """
    # re.escape makes every stripped character literal inside the character class.
    strip_pattern = "[" + re.escape(strip_chars) + "]"
    return tf.strings.regex_replace(
        tf.strings.lower(input_string), strip_pattern, "")

vocab_size = 15000
sequence_length = 20

# Settings shared by both vectorizers.
_shared_vectorizer_args = dict(max_tokens=vocab_size, output_mode="int")

# English side: default standardization, fixed length `sequence_length`.
source_vectorization = layers.TextVectorization(
    output_sequence_length=sequence_length,
    **_shared_vectorizer_args,
)

# Spanish side: one extra token, because format_dataset later slices the sequence
# into a decoder input ([:, :-1]) and a target shifted by one step ([:, 1:]).
target_vectorization = layers.TextVectorization(
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
    **_shared_vectorizer_args,
)

# Vocabularies are learned from the training split only.
train_english_texts = [english for english, _ in train_pairs]
train_spanish_texts = [spanish for _, spanish in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

2023-05-20 08:32:49.773790: E tensorflow/core/lib/monitoring/collection_registry.cc:77] Cannot register 2 metrics with the same name: /tensorflow/core/saved_model/write/count
2023-05-20 08:32:49.773866: E tensorflow/core/lib/monitoring/collection_registry.cc:77] Cannot register 2 metrics with the same name: /tensorflow/core/saved_model/read/count
2023-05-20 08:32:49.773884: E tensorflow/core/lib/monitoring/collection_registry.cc:77] Cannot register 2 metrics with the same name: /tensorflow/core/saved_model/write/api
2023-05-20 08:32:49.773891: E tensorflow/core/lib/monitoring/collection_registry.cc:77] Cannot register 2 metrics with the same name: /tensorflow/core/saved_model/read/api
2023-05-20 08:32:52.151670: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operation

In [6]:
# Number of sentence pairs per batch; read by make_dataset when batching.
batch_size = 64

def format_dataset(eng, spa):
    """Vectorize a batch of sentence pairs into model inputs and targets.

    Returns a tuple `(features, labels)`: `features` maps "english" to the
    vectorized source and "spanish" to the vectorized target without its last
    token; `labels` is the vectorized target without its first token, i.e. the
    same sequence shifted one step ahead.
    """
    source_ids = source_vectorization(eng)
    target_ids = target_vectorization(spa)
    decoder_input_ids = target_ids[:, :-1]
    labels = target_ids[:, 1:]
    features = {
        "english": source_ids,
        "spanish": decoder_input_ids,
    }
    return (features, labels)

def make_dataset(pairs):
    """Build a batched, vectorized `tf.data.Dataset` from (english, spanish) pairs.

    Args:
        pairs: Non-empty sequence of `(english, spanish)` string tuples.

    Returns:
        A dataset yielding `(features, labels)` batches as produced by
        `format_dataset`, with the order of batches shuffled.
    """
    eng_texts, spa_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices(
        (list(eng_texts), list(spa_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Cache the vectorized batches first, then shuffle, then prefetch. Caching
    # after shuffle would store the first epoch's order and replay it on every
    # later epoch, and a prefetch placed before the cache cannot overlap data
    # preparation with consumption.
    return dataset.cache().shuffle(2048).prefetch(16)

# Batched pipelines used for fitting and for validation during fitting.
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [7]:
# Peek at a single batch to check the tensor shapes fed to the model.
for inputs, targets in train_ds.take(1):
    for name in ("english", "spanish"):
        print(f"inputs['{name}'].shape: {inputs[name].shape}")
    print(f"targets.shape: {targets.shape}")

2023-05-20 08:33:42.133353: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:164] None of the MLIR Optimization Passes are enabled (registered 2)


inputs['english'].shape: (64, 20)
inputs['spanish'].shape: (64, 20)
targets.shape: (64, 20)


2023-05-20 08:33:42.498250: W tensorflow/core/kernels/data/cache_dataset_ops.cc:764] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Encoder

In [8]:
class TransformerEncoder(layers.Layer):
    """Encoder block: self-attention followed by a two-layer dense projection.

    The output of each of the two sub-blocks is added to its input and then
    layer-normalized.
    """
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        """Create the sub-layers.

        Args:
            embed_dim: Units of the final dense layer; also passed as `key_dim`
                to the attention layer.
            dense_dim: Units of the hidden ReLU dense layer.
            num_heads: Number of attention heads.
            **kwargs: Forwarded to the base `Layer` constructor.
        """
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation="relu"),layers.Dense(embed_dim),])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run self-attention and the dense projection over `inputs`.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            mask: Optional mask; when given, an axis is inserted at position 1
                before it is passed as `attention_mask`.
        """
        if mask is not None:
            # Add an axis so the mask has the extra dimension expected by attention_mask.
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        # Residual connection + normalization around the attention sub-block.
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        # Residual connection + normalization around the dense sub-block.
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config


class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and a learned embedding of each token's position."""
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        """Create the two embedding tables.

        Args:
            sequence_length: Size of the position table, i.e. the number of
                distinct positions that can be embedded.
            input_dim: Size of the token table (vocabulary size).
            output_dim: Width of both embeddings.
            **kwargs: Forwarded to the base `Layer` constructor.
        """
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        # Kept so get_config() can report the constructor arguments.
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Embed `inputs` (token ids) and add the embedding of each position index."""
        # Positions are 0..length-1 along the last axis of the input.
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is False wherever the token id equals 0."""
        # Id 0 is presumably the padding id emitted by TextVectorization -- confirm.
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

## Decoder

In [9]:
class TransformerDecoder(layers.Layer):
    """Decoder block: causal self-attention, attention over the encoder output,
    and a two-layer dense projection.

    The output of each of the three sub-blocks is added to its input and then
    layer-normalized.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        """Create the sub-layers.

        Args:
            embed_dim: Units of the final dense layer; also passed as `key_dim`
                to both attention layers.
            dense_dim: Units of the hidden ReLU dense layer.
            num_heads: Number of attention heads.
            **kwargs: Forwarded to the base `Layer` constructor.
        """
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # Sub-layers are created in a fixed order; the weight list exchanged via
        # get_weights()/set_weights() elsewhere in the notebook relies on it.
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build an int32 lower-triangular mask of shape (batch, seq, seq).

        Entry (i, j) is 1 when i >= j, so each position may only attend to
        itself and to earlier positions. Batch and sequence sizes are read from
        the first two dimensions of `inputs`.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, seq, seq) mask once per batch element.
        mult = tf.concat(
             [tf.expand_dims(batch_size, -1),
              tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: Decoder-side sequence; query, key and value of the first
                attention layer.
            encoder_outputs: Encoder output; key and value of the second
                attention layer.
            mask: Optional mask for `inputs`. When given it is combined with
                the causal mask and used by the second attention layer; when
                omitted the second attention layer runs unmasked.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # Previously left unassigned, which raised as soon as the layer was
            # called without a mask.
            padding_mask = None
        # NOTE(review): the combined padding+causal mask is derived from the decoder
        # inputs but applied to the attention over encoder_outputs, while the
        # self-attention below only gets the causal mask. Left as is so the layer
        # keeps behaving like the one the saved weights were trained with --
        # confirm whether this is intended.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

## Encoder-Decoder model

In [10]:
# Model hyperparameters; reused by the float16 and posit rebuilds further down.
embed_dim = 256
dense_dim = 2048
num_heads = 8

# Encoder branch: English token ids -> positional embedding -> encoder block.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

# Decoder branch: Spanish token ids -> positional embedding -> decoder block,
# which also receives the encoder output.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="spanish")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
# One softmax over the vocabulary per output position.
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
# Input names "english"/"spanish" match the keys of the dict built by format_dataset.
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [95]:
transformer.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
english (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
spanish (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
positional_embedding (Positiona (None, None, 256)    3845120     english[0][0]                    
__________________________________________________________________________________________________
positional_embedding_1 (Positio (None, None, 256)    3845120     spanish[0][0]                    
______________________________________________________________________________________________

In [11]:
# Keep the best model seen so far on disk.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    "sequence-to-sequence/transformer.keras", save_best_only=True)
# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True)
callbacks = [checkpoint_callback, early_stopping_callback]

## Training

In [12]:
# Targets are integer token ids, hence the sparse categorical loss.
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Up to 30 epochs; the callbacks defined above may stop training earlier.
transformer.fit(
    train_ds,
    validation_data=val_ds,
    epochs=30,
    callbacks=callbacks,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30


<tensorflow.python.keras.callbacks.History at 0x7f238d3b2be0>

In [13]:
# Persist the trained model twice: as a full .keras archive (reloaded in the
# Evaluation section) and as a separate weights-only checkpoint.
transformer.save("sequence-to-sequence/transformer_best.keras")
transformer.save_weights("sequence-to-sequence/weights/transformer_best_weights")

## Evaluation

In [25]:
test_ds = make_dataset(test_pairs)

In [19]:
# The custom layer classes must be supplied so the saved architecture can be rebuilt.
custom_layers = {
    "TransformerEncoder": TransformerEncoder,
    "PositionalEmbedding": PositionalEmbedding,
    "TransformerDecoder": TransformerDecoder,
}
transformer = keras.models.load_model(
    "sequence-to-sequence/transformer_best.keras",
    custom_objects=custom_layers,
)

### Float32

In [26]:
from tensorflow.keras import backend as k
import numpy as np

k.set_floatx('float32')

In [27]:
print(f"Test acc: {transformer.evaluate(test_ds)[1]:.3f}")

Test acc: 0.649


Some translation examples

In [38]:
# Map each Spanish token id back to its token string for decoding predictions.
spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(enumerate(spa_vocab))
# Upper bound on the number of tokens generated per sentence.
max_decoded_sentence_length = 20

def decode_sequence(input_sentence, model):
    """Translate one English sentence by greedy, token-by-token decoding.

    Starting from "[start]", the sentence decoded so far is re-vectorized at
    every step and fed to `model` together with the source sentence; the most
    probable token at the current position is appended. Decoding stops at
    "[end]" or after `max_decoded_sentence_length` tokens.

    Returns:
        The decoded sentence, including the "[start]" marker and, when it was
        generated, the "[end]" marker.
    """
    source_tokens = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    step = 0
    while step < max_decoded_sentence_length:
        # Drop the last position so the length matches the decoder input used in training.
        target_tokens = target_vectorization([decoded_sentence])[:, :-1]
        predictions = model([source_tokens, target_tokens])
        next_token_index = np.argmax(predictions[0, step, :])
        next_token = spa_index_lookup[next_token_index]
        decoded_sentence = decoded_sentence + " " + next_token
        if next_token == "[end]":
            break
        step += 1
    return decoded_sentence

# Print 20 randomly chosen test sentences, each followed by the model's translation.
test_eng_texts = [english for english, _ in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-", input_sentence, sep="\n")
    print(decode_sequence(input_sentence, transformer))

-
I know someone needs to tell Tom about Mary's death. Does it have to be me?
[start] sé que alguien necesita saber por la muerte de mary es ser como para ser quien es para mí [end]
-
Tom became tired of always having to pay the bill every time he went out with Mary.
[start] tom se quedó casi siempre para escribir la cuenta cuando se había ido a mary a salir a john [end]
-
I need to know everything that happened.
[start] necesito saber todo lo que pasó [end]
-
Tom was talking to himself.
[start] tom estaba hablando sola [end]
-
Let's open the window.
[start] [UNK] a la ventana [end]
-
I'm not materialistic.
[start] no soy de la [UNK] [end]
-
Tom is passed out in bed.
[start] tom se ha pasado en cama [end]
-
You had better be ready in case he comes.
[start] será mejor que te [UNK] en caso de que vendrá [end]
-
I know how to cook dinner.
[start] yo sé cantar por la cena [end]
-
We don't have that long.
[start] no tenemos mucho tiempo [end]
-
You'll have to stand on your toes to see.
[sta

### Float16

In [28]:
k.set_floatx('float16')

# Snapshot the trained weights and show which dtypes they currently use.
ws = transformer.get_weights()
original_weight_dtypes = [weight.dtype for weight in ws]
print(np.unique(original_weight_dtypes))

[dtype('float32')]


In [29]:
# Convert the weights to the current Keras float type (float16, set in the
# previous cell) and build a fresh model with the same architecture to hold them.
wsp = [w.astype(k.floatx()) for w in ws]

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="spanish")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
model_float16 = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Compiled so evaluate() can report loss and accuracy; this model is never fitted
# in the notebook, so the optimizer choice is not exercised.
model_float16.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
# Relies on the new model creating its weights in the same order as `transformer`.
model_float16.set_weights(wsp)

# Confirm the rebuilt model stores its weights in the reduced-precision type.
print(np.unique([w.dtype for w in model_float16.get_weights()]))

[dtype('float16')]


In [30]:
print(f"Test acc: {model_float16.evaluate(test_ds)[1]:.3f}")

Test acc: 0.649


### Posit16,0

In [32]:
k.set_floatx('posit160')

# Snapshot the trained weights and show which dtypes they currently use.
ws = transformer.get_weights()
original_weight_dtypes = [weight.dtype for weight in ws]
np.unique(original_weight_dtypes)

array([dtype('float32')], dtype=object)

In [33]:
# Convert the weights to Posit <16,0> (the current Keras float type, set in the
# previous cell) and build a fresh model with the same architecture to hold them.
wsp = [w.astype(k.floatx()) for w in ws]

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="spanish")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
model_posit = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Compiled so evaluate() can report loss and accuracy; this model is never fitted
# in the notebook, so the optimizer choice is not exercised.
model_posit.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
# Relies on the new model creating its weights in the same order as `transformer`.
model_posit.set_weights(wsp)

# Confirm the rebuilt model stores its weights in the posit type.
print(np.unique([w.dtype for w in model_posit.get_weights()]))

[dtype(posit160)]


In [34]:
# Same three-decimal format as the float32 and float16 accuracy reports, so the
# numbers can be compared directly.
print(f"Test acc: {model_posit.evaluate(test_ds)[1]:.3f}")

Test acc: 0.651367


## Rouge metric

In [91]:
from rouge import Rouge

def _strip_sequence_markers(sentence):
    """Remove the "[start]" / "[end]" markers and any surrounding whitespace."""
    return sentence.replace("[start]", "").replace("[end]", "").strip()


def get_translations(model, num_samples=128, pairs=None, decode_fn=None):
    """Collect reference and model translations for a slice of sentence pairs.

    Args:
        model: Model handed to `decode_fn` for every sentence.
        num_samples: How many pairs, taken from the front of `pairs`, to
            translate. Defaults to 128, the value previously hard-coded.
        pairs: Sequence of `(english, spanish)` tuples whose Spanish side
            carries the "[start]" / "[end]" markers. Defaults to the global
            `test_pairs`.
        decode_fn: Callable `(input_sentence, model) -> decoded sentence`.
            Defaults to the global `decode_sequence`.

    Returns:
        `(references, translations)`: two lists of equal length with the
        markers removed from every sentence.
    """
    # Resolve the defaults at call time so the notebook's globals are only
    # needed when the caller does not supply replacements.
    if pairs is None:
        pairs = test_pairs
    if decode_fn is None:
        decode_fn = decode_sequence

    # NOTE(review): references keep their original casing and punctuation, while
    # the decoded sentences printed earlier in the notebook are lowercase and
    # unpunctuated. This probably lowers the word overlap counted by ROUGE --
    # confirm whether references should be standardized the same way.
    references = []
    translations = []
    for input_sentence, reference_sentence in pairs[:num_samples]:
        references.append(_strip_sequence_markers(reference_sentence))
        translations.append(
            _strip_sequence_markers(decode_fn(input_sentence, model)))

    return references, translations

rouge = Rouge()

### Float32

In [92]:
references, translations = get_translations(transformer)
rouge.get_scores(translations, references, avg=True)

{'rouge-1': {'r': 0.3641561129842379,
  'p': 0.3821077793734044,
  'f': 0.36716126583689707},
 'rouge-2': {'r': 0.1713524756493506,
  'p': 0.1771422371031746,
  'f': 0.17248166071547022},
 'rouge-l': {'r': 0.3546599234099233,
  'p': 0.3716539103257853,
  'f': 0.35731017942331067}}

### Float16

In [93]:
references, translations_float16 = get_translations(model_float16)
rouge.get_scores(translations_float16, references, avg=True)

{'rouge-1': {'r': 0.3628540296509046,
  'p': 0.3799376404845155,
  'f': 0.36561116666557214},
 'rouge-2': {'r': 0.1713524756493506,
  'p': 0.17735925099206346,
  'f': 0.1725730349848501},
 'rouge-l': {'r': 0.35335784007659,
  'p': 0.36939696588134086,
  'f': 0.35572287787103335}}

### Posit16,0

In [94]:
references, translations_posit = get_translations(model_posit)
rouge.get_scores(translations_posit, references, avg=True)

{'rouge-1': {'r': 0.37016429750804747,
  'p': 0.3826610238719614,
  'f': 0.37181635210126407},
 'rouge-2': {'r': 0.17864414231601725,
  'p': 0.18357204861111104,
  'f': 0.17952310076768963},
 'rouge-l': {'r': 0.36249440663503163,
  'p': 0.37471211514180264,
  'f': 0.36403056150297347}}

## Input conversion to Posit16,0

In [79]:
print(list(test_ds)[0])

({'english': <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[   9, 1214,    7, ...,    0,    0,    0],
       [  75,   58,    0, ...,    0,    0,    0],
       [   6,  781,    4, ...,    0,    0,    0],
       ...,
       [   3,   64,  544, ...,    0,    0,    0],
       [1713,    7, 1095, ...,    0,    0,    0],
       [  77,  109,  343, ...,    0,    0,    0]])>, 'spanish': <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[   2,   20, 2598, ...,    0,    0,    0],
       [   2,  530,    3, ...,    0,    0,    0],
       [   2,    8,  520, ...,    0,    0,    0],
       ...,
       [   2,    7, 2783, ...,    0,    0,    0],
       [   2,    1,   18, ...,    0,    0,    0],
       [   2,   35, 5253, ...,    0,    0,    0]])>}, <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[  20, 2598,   18, ...,    0,    0,    0],
       [ 530,    3,    0, ...,    0,    0,    0],
       [   8,  520,  122, ...,    0,    0,    0],
       ...,
       [   7, 2783,   10, ...,    0,   

In [80]:
def _cast_batch_to_posit(features, labels):
    """Cast both input id tensors and the label tensor of one batch to posit160.

    The cast is lossy for larger token ids: in the recorded outputs of the
    neighbouring cells, id 1713 becomes 1712 and id 2598 becomes 2596.
    """
    posit_features = {
        "english": tf.cast(features['english'], tf.posit160),
        "spanish": tf.cast(features['spanish'], tf.posit160),
    }
    return (posit_features, tf.cast(labels, tf.posit160))

test_ds_posit = test_ds.map(_cast_batch_to_posit)

In [81]:
print(list(test_ds_posit)[0])

({'english': <tf.Tensor: shape=(64, 20), dtype=posit160, numpy=
array([[9, 1214, 7, ..., 0, 0, 0],
       [75, 58, 0, ..., 0, 0, 0],
       [6, 781, 4, ..., 0, 0, 0],
       ...,
       [3, 64, 544, ..., 0, 0, 0],
       [1712, 7, 1094, ..., 0, 0, 0],
       [77, 109, 343, ..., 0, 0, 0]], dtype=posit160)>, 'spanish': <tf.Tensor: shape=(64, 20), dtype=posit160, numpy=
array([[2, 20, 2596, ..., 0, 0, 0],
       [2, 530, 3, ..., 0, 0, 0],
       [2, 8, 520, ..., 0, 0, 0],
       ...,
       [2, 7, 2780, ..., 0, 0, 0],
       [2, 1, 18, ..., 0, 0, 0],
       [2, 35, 5248, ..., 0, 0, 0]], dtype=posit160)>}, <tf.Tensor: shape=(64, 20), dtype=posit160, numpy=
array([[20, 2596, 18, ..., 0, 0, 0],
       [530, 3, 0, ..., 0, 0, 0],
       [8, 520, 122, ..., 0, 0, 0],
       ...,
       [7, 2780, 10, ..., 0, 0, 0],
       [1, 18, 1430, ..., 0, 0, 0],
       [35, 5248, 42, ..., 0, 0, 0]], dtype=posit160)>)


In [82]:
print(f"Test acc: {transformer.evaluate(test_ds_posit)[1]:.3f}")

Test acc: 0.611


## Saving the models

In [90]:
# Save the three model variants. The first call writes to the same path used
# right after training, so that file is overwritten.
transformer.save("sequence-to-sequence/transformer_best.keras")
model_float16.save("sequence-to-sequence/transformer_float16_best.keras")
model_posit.save("sequence-to-sequence/transformer_posit_best.keras")