In [None]:
!pip install "tensorflow-text>=2.11"



In [None]:
import tensorflow as tf
import numpy as np
import pathlib

import tensorflow_text as tf_text

### Data pre-processing

In [None]:
# Fetch the Spanish-English archive from cloud storage and have Keras extract it
ARCHIVE_NAME = 'spa-eng.zip'
ARCHIVE_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip'

path_to_file = tf.keras.utils.get_file(fname=ARCHIVE_NAME, origin=ARCHIVE_URL, extract=True)

In [None]:
# Resolve the location of spa.txt inside the extracted archive.
# `path_to_file` is rebound here, so the cell is written to be safe to re-run:
# once the name already points at spa.txt it is left as it is instead of
# deriving a new (wrong) path from the previous result.
_download_path = pathlib.Path(path_to_file)
if _download_path.name == 'spa.txt':
  path_to_file = _download_path
else:
  # NOTE(review): depending on the Keras version, get_file(extract=True) appears
  # to return either the archive path or the extraction directory, so both
  # layouts are probed -- confirm against the installed version.
  _candidates = [
      _download_path.parent/'spa-eng/spa.txt',
      _download_path/'spa-eng/spa.txt',
  ]
  # Fall back to the first candidate so a missing file still surfaces as a
  # FileNotFoundError when the file is read.
  path_to_file = next((p for p in _candidates if p.exists()), _candidates[0])

In [None]:
# Extract the input-output pairs
def load_data(path):
  """Read a tab-separated parallel corpus and return its first two columns.

  Every non-blank line is expected to hold at least two tab-separated fields.
  Blank (whitespace-only) lines are skipped and any fields after the second
  one are ignored, so files with a trailing empty line or an extra
  attribution column load without error.

  Args:
    path: Location of the UTF-8 encoded text file (`pathlib.Path` or `str`).

  Returns:
    A `(target, context)` tuple of NumPy string arrays: `target` holds the
    first column and `context` the second, both in file order.

  Raises:
    ValueError: If a non-blank line has fewer than two tab-separated fields.
  """
  text = pathlib.Path(path).read_text(encoding='utf-8')

  pairs = []
  for line_number, line in enumerate(text.splitlines(), start=1):
    if not line.strip():
      continue  # tolerate blank lines instead of failing on unpacking
    fields = line.split('\t')
    if len(fields) < 2:
      raise ValueError(
          f'line {line_number} of {path} is not a tab-separated pair: {line!r}')
    pairs.append(fields[:2])  # ignore any extra columns

  target = np.array([first for first, _ in pairs])
  context = np.array([second for _, second in pairs])

  return target, context

In [None]:
# load_data returns (first column, second column); in the samples printed
# below the first column is the English sentence and the second the Spanish one.
english_text, spanish_text = load_data(path_to_file)

In [None]:
# Preview the first 20 sentence pairs
for sample_idx in range(20):
  english_sample, spanish_sample = english_text[sample_idx], spanish_text[sample_idx]
  print(f'{english_sample} -> {spanish_sample}')

Go. -> Ve.
Go. -> Vete.
Go. -> Vaya.
Go. -> Váyase.
Hi. -> Hola.
Run! -> ¡Corre!
Run. -> Corred.
Who? -> ¿Quién?
Fire! -> ¡Fuego!
Fire! -> ¡Incendio!
Fire! -> ¡Disparad!
Help! -> ¡Ayuda!
Help! -> ¡Socorro! ¡Auxilio!
Help! -> ¡Auxilio!
Jump! -> ¡Salta!
Jump. -> Salte.
Stop! -> ¡Parad!
Stop! -> ¡Para!
Stop! -> ¡Pare!
Wait! -> ¡Espera!


In [None]:
# Text pre-processing pipeline
def tf_lower_and_split_punct(text):
  """Standardize raw text before word-level tokenization.

  The text is Unicode-decomposed (NFKD), lower-cased, reduced to spaces,
  the letters a-z and the punctuation marks `. ? ! , ¿ ¡`, then each kept
  punctuation mark is surrounded by spaces so it becomes its own token.
  Finally the result is stripped and wrapped in `[SOS]` / `[EOS]` markers.

  Args:
    text: String tensor holding the raw sentence(s).

  Returns:
    String tensor with the standardized text, e.g.
    '[SOS] ¿ todavia esta en casa ? [EOS]' for '¿Todavía está en casa?'.
  """
  # Decompose accented characters so the accent is removed by the character
  # filter below, then lower-case everything.
  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)

  # Keep space, a to z, and select punctuation. The opening exclamation mark
  # '¡' is kept alongside '¿'; dropping it would make it impossible for the
  # Spanish vocabulary to contain it.
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿¡]', '')
  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿¡]', r' \0 ')

  # Strip leading/trailing whitespace
  text = tf.strings.strip(text)

  # Add start-of-sequence and end-of-sequence markers
  text = tf.strings.join(['[SOS]', text, '[EOS]'], separator=' ')

  return text

In [None]:
# Sanity-check the standardizer on an accented Spanish question
example_text = tf.constant('¿Todavía está en casa?')
example_standardized = tf_lower_and_split_punct(example_text)
print(example_standardized.numpy().decode())

[SOS] ¿ todavia esta en casa ? [EOS]


In [None]:
# Create the tf datasets for the training and testing data
SAMPLES = len(english_text)
TRAIN_SPLIT = 0.8
BATCH_SIZE = 32
SEED = 42  # fixed seed so the train/test split is the same on every run

# The samples are ordered, with the simplest first and the hardest last.
# To make sure the training and testing datasets are uniform, we assign each
# sample to training or testing at random (seeded for reproducibility).
is_train = np.random.default_rng(SEED).random(SAMPLES) < TRAIN_SPLIT

train_dataset = tf.data.Dataset.from_tensor_slices((english_text[is_train], spanish_text[is_train]))
# The boolean mask preserves the file's easy-to-hard ordering, so shuffle the
# training samples before batching; otherwise every epoch would run through
# the batches from short sentences to long ones.
train_dataset = train_dataset.shuffle(buffer_size=int(is_train.sum()), seed=SEED)
train_dataset = train_dataset.batch(BATCH_SIZE)

# The test split is only evaluated, so its order does not matter.
test_dataset = tf.data.Dataset.from_tensor_slices((english_text[~is_train], spanish_text[~is_train]))
test_dataset = test_dataset.batch(BATCH_SIZE)

In [None]:
# Cap the vocabulary: the training set is small, and fewer tokens also
# means faster training.
max_vocab_size = 1000

# Word-level vectorizer for the Spanish sentences, using the custom standardizer
spanish_vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,
    standardize=tf_lower_and_split_punct,
    ragged=True,
)

# Build the vocabulary from the Spanish half of the training pairs only
spanish_sentences = train_dataset.map(lambda _english, spanish: spanish)
spanish_vectorize_layer.adapt(spanish_sentences)

In [None]:
# First 20 entries of the Spanish vocabulary: the special tokens come first,
# followed by words (presumably ordered by frequency in the training data).
spanish_vectorize_layer.get_vocabulary()[:20]

['',
 '[UNK]',
 '[SOS]',
 '[EOS]',
 '.',
 'que',
 'de',
 'el',
 'a',
 'no',
 'tom',
 'la',
 '?',
 '¿',
 'en',
 'es',
 'un',
 'se',
 'me',
 ',']

In [None]:
# Same setup for the English sentences: identical standardizer and vocabulary cap
english_vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,
    standardize=tf_lower_and_split_punct,
    ragged=True,
)

# Build the vocabulary from the English half of the training pairs only
english_sentences = train_dataset.map(lambda english, _spanish: english)
english_vectorize_layer.adapt(english_sentences)

In [None]:
# First 20 entries of the English vocabulary
english_vectorize_layer.get_vocabulary()[:20]

['',
 '[UNK]',
 '[SOS]',
 '[EOS]',
 '.',
 'the',
 'i',
 'to',
 'you',
 'tom',
 'a',
 '?',
 'is',
 'he',
 'in',
 'of',
 'that',
 'it',
 ',',
 'was']

In [None]:
# Since we need to pass in the shifted outputs to the decoder as inputs we
# need to pre-process the training dataset in this manner
def prepare_inputs(english, spanish):
  """Convert a batch of raw sentence pairs into model inputs and labels.

  The Spanish token sequences are used twice, shifted by one position:
  without their last token as the decoder input, and without their first
  token as the expected decoder output.

  Args:
    english: Batch of raw English sentences.
    spanish: Batch of raw Spanish sentences.

  Returns:
    `((encoder_tokens, decoder_input_tokens), decoder_target_tokens)`, each
    converted from a ragged to a dense tensor.
  """
  encoder_tokens = english_vectorize_layer(english).to_tensor()

  spanish_tokens = spanish_vectorize_layer(spanish)
  decoder_input_tokens = spanish_tokens[:, :-1].to_tensor()
  decoder_target_tokens = spanish_tokens[:, 1:].to_tensor()

  model_inputs = (encoder_tokens, decoder_input_tokens)
  return model_inputs, decoder_target_tokens

In [None]:
# Vectorize both splits, leaving the parallelism level to tf.data.AUTOTUNE.
# Both names are rebound to the mapped datasets, so run this cell only once
# per kernel session.
train_dataset = train_dataset.map(prepare_inputs, num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(prepare_inputs, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Inspect one training batch: print the shape of each of its three tensors
for (encoder_batch, decoder_in_batch), decoder_out_batch in train_dataset.take(1):
  for batch_tensor in (encoder_batch, decoder_in_batch, decoder_out_batch):
    print(batch_tensor.shape)

(64, 5)
(64, 6)
(64, 6)


### Creating the model

In [None]:
EMBEDDING_SIZE = 256

# Vocabulary sizes determine the embedding table sizes and the output width
english_vocab_size = len(english_vectorize_layer.get_vocabulary())
spanish_vocab_size = len(spanish_vectorize_layer.get_vocabulary())

# Variable-length sequences of token ids for the encoder and the decoder
encoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_inputs = tf.keras.layers.Input(shape=(None,))

# Token embeddings; mask_zero=True marks id 0 (the padding value) as masked
encoder_embedding = tf.keras.layers.Embedding(
    input_dim=english_vocab_size,
    output_dim=EMBEDDING_SIZE,
    mask_zero=True,
)(encoder_inputs)

decoder_embedding = tf.keras.layers.Embedding(
    input_dim=spanish_vocab_size,
    output_dim=EMBEDDING_SIZE,
    mask_zero=True,
)(decoder_inputs)

# Encoder: a single LSTM that also returns its final hidden and cell state
# (a simple choice; deeper or more elaborate encoders are worth trying)
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedding)

# Decoder: an LSTM initialised with the encoder's final states that returns
# one output vector per time step
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embedding, initial_state=[state_h, state_c])

# Output layer: softmax over the Spanish vocabulary at every time step
decoder_dense = tf.keras.layers.Dense(spanish_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Assemble the two-input model and print its layer summary
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.summary()

In [None]:
# The decoder targets are integer token ids produced by the vectorization
# layer, hence the sparse variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Stop once validation accuracy has not improved for 3 epochs and restore the
# weights of the best epoch
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=20,
    callbacks=[early_stopping],
)

Epoch 1/20
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 40ms/step - accuracy: 0.2954 - loss: 3.2370 - val_accuracy: 0.2171 - val_loss: 3.2966
Epoch 2/20
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 37ms/step - accuracy: 0.4116 - loss: 1.9251 - val_accuracy: 0.3308 - val_loss: 2.2592
Epoch 3/20
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 38ms/step - accuracy: 0.4767 - loss: 1.3358 - val_accuracy: 0.3887 - val_loss: 1.7544
Epoch 4/20
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 40ms/step - accuracy: 0.5158 - loss: 1.0312 - val_accuracy: 0.4189 - val_loss: 1.5501
Epoch 5/20
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 41ms/step - accuracy: 0.5395 - loss: 0.8564 - val_accuracy: 0.4361 - val_loss: 1.4454
Epoch 6/20
[1m1488/1488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 38ms/step - accuracy: 0.5579 - loss: 0.7312 - val_accuracy: 0.4478 - val_loss: 1.3832
Epoc

In [None]:
# Inference with the model
def translate(sentence_en, max_length=50, greedy=True):
  """Translate an English sentence into Spanish one token at a time.

  At every step the partial translation is re-vectorized (the standardizer
  wraps it in [SOS] ... [EOS]), the model is run on it, and the prediction at
  the position of the next token is turned into a word.

  Args:
    sentence_en: Raw English sentence (a plain string in this notebook).
    max_length: Maximum number of tokens to generate.
    greedy: If True, pick the most probable token at each step; otherwise
      sample the token from the predicted probability distribution.

  Returns:
    The generated tokens joined by single spaces, without the [SOS]/[EOS]
    markers.
  """
  # Loop-invariant work is done once: the vocabulary list and the encoder input.
  vocabulary = spanish_vectorize_layer.get_vocabulary()
  X = np.array([english_vectorize_layer(sentence_en)])  # encoder input, batch of one

  # This will build our translation output
  translation = ""

  for word_idx in range(max_length):
    X_dec = np.array([translation])  # decoder input
    X_dec = spanish_vectorize_layer(X_dec).to_tensor()  # pre-process decoder input
    # verbose=0 keeps Keras from printing a progress bar for every token
    y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]  # next token's probas
    if greedy:  # Use the word with highest probability
      predicted_word_id = np.argmax(y_proba)
    else:  # Sample from the probability distribution
      predicted_word_id = np.random.choice(len(vocabulary), p=y_proba)
    predicted_word = vocabulary[predicted_word_id]
    # '' is the padding entry of the vocabulary: appending it would add no
    # token to `translation`, leaving `word_idx` out of step with the decoder
    # input on later iterations, so it ends the translation like [EOS] does.
    if predicted_word in ('[EOS]', ''):
      break
    translation += " " + predicted_word
  return translation.strip()

In [None]:
# Translate a short sentence with the defaults (max_length=50, greedy=True)
translate("I like soccer.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


'me gusta el futbol .'

In [None]:
# Translate a longer sentence that has no final punctuation
translate("I like soccer and the beach")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


'me gusta la playa y en la escuela .'