<div style="line-height:0.5">
<h1 style="color:#FF7C00  ">  Seq2Seq in Tensorflow </h1>
<h4>  </h4>
<h3 style="color:lightblue"> Keywords: </h3>
</div>

In [58]:
import os
import random
import numpy as np
import pandas as pd

In [59]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from sklearn.model_selection import train_test_split

In [56]:
from google.colab import files

In [4]:
uploaded = files.upload()

Saving eng-ita.txt to eng-ita.txt


<h2 style="color:#FF7C00  ">  <u> Example 1 </u> </h2>

In [60]:
english_sentences_simple = [
    "hello", "goodbye", "thank you", "please", "yes", "no", "I love you",
    "how are you", "good morning", "good night", "water", "food", "sun",
    "moon", "star", "book", "computer", "phone", "flower", "tree",
    "house", "car", "bus", "train", "sky", "cloud", "rain", "snow",
    "bird", "cat", "dog", "fish", "mountain", "valley", "ocean", "sea",
    "river", "forest", "desert", "city", "village", "country", "king",
    "queen", "prince", "princess", "happy", "sad", "angry", "excited",
    "bored", "tired", "hungry", "thirsty", "hot", "cold", "big", "small",
    "fast", "slow", "up", "down", "left", "right", "day", "night", "light",
    "dark", "young", "old", "man", "woman", "boy", "girl", "friend", "enemy",
    "song", "dance", "jump", "run", "walk", "stop", "go", "come", "push",
    "pull", "open", "close", "hard", "soft", "short", "tall", "wide", "narrow"
]


In [61]:
italian_sentences_simple = [
    "ciao", "addio", "grazie", "per favore", "sì", "no", "ti amo",
    "come stai", "buongiorno", "buonanotte", "acqua", "cibo", "sole",
    "luna", "stella", "libro", "computer", "telefono", "fiore", "albero",
    "casa", "auto", "autobus", "treno", "cielo", "nuvola", "pioggia", "neve",
    "uccello", "gatto", "cane", "pesce", "montagna", "valle", "oceano", "mare",
    "fiume", "foresta", "deserto", "città", "villaggio", "paese", "re",
    "regina", "principe", "principessa", "felice", "triste", "arrabbiato", "eccitato",
    "annoito", "stanco", "affamato", "assetato", "caldo", "freddo", "grande", "piccolo",
    "veloce", "lento", "su", "giù", "sinistra", "destra", "giorno", "notte", "luce",
    "scuro", "giovane", "vecchio", "uomo", "donna", "ragazzo", "ragazza", "amico", "nemico",
    "canzone", "danza", "salta", "corri", "cammina", "ferma", "vai", "vieni", "spingi",
    "tira", "apri", "chiudi", "duro", "morbido", "corto", "alto", "largo", "stretto"
]


<h3 style="color:#FF7C00  ">  Preprocessing </h3>

In [62]:
tokenizer_eng = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_eng.fit_on_texts(english_sentences_simple)
english_seq = tokenizer_eng.texts_to_sequences(english_sentences_simple)

tokenizer_ita = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_ita.fit_on_texts(italian_sentences_simple)
ita_seq = tokenizer_ita.texts_to_sequences(italian_sentences_simple)

max_len_eng = max([len(seq) for seq in english_seq])
max_len_frn = max([len(seq) for seq in ita_seq])

english_seq = keras.preprocessing.sequence.pad_sequences(english_seq, maxlen=max_len_eng, padding='post')
ita_seq = keras.preprocessing.sequence.pad_sequences(ita_seq, maxlen=max_len_frn, padding='post')

<h3 style="color:#FF7C00  ">  Seq2Seq Model </h3>

In [63]:
#### Parameters
embedding_dim = 50
lstm_units = 128
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_ita = len(tokenizer_ita.word_index) + 1

In [64]:
###### Encoder
encoder_input = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size_eng, embedding_dim)(encoder_input)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_output, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding)
encoder_states = [encoder_state_h, encoder_state_c]

####### Decoder
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size_ita, embedding_dim)(decoder_input)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_ita, activation='softmax')
decoder_output = decoder_dense(decoder_output)

model = keras.Model([encoder_input, decoder_input], decoder_output)

In [65]:
# Encoder inference model
encoder_model = keras.Model(encoder_input, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_inference = Embedding(vocab_size_ita, embedding_dim)(decoder_input)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding_inference, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = keras.Model([decoder_input] + decoder_states_inputs, [decoder_outputs] + decoder_states)



In [66]:
""" Training """

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Splitting data for training and testing
eng_train, eng_val, frn_train, frn_val = train_test_split(english_seq, ita_seq, test_size=0.2)

model.fit([eng_train, frn_train[:, :-1]], frn_train[:, 1:],
            validation_data=([eng_val, frn_val[:, :-1]], frn_val[:, 1:]),
            batch_size=2, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7920c45aaf20>

In [50]:
def translate(input_sentence):
    # Tokenize and pad the input sentence
    input_seq = tokenizer_eng.texts_to_sequences([input_sentence])
    input_seq = keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len_eng, padding='post')

    # Get the encoder states
    states_value = encoder_model.predict(input_seq)

    # Start token for the decoder, using the first word in our dictionary as the start point
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = 1  # Let's use the first word index as a starting point

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the token with the highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Check if the index is in the dictionary
        if sampled_token_index in tokenizer_ita.index_word:
            sampled_char = tokenizer_ita.index_word[sampled_token_index]
            decoded_sentence += ' ' + sampled_char
        else:
            print("does not exist in dictionary!")
            break  # Exit if the index isn't in the dictionary

        # Exit loop if max length is reached
        if len(decoded_sentence.split()) > max_len_frn:
            stop_condition = True

        # Update the target_seq and states for the next loop iteration
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()


In [67]:
input_sentence = "hello"
predicted_translation = translate(input_sentence)
predicted_translation

does not exist in dictionary!


'è è è ho tom'

<div style="line-height:0.5">
<h3 style="color:#FF7C00  "> Note: </h3>
</div>
Clearly, the prediction is wrong! Seq2Seq models typically require large datasets to produce accurate translations.

<h2 style="color:#FF7C00  ">  <u> Example 2 </u> </h2>

In [68]:
english_sentences = []
italian_sentences = []

with open("/content/eng-ita.txt", "r", encoding="utf-8") as f:    #./data/eng-ita if not on Colab
    lines = f.readlines()
    for line in lines:
        eng, ita = line.strip().split("\t")
        english_sentences.append(eng)
        italian_sentences.append(ita)


In [69]:
english_sentences[:5], italian_sentences[:5]

(['Hi.', 'Run!', 'Run!', 'Run!', 'Who?'],
 ['Ciao!', 'Corri!', 'Corra!', 'Correte!', 'Chi?'])

In [70]:
%%script echo skipping
""" Use only 20% of the data to avoid memory allocation problems. """

# num_samples = int(0.20 * len(english_sentences))
# # Randomly sample indices
# sampled_indices = random.sample(range(len(english_sentences)), num_samples)

# ## Use indices to sample from lists
# english_sentences_sampled = [english_sentences[i] for i in sampled_indices]
# italian_sentences_sampled = [italian_sentences[i] for i in sampled_indices]

skipping


In [71]:
# Tokenization using TensorFlow's Keras API
tokenizer_eng = tf.keras.preprocessing.text.Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
english_sequences = tokenizer_eng.texts_to_sequences(english_sentences)
vocab_size_eng = len(tokenizer_eng.word_index) + 1
max_len_eng = max([len(seq) for seq in english_sequences])

tokenizer_ita = tf.keras.preprocessing.text.Tokenizer()
tokenizer_ita.fit_on_texts(italian_sentences)
italian_sequences = tokenizer_ita.texts_to_sequences(italian_sentences)
vocab_size_ita = len(tokenizer_ita.word_index) + 1
max_len_frn = max([len(seq) for seq in italian_sequences])

# Padding sequences
english_sequences = tf.keras.preprocessing.sequence.pad_sequences(english_sequences, maxlen=max_len_eng, padding='post')
italian_sequences = tf.keras.preprocessing.sequence.pad_sequences(italian_sequences, maxlen=max_len_frn, padding='post')

In [72]:
#### Parameters
embedding_dim = 50
lstm_units = 128
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_ita = len(tokenizer_ita.word_index) + 1

In [73]:
###### Encoder
encoder_input = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size_eng, embedding_dim)(encoder_input)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_output, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding)
encoder_states = [encoder_state_h, encoder_state_c]

####### Decoder
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size_ita, embedding_dim)(decoder_input)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_ita, activation='softmax')
decoder_output = decoder_dense(decoder_output)

model = keras.Model([encoder_input, decoder_input], decoder_output)

In [74]:
# Encoder inference model
encoder_model = keras.Model(encoder_input, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_inference = Embedding(vocab_size_ita, embedding_dim)(decoder_input)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding_inference, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = keras.Model([decoder_input] + decoder_states_inputs, [decoder_outputs] + decoder_states)


In [76]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [142]:
# model.fit([english_sequences, italian_sequences[:,:-1]], keras.utils.to_categorical(italian_sequences[:,1:], num_classes=vocab_size_ita),
#                                                                                     batch_size=64,
#                                                                                     epochs=5,
#                                                                                     validation_split=0.2)


model.fit(
    [english_sequences, italian_sequences[:,:-1]],  # Inputs
    italian_sequences[:,1:],                        # Targets
    batch_size=64,
    epochs=100,
    validation_split=0.2
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x79201c1378b0>

In [143]:
def translate(input_sentence):
    # Tokenize and pad the input sentence
    input_seq = tokenizer_eng.texts_to_sequences([input_sentence])
    input_seq = keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len_eng, padding='post')

    # Get the encoder states
    states_value = encoder_model.predict(input_seq)

    # Start token for the decoder using the most frequent word in Italian sequences
    start_word = tokenizer_ita.index_word[1]
    start_index = tokenizer_ita.word_index[start_word]
    target_seq = np.zeros((1, 1))
    #target_seq[0, 0] = start_index
    target_seq[0, 0] = 1  # Let's use the first word index as a starting point

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the token with the highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Check if the index is in the dictionary
        if sampled_token_index in tokenizer_ita.index_word:
            sampled_char = tokenizer_ita.index_word[sampled_token_index]
            decoded_sentence += ' ' + sampled_char
        else:
            print("does not exist in dictionary!")
            break  # Exit if the index isn't in the dictionary

        # Exit loop if max length is reached or if the token is a repeat of the start word
        if (len(decoded_sentence.split()) > max_len_frn) or (sampled_char == start_word):
            stop_condition = True

        # Update the target_seq and states for the next loop iteration
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()


In [144]:
#%%script echo skipping
def translate2(input_sentence):
    # Tokenize and pad the input sentence
    input_seq = tokenizer_eng.texts_to_sequences([input_sentence])
    input_seq = keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len_eng, padding='post')

    # Get the encoder states
    states_value = encoder_model.predict(input_seq)

    # Initialize the starting sequence for the decoder with a value of 0
    # (this assumes 0 is not a valid token in your dictionary, but just a padding value)
    target_seq = np.zeros((1, 1))

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        print(output_tokens[0, -1, :])

        # Get the token with the highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Check if the index is in the dictionary
        if sampled_token_index in tokenizer_ita.index_word:
            sampled_word = tokenizer_ita.index_word[sampled_token_index]
            decoded_sentence += ' ' + sampled_word

            # Exit loop if max length is reached or if we encounter an end token (this can be adjusted)
            if (len(decoded_sentence.split()) > max_len_frn):
                stop_condition = True

        else:
            print("does not exist in dictionary!")
            break

        # Update the target_seq and states for the next loop iteration
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()


skipping


In [145]:
def translate3(input_sentence):
    input_seq = tokenizer_eng.texts_to_sequences([input_sentence])
    input_seq = keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len_eng, padding='post')

    states_value = encoder_model.predict(input_seq)

    # Change this to use a special <start> token if you added one, else keep as is
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = 1

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Print this for debugging
        print(output_tokens)

        predicted_index = np.argmax(output_tokens[0, -1, :])

        if predicted_index in tokenizer_ita.index_word:
            sampled_char = tokenizer_ita.index_word[predicted_index]
            decoded_sentence += ' ' + sampled_char
        else:
            print("Index not in dictionary:", predicted_index)
            break

        if (len(decoded_sentence.split()) > max_len_frn) or (sampled_char == tokenizer_ita.index_word[1]):
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = predicted_index
        states_value = [h, c]

    return decoded_sentence.strip()


In [133]:
input_sentence = "Ask Tom"
predicted_translation = translate3(input_sentence)
predicted_translation

[[[7.8913952e-05 9.6949625e-01 7.8210903e-08 ... 7.0891931e-10
   7.8226342e-10 7.0940898e-10]]]


'tom'

In [141]:
input_sentence = "Go slow."
predicted_translation = translate(input_sentence)
predicted_translation

[[[6.9520521e-01 7.1514834e-05 7.0451910e-07 ... 2.1985787e-08
   2.1940334e-08 2.1700236e-08]]]
Index not in dictionary: 0


''

In [147]:
input_sentence = "Go slow."
predicted_translation = translate2(input_sentence)
predicted_translation

does not exist in dictionary!


'vada'

In [146]:
input_sentence = "Go slow."
predicted_translation = translate(input_sentence)
predicted_translation

does not exist in dictionary!


'vada'

In [148]:
input_sentence = "Call me tomorrow."
predicted_translation = translate(input_sentence)
predicted_translation

does not exist in dictionary!


'questa domani oggi oggi'

In [150]:
input_sentence = "Call me."
predicted_translation = translate2(input_sentence)
predicted_translation

does not exist in dictionary!


'mi'

In [151]:
input_sentence = "I got fat."
predicted_translation = translate(input_sentence)
predicted_translation

does not exist in dictionary!


'sono'

In [159]:
input_sentence = "know."
predicted_translation = translate(input_sentence)
predicted_translation

does not exist in dictionary!


'lo'

In [153]:
input_sentence = "Get Away!"
predicted_translation = translate(input_sentence)
predicted_translation

does not exist in dictionary!


''

In [154]:
input_sentence = "I shaved"
predicted_translation = translate(input_sentence)
predicted_translation

does not exist in dictionary!


'la'

In [158]:
input_sentence = "Let's go!"
predicted_translation = translate3(input_sentence)
predicted_translation

[[[9.9999070e-01 7.6397136e-18 1.5178664e-18 ... 9.4466676e-13
   8.9033479e-13 9.1798411e-13]]]
Index not in dictionary: 0


''