<div style="line-height:0.5">
<h1 style="color:#FF7C00  ">  Seq2Seq in Tensorflow </h1>
<h4>  </h4> 
<h3 style="color:lightblue"> Keywords: </h3>  keras pad_sequences()
</div> 

In [None]:
import os
import random
import numpy as np
import pandas as pd

In [None]:
# Lower TensorFlow's native (C++) log verbosity. The assignment is deliberately
# placed ahead of the tensorflow import below.
# NOTE(review): '2' is understood to hide INFO and WARNING messages — confirm
# against the TensorFlow version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'       
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from sklearn.model_selection import train_test_split

<h2 style="color:#FF7C00  ">  <u> Example 1 </u> </h2>

In [None]:
# Source side of the toy parallel corpus (94 short English words/phrases).
# Entries are paired by position with `italian_sentences`, so the order and
# the number of items must stay in sync with that list.
english_sentences = [
    "hello", "goodbye", "thank you", "please", "yes", "no", "I love you",
    "how are you", "good morning", "good night", "water", "food", "sun",
    "moon", "star", "book", "computer", "phone", "flower", "tree", 
    "house", "car", "bus", "train", "sky", "cloud", "rain", "snow",
    "bird", "cat", "dog", "fish", "mountain", "valley", "ocean", "sea",
    "river", "forest", "desert", "city", "village", "country", "king",
    "queen", "prince", "princess", "happy", "sad", "angry", "excited",
    "bored", "tired", "hungry", "thirsty", "hot", "cold", "big", "small",
    "fast", "slow", "up", "down", "left", "right", "day", "night", "light",
    "dark", "young", "old", "man", "woman", "boy", "girl", "friend", "enemy",
    "song", "dance", "jump", "run", "walk", "stop", "go", "come", "push",
    "pull", "open", "close", "hard", "soft", "short", "tall", "wide", "narrow"
]


In [None]:
# Target side of the toy parallel corpus (94 Italian words/phrases).
# Entries are paired by position with `english_sentences`, so the order and
# the number of items must stay in sync with that list.
italian_sentences = [
    "ciao", "addio", "grazie", "per favore", "sì", "no", "ti amo",
    "come stai", "buongiorno", "buonanotte", "acqua", "cibo", "sole",
    "luna", "stella", "libro", "computer", "telefono", "fiore", "albero",
    "casa", "auto", "autobus", "treno", "cielo", "nuvola", "pioggia", "neve",
    "uccello", "gatto", "cane", "pesce", "montagna", "valle", "oceano", "mare",
    "fiume", "foresta", "deserto", "città", "villaggio", "paese", "re",
    "regina", "principe", "principessa", "felice", "triste", "arrabbiato", "eccitato",
    # "annoiato" (bored) was previously misspelled "annoito", which taught the
    # model a non-existent word.
    "annoiato", "stanco", "affamato", "assetato", "caldo", "freddo", "grande", "piccolo",
    "veloce", "lento", "su", "giù", "sinistra", "destra", "giorno", "notte", "luce",
    "scuro", "giovane", "vecchio", "uomo", "donna", "ragazzo", "ragazza", "amico", "nemico",
    "canzone", "danza", "salta", "corri", "cammina", "ferma", "vai", "vieni", "spingi",
    "tira", "apri", "chiudi", "duro", "morbido", "corto", "alto", "largo", "stretto"
]


<h3 style="color:#FF7C00  ">  Preprocessing </h3>

In [None]:
# Character-level tokenizers, one per language.
# Note: the `*_frn` / `french_*` names hold the ITALIAN side of the corpus;
# the names are kept because later cells refer to them.
tokenizer_eng = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_frn = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_eng.fit_on_texts(english_sentences)
tokenizer_frn.fit_on_texts(italian_sentences)

# Text -> lists of integer character ids.
english_seq = tokenizer_eng.texts_to_sequences(english_sentences)
french_seq = tokenizer_frn.texts_to_sequences(italian_sentences)

# Longest sequence on each side; used as the padded width below and again
# when padding the input inside translate().
max_len_eng = max(map(len, english_seq))
max_len_frn = max(map(len, french_seq))

# Pad at the end ('post') so every row has the same length.
english_seq = keras.preprocessing.sequence.pad_sequences(
    english_seq, padding='post', maxlen=max_len_eng
)
french_seq = keras.preprocessing.sequence.pad_sequences(
    french_seq, padding='post', maxlen=max_len_frn
)


<h3 style="color:#FF7C00  ">  Seq2Seq Model </h3>

In [None]:
#### Parameters
# Width of the embedding vectors and of the LSTM state, shared by encoder and decoder.
embedding_dim, lstm_units = 50, 128

# One more than the number of known tokens
# (presumably to leave room for index 0, which padding uses — confirm).
vocab_size_eng = 1 + len(tokenizer_eng.word_index)
vocab_size_frn = 1 + len(tokenizer_frn.word_index)

In [None]:
###### Encoder
# Reads the padded source sequence and hands its final LSTM states (h, c) to
# the decoder. `mask_zero=True` makes the LSTM skip the trailing padding zeros,
# so the states reflect the last real token instead of a run of padding steps.
encoder_input = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size_eng, embedding_dim, mask_zero=True)(encoder_input)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_output, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding)
encoder_states = [encoder_state_h, encoder_state_c]

####### Decoder
# Starts from the encoder states and predicts a distribution over the target
# vocabulary at every time step.
# The decoder embedding is intentionally left unmasked: translate() stops as
# soon as the predicted index is not a known token, so the model still needs
# to learn to predict the padding index after the last real token
# (presumably index 0 — confirm).
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size_frn, embedding_dim)(decoder_input)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_frn, activation='softmax')
decoder_output = decoder_dense(decoder_output)

# Training model: (source ids, shifted target ids) -> per-step target probabilities.
model = keras.Model([encoder_input, decoder_input], decoder_output)

In [None]:
# Encoder inference model: source ids -> final encoder states [h, c].
encoder_model = keras.Model(encoder_input, encoder_states)

# Decoder inference model: runs the decoder step by step, with the LSTM states
# supplied (and returned) explicitly so the caller can feed them back in.
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the embedding that is already applied to `decoder_input` in the
# training graph. Instantiating a new Embedding layer here would give the
# inference decoder its own randomly initialised weights that model.fit()
# never updates.
decoder_embedding_inference = decoder_embedding
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding_inference, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = keras.Model([decoder_input] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
""" Training """

# Fixed seed so the train/validation membership is the same on every run.
SPLIT_SEED = 42

# Targets are integer ids, hence the sparse variant of the cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Splitting data for training and testing (80% / 20%)
eng_train, eng_val, frn_train, frn_val = train_test_split(
    english_seq, french_seq, test_size=0.2, random_state=SPLIT_SEED
)

# Teacher forcing: the decoder is fed the target without its last step and is
# trained to predict the target shifted one step to the left.
model.fit([eng_train, frn_train[:, :-1]], frn_train[:, 1:],
            validation_data=([eng_val, frn_val[:, :-1]], frn_val[:, 1:]),
            batch_size=2, epochs=100)

In [None]:
def translate(input_sentence):
    """Greedily decode a translation of ``input_sentence``.

    The sentence is tokenized with ``tokenizer_eng``, padded to ``max_len_eng``
    and encoded with ``encoder_model``. ``decoder_model`` is then run one step
    at a time, always feeding back the most probable token, until it predicts
    an index that is not in ``tokenizer_frn.index_word`` or ``max_len_frn``
    tokens have been produced.

    Args:
        input_sentence: Source text to translate.

    Returns:
        The decoded text with surrounding whitespace stripped. Tokens are
        concatenated directly when the target tokenizer is character-level and
        joined with single spaces otherwise.

    Note:
        The target sequences built in the preprocessing cell carry no explicit
        start token, so decoding is seeded with token index 1 as a stand-in.
    """
    # Tokenize and pad the input sentence
    input_seq = tokenizer_eng.texts_to_sequences([input_sentence])
    input_seq = keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len_eng, padding='post')

    # Get the encoder states; list() so they can be concatenated with the
    # decoder input below.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Seed the decoder with token index 1 (there is no dedicated start token)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = 1

    # Characters must be glued together as-is; only word tokens need a space.
    separator = '' if getattr(tokenizer_frn, 'char_level', False) else ' '
    decoded_tokens = []

    # Bound the loop by the number of generated tokens. Counting
    # whitespace-separated words of the output instead would never advance
    # when the model keeps emitting the space character.
    for _ in range(max_len_frn):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Get the token with the highest probability
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # An index without a dictionary entry ends the translation
        sampled_token = tokenizer_frn.index_word.get(sampled_token_index)
        if sampled_token is None:
            break
        decoded_tokens.append(sampled_token)

        # Update the target_seq and states for the next loop iteration
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return separator.join(decoded_tokens).strip()


In [None]:
# Try the inference pipeline on one of the training words; the bare last
# expression makes the notebook display the decoded string.
input_sentence = "hello"
predicted_translation = translate(input_sentence)
predicted_translation

<div style="line-height:0.5">
<h3 style="color:#FF7C00  "> Note: </h3>
</div>
Clearly, the prediction is wrong! Seq2Seq models typically require large datasets to produce accurate translations.

<h2 style="color:#FF7C00  ">  <u> Example 2 </u> </h2>

In [None]:
# Parallel corpus: one sentence pair per line, English first, then Italian,
# separated by a tab.
DATA_PATH = "./data/eng-ita.txt"

english_sentences = []
italian_sentences = []
num_skipped_lines = 0

with open(DATA_PATH, "r", encoding="utf-8") as f:
    # Iterate over the file directly instead of materialising every line
    # with readlines().
    for line in f:
        fields = line.strip().split("\t")
        if len(fields) < 2:
            # Blank or malformed line: a plain two-value unpack would raise
            # ValueError here and abort the whole load.
            num_skipped_lines += 1
            continue
        # Only the first two columns are used; any further tab-separated
        # columns (assumption: e.g. attribution metadata) are ignored rather
        # than breaking the unpack.
        eng, ita = fields[:2]
        english_sentences.append(eng)
        italian_sentences.append(ita)

if num_skipped_lines:
    print(f"Skipped {num_skipped_lines} line(s) without a tab-separated sentence pair")


In [None]:
""" Use only 4% of the data to avoid memory allocation problems. """

# Fraction of the corpus to keep, and the seed that makes the subset repeatable.
SAMPLE_FRACTION = 0.04
SAMPLE_SEED = 42

num_samples = int(SAMPLE_FRACTION * len(english_sentences))

# Randomly sample indices. A dedicated generator is used so the same subset is
# drawn on every run without reseeding the global `random` state.
sampled_indices = random.Random(SAMPLE_SEED).sample(range(len(english_sentences)), num_samples)

## Use the same indices on both lists so the sentence pairs stay aligned
english_sentences_sampled = [english_sentences[i] for i in sampled_indices]
italian_sentences_sampled = [italian_sentences[i] for i in sampled_indices]

In [None]:
# Tokenization using TensorFlow's Keras API, this time with the tokenizers'
# default settings (no char_level flag, unlike Example 1).
# Note: `tokenizer_frn` / `*_frn` refer to the ITALIAN side; the names are
# reused from Example 1 and overwrite its tokenizers, vocabulary sizes and
# maximum lengths.
tokenizer_eng = tf.keras.preprocessing.text.Tokenizer()
tokenizer_frn = tf.keras.preprocessing.text.Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences_sampled)
tokenizer_frn.fit_on_texts(italian_sentences_sampled)

# Text -> lists of integer token ids
english_sequences = tokenizer_eng.texts_to_sequences(english_sentences_sampled)
italian_sequences = tokenizer_frn.texts_to_sequences(italian_sentences_sampled)

# One more than the number of known tokens on each side
vocab_size_eng = 1 + len(tokenizer_eng.word_index)
vocab_size_frn = 1 + len(tokenizer_frn.word_index)

# Longest sequence on each side, used as the padded width
max_len_eng = max(map(len, english_sequences))
max_len_frn = max(map(len, italian_sequences))

# Padding sequences at the end ('post') to a common length
english_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    english_sequences, padding='post', maxlen=max_len_eng
)
italian_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    italian_sequences, padding='post', maxlen=max_len_frn
)


In [None]:
def build_seq2seq_model(src_vocab_size, tgt_vocab_size, embedding_dim, lstm_units):
    """Build an (uncompiled) encoder-decoder LSTM training model.

    Args:
        src_vocab_size: Number of rows of the encoder embedding table.
        tgt_vocab_size: Number of rows of the decoder embedding table and
            width of the softmax output.
        embedding_dim: Size of the embedding vectors on both sides.
        lstm_units: Size of the encoder and decoder LSTM state.

    Returns:
        A ``keras.Model`` mapping ``[source ids, decoder input ids]`` to
        per-step probabilities over the target vocabulary.
    """
    # Encoder: only the final states are kept; padding zeros are masked.
    enc_in = Input(shape=(None,))
    enc_emb = Embedding(src_vocab_size, embedding_dim, mask_zero=True)(enc_in)
    _, enc_h, enc_c = LSTM(lstm_units, return_state=True)(enc_emb)

    # Decoder: starts from the encoder states, predicts one token per step.
    dec_in = Input(shape=(None,))
    dec_emb = Embedding(tgt_vocab_size, embedding_dim)(dec_in)
    dec_seq, _, _ = LSTM(lstm_units, return_sequences=True, return_state=True)(
        dec_emb, initial_state=[enc_h, enc_c]
    )
    dec_probs = Dense(tgt_vocab_size, activation='softmax')(dec_seq)
    return keras.Model([enc_in, dec_in], dec_probs)


# The `model` from Example 1 has embedding and softmax layers sized for the
# character vocabularies, while `vocab_size_eng` / `vocab_size_frn` now hold
# the vocabulary sizes of the sampled corpus. Build and compile a fresh model
# for the current sizes instead of continuing to fit the old one.
# Note: `encoder_model` / `decoder_model` are not rebuilt here.
model = build_seq2seq_model(vocab_size_eng, vocab_size_frn, embedding_dim, lstm_units)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Integer targets go with the sparse loss; one-hot encoding them with
# to_categorical would need a (samples, steps, vocab) array and does not match
# a model compiled with sparse_categorical_crossentropy.
model.fit([english_sequences, italian_sequences[:, :-1]], italian_sequences[:, 1:],
          batch_size=64,
          epochs=100,
          validation_split=0.2)