# English to Spanish Translator using an LSTM Encoder–Decoder Model

---
This model was trained on Google Colab with GPU support.
---

In [2]:
import tensorflow as tf
from pathlib import Path
# Download the English–Spanish sentence-pair archive (cached under ./datasets)
# and extract it next to the downloaded zip file.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip",origin=url,cache_dir="datasets",extract=True)
# The corpus contains accented Spanish characters, so decode it explicitly as
# UTF-8 instead of relying on the platform's default encoding.
text = (Path(path).with_name('spa-eng')/'spa.txt').read_text(encoding="utf-8")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


Each line contains an English sentence and the corresponding Spanish translation,
separated by a tab. We’ll start by removing the Spanish characters “¡” and “¿”, which
the TextVectorization layer doesn’t handle, then we will parse the sentence pairs
and shuffle them. Finally, we will split them into two separate lists, one per language.

In [3]:
import numpy as np
# Seed NumPy's global RNG so the shuffle below -- and therefore the
# train/validation split taken from the shuffled order -- is reproducible.
np.random.seed(42)
text = text.replace("¡", "").replace("¿", "")  # drop the inverted punctuation marks
pairs = [line.split("\t") for line in text.splitlines()]  # one tab-separated pair per line
np.random.shuffle(pairs)  # shuffles whole pairs, so each sentence stays with its translation
sentences_en, sentences_es = zip(*pairs)  # transpose into one tuple per language

In [4]:
# Show the first three English sentences next to their Spanish counterparts.
for english, spanish in zip(sentences_en[:3], sentences_es[:3]):
    print(english, '=>', spanish)

The newcomers were quickly absorbed into the community. => Los recién llegados fueron rápidamente absorbidos a la comunidad.
We have just a few more questions. => Tenemos sólo un par de preguntas más.
I don't use it. => No lo uso.


In [5]:
# Build one TextVectorization layer per language; both share the same
# vocabulary size and padded/truncated output length.
vocab_size = 1000
max_length = 50
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_en.adapt(sentences_en)
# The Spanish layer is adapted on sentences wrapped in "startofseq" and
# "endofseq" so both markers enter its vocabulary; they serve as the SOS and
# EOS tokens. Any other marker words would do, provided they are not real
# Spanish words.
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

In [6]:
# Display the first ten entries of the English vocabulary.
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [7]:
# Display the first ten entries of the Spanish vocabulary.
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [8]:
# Split the shuffled corpus once: the first 100,000 pairs are used for
# training, everything after that for validation.
train_en, valid_en = sentences_en[:100_000], sentences_en[100_000:]
train_es, valid_es = sentences_es[:100_000], sentences_es[100_000:]

# Encoder inputs: the raw English sentences.
X_train = tf.constant(train_en)
X_valid = tf.constant(valid_en)

# Decoder inputs: the Spanish sentences prefixed with the SOS token.
X_train_dec = tf.constant([f"startofseq {s}" for s in train_es])
X_valid_dec = tf.constant([f"startofseq {s}" for s in valid_es])

# Targets: the Spanish sentences followed by the EOS token, already
# converted to token ids by the Spanish vectorization layer.
Y_train = text_vec_layer_es([f"{s} endofseq" for s in train_es])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in valid_es])

In [9]:
# Build the encoder–decoder model with the Keras functional API.
# This cell defines the two string inputs, converts them to token ids, and
# embeds those ids; the recurrent layers are added in the following cells.

# Both inputs take raw strings (dtype=tf.string); shape=[] declares no extra
# per-sample dimensions.
encoder_inputs = tf.keras.layers.Input(shape=[],dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[],dtype=tf.string)
# The adapted vectorization layers are applied to the inputs here, so the
# string-to-id conversion is part of the model graph itself.
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)
# One embedding layer per language. mask_zero=True asks Keras to treat id 0
# (the '' entry that heads both vocabularies) as padding to be masked.
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size,embed_size,mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size,embed_size,mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)


In [10]:
# Encoder: a single LSTM fed with the embedded English tokens.
# return_state=True makes the layer return its final states in addition to
# its output.
encoder = tf.keras.layers.LSTM(units=512, return_state=True)
encoder_results = encoder(encoder_embeddings)
# The first element is the layer output; the remaining elements are the
# state tensors, which are collected into one list for later use.
encoder_outputs = encoder_results[0]
encoder_state = list(encoder_results[1:])

In [11]:
# Decoder: an LSTM that emits an output at every time step
# (return_sequences=True) and is initialised with the list of final
# encoder states.
decoder = tf.keras.layers.LSTM(
    units=512,
    return_sequences=True,
)
decoder_outputs = decoder(
    decoder_embeddings,
    initial_state=encoder_state,
)


In [12]:
# Output head: a softmax Dense layer with one unit per vocabulary entry,
# applied to the decoder outputs to produce the final probabilities.
output_layer = tf.keras.layers.Dense(
    units=vocab_size,
    activation="softmax",
)
Y_proba = output_layer(decoder_outputs)


In [13]:
# Assemble the two-input model (encoder strings, decoder strings), then
# compile and train it for ten epochs with the validation set held out.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    x=(X_train, X_train_dec),
    y=Y_train,
    epochs=10,
    validation_data=((X_valid, X_valid_dec), Y_valid),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7eecfc0c3a00>

In [14]:
# Persist the trained model in TensorFlow SavedModel format (save_format='tf').
# The target is an absolute path -- presumably a mounted Google Drive folder in
# Colab; adjust it when running elsewhere.
# NOTE(review): the directory is named "eng_to_germ" although the model is
# trained on the English–Spanish (spa-eng) dataset.
model.save("/content/drive/MyDrive/modelsML/eng_to_germ", save_format='tf')

In [2]:
import tensorflow as tf


2024-02-22 13:31:28.515687: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-22 13:31:28.616890: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-22 13:31:28.617023: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-22 13:31:28.619554: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-22 13:31:28.635922: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-22 13:31:28.637482: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [5]:
from tensorflow.keras.models import load_model

# Load the model from the relative directory "eng_to_germ" (resolved against
# the current working directory).
# NOTE(review): only `model` is restored in this session. The translate()
# helper defined further down also reads the global `text_vec_layer_es`,
# which is never defined after this fresh start -- the run ends with a
# NameError. The Spanish vectorization layer must be made available here
# before translate() can work.
model = load_model("eng_to_germ")

# Use the model for prediction, evaluation, etc.


2024-02-22 13:39:20.588181: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 42 outputs. Output shapes may be inaccurate.
2024-02-22 13:39:20.624246: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 42 outputs. Output shapes may be inaccurate.
2024-02-22 13:39:20.650465: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 42 outputs. Output shapes may be inaccurate.
2024-02-22 13:39:20.720140: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 42 outputs. Output shapes may be inaccurate.
2024-02-22 13:39:20.955063: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_sh

In [10]:
# Maximum number of decoding steps; matches the output_sequence_length the
# text vectorization layers were created with.
max_length = 50
import numpy as np
def translate(sentence_en):
    """Greedily translate one English sentence, one token per step.

    At each step the model is run on the English sentence and on the
    translation produced so far (prefixed with the 'startofseq' marker); the
    most probable token at the current position is appended. Decoding stops
    when 'endofseq' is predicted or after `max_length` steps.

    Relies on three module-level names: `model` (the two-input translation
    model), `text_vec_layer_es` (the Spanish vectorization layer, used to map
    predicted ids back to words) and `max_length`.

    Args:
        sentence_en: The English sentence to translate, as a string.

    Returns:
        The predicted translation as a single space-separated string.
    """
    # The vocabulary and the encoder input are the same at every step, so
    # build them once instead of once per generated token.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])  # encoder input
    translation = ''
    for word_idx in range(max_length):
        X_dec = np.array(['startofseq'+translation])  # decoder input so far
        # verbose=0 suppresses the progress bar Keras would otherwise print
        # for every single generated token.
        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]  # probas at this step
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary[predicted_word_id]
        if predicted_word == 'endofseq':
            break
        translation += ' '+predicted_word
    return translation.strip()

In [11]:
# Try the translator on a sample English sentence.
translate("my name is my name not of your name")



NameError: name 'text_vec_layer_es' is not defined