# Machine Translation -  English to Spanish - Seq2Seq


#### Here we aim to build an English-to-Spanish translation model using an encoder-decoder (seq2seq) architecture.

#### Data credits: http://www.manythings.org/anki/spa-eng.zip

In [None]:
#importing libraries
import numpy as np
import pandas as pd
import spacy

In [None]:
# Hyperparameters shared by the preprocessing and model cells.
NUM_SAMPLES = 10_000     # cap on the number of lines read, for faster training
MAX_VOCAB_SIZE = 20_000  # passed to the tokenizers as num_words
MAX_SEQ_LEN = 20         # upper bound on the padded sequence length
EMBEDDING_DIM = 300      # width of the word embedding vectors
LSTM_UNITS = 256         # hidden size of the encoder and decoder LSTMs

### Importing text

In [None]:
# Tab-separated sentence-pair file to read.
path_spa = "spa.txt"

# Three independent, initially empty lists:
#   english_text     -> used in the encoder
#   spanish_text_in  -> used in the decoder (prefixed with "<sos>")
#   spanish_text_out -> used in the decoder (suffixed with "<eos>")
english_text, spanish_text_in, spanish_text_out = [], [], []

In [None]:
# Read at most NUM_SAMPLES lines of tab-separated sentence pairs.
# - The context manager closes the file even if a line fails to parse.
# - The explicit encoding keeps accented Spanish characters intact on
#   platforms whose default encoding is not UTF-8
#   (assumes the file is UTF-8 encoded -- TODO confirm for other data sources).
with open(path_spa, encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        if i > NUM_SAMPLES:
            break
        if '\t' not in line:
            continue
        # Only the first two columns are used; any further column is ignored.
        # Stripping the newline first keeps the Spanish text clean when the
        # line has exactly two columns.
        eng, spa = line.rstrip("\n").split('\t')[:2]
        english_text.append(eng)
        spanish_text_in.append("<sos> " + spa)    # decoder input starts with <sos>
        spanish_text_out.append(spa + " <eos>")   # decoder target ends with <eos>

**Printing some of the appended texts**

In [None]:
# Display the first three entries of english_text as a sanity check.
english_text[:3]

In [None]:
# Display the first three decoder inputs; each should begin with "<sos>".
spanish_text_in[:3]

In [None]:
# Display the first three decoder targets; each should end with "<eos>".
spanish_text_out[:3]

### Tokenizing the texts and Padding them

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

**Tokenizer for encoder**

In [None]:
# Build the English vocabulary, then replace every sentence in english_text
# with its sequence of integer word ids.
tokenizer_encoder = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer_encoder.fit_on_texts(english_text)
english_text = tokenizer_encoder.texts_to_sequences(english_text)

**Finding the maximum sequence length in english_text**

In [None]:
# Length of the longest tokenized English sentence.
max_seq_len_text = max(map(len, english_text))
max_seq_len_text

In [None]:
# Encoder sequence length: the longest sentence, capped at MAX_SEQ_LEN.
max_seq_len_eng = min(MAX_SEQ_LEN, max_seq_len_text)
max_seq_len_eng

**Padding**

In [None]:
# Bring every English sequence to length max_seq_len_eng, padding at the front.
english_text = pad_sequences(
    english_text,
    maxlen=max_seq_len_eng,
    padding='pre',
)
english_text[:3]

In [None]:
# Number of rows needed in the encoder embedding: one per word id plus one
# extra for id 0, capped at MAX_VOCAB_SIZE.
encoder_vocab_size = min(
    MAX_VOCAB_SIZE,
    len(tokenizer_encoder.word_index) + 1,
)
encoder_vocab_size

**Tokenizer for decoder**

In [None]:
# filters='' turns off the tokenizer's character filtering so the "<sos>" and
# "<eos>" markers survive as whole tokens.
tokenizer_decoder = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')

# Fit on both the decoder inputs and the decoder targets so that "<sos>" and
# "<eos>" are each counted once per sentence. Fitting "<eos>" from a single
# extra string gives it a count of 1, which ranks it among the rarest words;
# if the vocabulary ever outgrows MAX_VOCAB_SIZE, the num_words cut-off could
# then silently drop "<eos>" from every target sequence.
tokenizer_decoder.fit_on_texts(spanish_text_in + spanish_text_out)

# Replace the raw strings with their sequences of integer word ids.
spanish_text_in = tokenizer_decoder.texts_to_sequences(spanish_text_in)
spanish_text_out = tokenizer_decoder.texts_to_sequences(spanish_text_out)

**Finding the maximum sequence length in spanish_text**

In [None]:
# Length of the longest tokenized decoder input sentence.
max_seq_len_text = max(map(len, spanish_text_in))
max_seq_len_text

In [None]:
# Decoder sequence length: the longest sentence, capped at MAX_SEQ_LEN.
max_seq_len_spa = min(MAX_SEQ_LEN, max_seq_len_text)
max_seq_len_spa

**Padding**

In [None]:
# Bring decoder inputs and targets to length max_seq_len_spa, padding at the end.
spanish_text_in = pad_sequences(
    spanish_text_in,
    maxlen=max_seq_len_spa,
    padding="post",
)
spanish_text_out = pad_sequences(
    spanish_text_out,
    maxlen=max_seq_len_spa,
    padding="post",
)

In [None]:
# Display the padded decoder target sequences.
spanish_text_out

In [None]:
# Number of output classes for the decoder: one per word id plus one extra
# for id 0, capped at MAX_VOCAB_SIZE.
decoder_vocab_size = min(
    MAX_VOCAB_SIZE,
    len(tokenizer_decoder.word_index) + 1,
)
decoder_vocab_size

### Creating Embedding matrix for Encoder (English Text)

In [None]:
# Word -> integer id mapping learned by the encoder tokenizer.
eng_word_index = tokenizer_encoder.word_index
eng_word_index

In [None]:
# Word -> integer id mapping learned by the decoder tokenizer.
spa_word_index = tokenizer_decoder.word_index
spa_word_index

In [None]:
# One zero-initialised row of EMBEDDING_DIM values per encoder word id.
embedding_matrix = np.zeros(shape=(encoder_vocab_size, EMBEDDING_DIM))

In [None]:
# Load the spaCy English pipeline; its word vectors fill embedding_matrix.
# NOTE(review): the vectors are presumably 300-dimensional to match
# EMBEDDING_DIM -- confirm for the installed model version.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Fill each row of embedding_matrix with the spaCy vector of the word that
# owns that id. Ids at or beyond encoder_vocab_size have no row and are skipped.
#
# nlp.make_doc() only tokenizes the word, which is all Doc.vector needs;
# calling nlp(word) would also run every other pipeline component for each
# vocabulary entry without affecting the vector.
# NOTE(review): assumes no custom vector hook is installed on the pipeline.
for word, index in eng_word_index.items():
    if index >= encoder_vocab_size:
        continue
    embedding_matrix[index] = nlp.make_doc(word).vector

### One hot encoding the target sequence - spanish_text_out

In [None]:
# One-hot encode the decoder targets into an array of shape
# (num_sentences, max_seq_len_spa, decoder_vocab_size).
# float32 is requested explicitly: np.zeros defaults to float64, which doubles
# the memory of this large dense array for no benefit to a 0/1 target.
ohe_spanish_text_out = np.zeros(
    (len(spanish_text_out), max_seq_len_spa, decoder_vocab_size),
    dtype="float32",
)

for seq_i, seq in enumerate(spanish_text_out):
    for word_i, word in enumerate(seq):
        # Id 0 marks padding; those timesteps stay as all-zero rows.
        if word > 0:
            ohe_spanish_text_out[seq_i, word_i, word] = 1

### Creating the training model

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM,Dense,Input,Embedding
from tensorflow.keras.utils import plot_model

**Encoder**

In [None]:
# `(n)` is just the integer n; the trailing comma makes this a one-element
# shape tuple, which is what Input's `shape` argument is documented to take.
encoder_input = Input(shape=(max_seq_len_eng,))

In [None]:
# Embed the English word ids, initialising the layer from embedding_matrix.
x = Embedding(
    encoder_vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_seq_len_eng,
)(encoder_input)

In [None]:
# Encoder LSTM; return_state=True makes it also return its final h and c states.
# NOTE(review): activation='relu' overrides the layer's default activation --
# confirm this is intentional.
lstm_enc = LSTM(units=LSTM_UNITS, activation='relu', return_state=True)

In [None]:
# Run the encoder: its output plus the final hidden (h) and cell (c) states.
enc_out, h, c = lstm_enc(x)

In [None]:
# The encoder's final states, used to initialise the decoder LSTM.
encoder_states = [h, c]

**Decoder**

In [None]:
# `(n)` is just the integer n; the trailing comma makes this a one-element
# shape tuple, which is what Input's `shape` argument is documented to take.
decoder_input = Input(shape=(max_seq_len_spa,))

In [None]:
# Decoder embedding layer, kept in its own variable so the sampling decoder
# can reuse the same weights.
dec_embedding = Embedding(
    decoder_vocab_size,
    EMBEDDING_DIM,
    input_length=max_seq_len_spa,
)
x = dec_embedding(decoder_input)

In [None]:
# Decoder LSTM.
# return_sequences=True makes it emit one output per timestep, so the softmax
# layer applied to it produces a prediction for every target position and
# matches the per-timestep one-hot targets. Without it only the last timestep
# is returned and training fails on a shape mismatch.
# return_state=True exposes h and c so step-by-step decoding can carry them
# from one step to the next.
lstm_dec = LSTM(
    LSTM_UNITS,
    activation='relu',
    return_sequences=True,
    return_state=True,
)

In [None]:
# Run the decoder LSTM starting from the encoder's final states; the returned
# states are not needed for training and are discarded.
x, _, _ = lstm_dec(x, initial_state=encoder_states)

In [None]:
# Softmax layer producing a distribution over the decoder vocabulary.
dense_dec = Dense(units=decoder_vocab_size, activation="softmax")

In [None]:
# Apply the softmax layer to the decoder LSTM output.
dec_out = dense_dec(x)

In [None]:
# Training model: (English ids, Spanish input ids) -> predicted word distribution.
model = Model(inputs=[encoder_input, decoder_input], outputs=dec_out)

In [None]:
# Print the training model's layer-by-layer summary.
model.summary()

In [None]:
# Render a diagram of the training model's layer graph.
plot_model(model)

In [None]:
# Configure training: categorical cross-entropy loss, with accuracy reported
# as a metric. No optimizer is passed, so Keras uses its default.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
#model.fit([english_text,spanish_text_in],ohe_spanish_text_out)

### Creating the sampling model

**Encoder**

In [None]:
# Sampling encoder: maps an English id sequence to the encoder's final states.
s_enc_model = Model(inputs=encoder_input, outputs=encoder_states)

In [None]:
# Render a diagram of the sampling encoder's layer graph.
plot_model(s_enc_model)

**Decoder**

In [None]:
# Inputs of the sampling decoder: one word id per call, plus the LSTM hidden
# and cell states carried over from the previous step.
# `(n)` is just the integer n; the trailing commas make these one-element
# shape tuples, which is what Input's `shape` argument is documented to take.
s_decoder_input = Input(shape=(1,))
s_decoder_input_h = Input(shape=(LSTM_UNITS,))
s_decoder_input_c = Input(shape=(LSTM_UNITS,))

s_decoder_states = [s_decoder_input_h, s_decoder_input_c]

In [None]:
# Embed the single input word with the same embedding layer used in training.
x = dec_embedding(s_decoder_input)

In [None]:
# Advance the shared decoder LSTM by one step from the supplied states and
# keep the new h and c so they can be returned to the caller.
x, h, c = lstm_dec(x, initial_state=s_decoder_states)

In [None]:
# Word distribution for this step, from the shared softmax layer.
s_decoder_out = dense_dec(x)

In [None]:
# Sampling decoder: (word id, h, c) -> (word distribution, new h, new c).
s_dec_model = Model(
    inputs=[s_decoder_input, *s_decoder_states],
    outputs=[s_decoder_out, h, c],
)

#### Defining the translate function

In [None]:
# Inverse of spa_word_index (id -> word), needed to turn predicted ids back
# into Spanish words.
spa_index_word = {index: word for word, index in spa_word_index.items()}

In [None]:
def translate(english_sentence):
    """Translate an English sentence into Spanish by greedy decoding.

    The sentence is tokenized and padded with the encoder tokenizer, passed
    through the sampling encoder to obtain the initial LSTM states, and the
    sampling decoder is then stepped one word at a time starting from
    ``<sos>``, always choosing the highest-scoring word.

    Args:
        english_sentence: Raw English text to translate.

    Returns:
        The predicted Spanish words joined by single spaces. Decoding stops
        at ``<eos>`` or after ``max_seq_len_spa`` steps, whichever is first.
    """
    eos_index = spa_word_index["<eos>"]

    x = tokenizer_encoder.texts_to_sequences([english_sentence])
    x = pad_sequences(x, maxlen=max_seq_len_eng)

    # verbose=0 silences the per-call progress output, which would otherwise
    # be printed once for every decoding step.
    next_states = list(s_enc_model.predict(x, verbose=0))

    # Shape (1, 1): a batch of one sequence holding one word id.
    next_word = np.array([[spa_word_index["<sos>"]]])

    output_spa = []
    for _ in range(max_seq_len_spa):
        probs, h, c = s_dec_model.predict([next_word] + next_states, verbose=0)
        next_states = [h, c]

        # Flatten to a single score per vocabulary id (batch and step are
        # both 1 here), copying so the prediction itself is left untouched.
        probs = np.array(probs).reshape(-1)
        # Id 0 is the padding id and has no entry in spa_index_word; rule it
        # out so the lookup below cannot raise KeyError.
        probs[0] = 0
        token_id = int(np.argmax(probs))

        if token_id == eos_index:
            break

        output_spa.append(spa_index_word[token_id])
        next_word = np.array([[token_id]])

    return " ".join(output_spa)

In [None]:
# Try the translator on a short sample sentence.
translate("hi good morning")