## Machine Translation using Attention models and GloVe (English to German)


**Objective: Build a model that translates English sentences into German.**


Data credits: http://www.manythings.org/anki/deu-eng.zip <br>
GloVe: https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys

In [None]:
# Hyperparameters shared by the preprocessing and model cells below.
NUM_SAMPLES = 10000 #To limit the number of training sentences for faster training
MAX_VOCAB_SIZE = 20000  # passed as num_words to both Keras tokenizers
MAX_SEQ_LEN = 20  # upper bound on the padded sequence length for both languages
NUM_WORDS = 10000  # vocabulary-size cap; NOTE(review): smaller than MAX_VOCAB_SIZE — confirm the two limits are meant to differ
EMBEDDING_DIM = 100  # width of the embedding vectors; the GloVe file loaded later is the 100-d one
LSTM_UNITS_ENC = 256  # units of the LSTM wrapped by the bidirectional encoder
LSTM_UNITS_DEC = 256  # units of the decoder LSTM; also the width of its s/c state inputs

**Define lists to store the data**

In [None]:
# Parallel containers filled while reading the corpus: English source
# sentences, German decoder inputs and German decoder targets.
english_texts, german_texts_in, german_texts_out = [], [], []

**Read the data**

In [None]:
# Read at most NUM_SAMPLES tab-separated sentence pairs from the corpus.
# Every line must hold exactly three fields: English, German and a third field
# that is ignored. After the loop, `samples` is the number of pairs stored.
samples = 0
# Explicit UTF-8: German text routinely contains non-ASCII characters
# (ä, ö, ü, ß) and the platform default encoding may not decode them.
with open('deu.txt', encoding='utf-8') as f:
    for line in f:
        if samples >= NUM_SAMPLES:
            break  # stop here instead of scanning the rest of the file
        samples += 1
        eng, ger, _ = line.split('\t')
        english_texts.append(eng)
        # Decoder input starts with <sos>; decoder target ends with <eos>.
        german_texts_in.append("<sos> " + ger)
        german_texts_out.append(ger + " <eos>")

**Tokenizer for English and sequence padding**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit a word-level tokenizer on the English sentences, then replace each
# sentence with its list of integer word indices.
eng_tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE) # no filters= argument: the tokenizer's default filters are kept for English
eng_tokenizer.fit_on_texts(english_texts)
english_texts = eng_tokenizer.texts_to_sequences(english_texts)

In [None]:
# Size of the English embedding table (+1 because index 0 is left for padding).
# The tokenizer was created with num_words=MAX_VOCAB_SIZE and therefore emits
# indices below MAX_VOCAB_SIZE, so that is the bound to use: capping at the
# smaller NUM_WORDS would let indices fall outside the embedding matrix as soon
# as the vocabulary grows past NUM_WORDS.
num_eng_words = min(len(eng_tokenizer.word_index) + 1, MAX_VOCAB_SIZE)

In [None]:
# Encoder sequence length: the longest tokenized English sentence, capped at MAX_SEQ_LEN.
eng_max_seq_length = min(MAX_SEQ_LEN,max(len(s) for s in english_texts))

In [None]:
# Pad every English sequence to eng_max_seq_length. No padding= argument is
# given, so pad_sequences applies its default padding side (unlike the German
# sequences, which are padded with padding='post').
english_texts = pad_sequences(english_texts,eng_max_seq_length)

**Tokenizer for German and sequence padding**

In [None]:
# filters='' turns character filtering off so the "<sos>" / "<eos>" markers
# are not stripped of their angle brackets during tokenization.
ger_tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE,filters='') # no filters for German (unlike the English tokenizer)
# german_texts_in never contains "<eos>", so it is appended explicitly to make
# sure the token receives an index.
ger_tokenizer.fit_on_texts(german_texts_in + [" <eos>"])
german_texts_in = ger_tokenizer.texts_to_sequences(german_texts_in)
german_texts_out = ger_tokenizer.texts_to_sequences(german_texts_out)

In [None]:
# Size of the German embedding table and of the output softmax (+1 because
# index 0 is left for padding). The tokenizer was created with
# num_words=MAX_VOCAB_SIZE and emits indices below MAX_VOCAB_SIZE, so that is
# the bound to use: capping at the smaller NUM_WORDS would let indices fall
# outside the embedding / one-hot range once the vocabulary exceeds NUM_WORDS.
num_ger_words = min(len(ger_tokenizer.word_index) + 1, MAX_VOCAB_SIZE)

In [None]:
# Decoder sequence length, measured on the "<sos>"-prefixed inputs and capped at MAX_SEQ_LEN.
ger_max_seq_length = min(MAX_SEQ_LEN,max(len(s) for s in german_texts_in))

In [None]:
# Pad decoder inputs and targets at the end (padding='post') so the real tokens
# start at time step 0.
# NOTE(review): truncating= is left at its default — confirm which end is cut
# for sequences longer than ger_max_seq_length and that losing "<sos>" or
# "<eos>" there is acceptable.
german_texts_in = pad_sequences(german_texts_in,ger_max_seq_length,padding='post')
german_texts_out = pad_sequences(german_texts_out,ger_max_seq_length,padding='post')

**Loading GloVe**

In [None]:
# Load the pre-trained GloVe vectors into a word -> vector dictionary.
glove = {}
# Explicit UTF-8: the GloVe vocabulary contains non-ASCII tokens, which the
# platform default encoding may fail to decode.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        value = line.split()
        # First field is the word, the rest are its coefficients. Parse them as
        # floats right away; np.asarray on the raw fields would store strings.
        glove[value[0]] = np.asarray(value[1:], dtype='float32')

**Creating embedding matrix for english language**

In [None]:
# One row per English word index, EMBEDDING_DIM columns, initialised to zero.
embedding_matrix = np.zeros((num_eng_words,EMBEDDING_DIM))

In [None]:
# Copy the GloVe vector of every known English word into its row of the
# embedding matrix; words without a GloVe vector keep their all-zero row.
for word, index in eng_tokenizer.word_index.items():
    # word_index lists every word seen while fitting, including words whose
    # index lies beyond the matrix, so those are skipped explicitly rather
    # than relying on a swallowed IndexError.
    if index >= embedding_matrix.shape[0]:
        continue
    vector = glove.get(word)
    if vector is not None:
        embedding_matrix[index] = vector

**One hot encoding the outputs**

In [None]:
# Decoder targets as one-hot vectors: (sentences, time steps, German vocab).
# The first dimension follows the number of sentences actually loaded (the
# corpus may hold fewer than NUM_SAMPLES lines), and float32 halves the
# footprint of this very large array compared with the float64 default.
one_hot_ger = np.zeros((len(german_texts_out),ger_max_seq_length,num_ger_words),dtype='float32')

In [None]:
# Set one_hot_ger[i, j, german_texts_out[i][j]] = 1 for every sentence i and
# time step j, using a single indexed assignment instead of nested loops.
target_ids = np.asarray(german_texts_out)
sentence_idx, step_idx = np.indices(target_ids.shape)
one_hot_ger[sentence_idx, step_idx, target_ids] = 1

**Creating attention model**

In [None]:
from tensorflow.keras.layers import LSTM,Dense,Concatenate,Dot,Input,Embedding,RepeatVector,Lambda,Bidirectional
from tensorflow.keras.activations import softmax

**Encoder**

In [None]:
# Encoder input: one padded sequence of English word indices per sample.
# shape has to be a tuple; (eng_max_seq_length) without the trailing comma is
# a plain int.
enc_input = Input(shape=(eng_max_seq_length,))

In [None]:
# Encoder embedding layer, initialised from the GloVe-based embedding_matrix.
# NOTE(review): input_length= and weights= are older Keras constructor
# arguments — confirm the installed Keras version still accepts them.
embedding_enc = Embedding(num_eng_words,EMBEDDING_DIM,input_length=eng_max_seq_length,weights = [embedding_matrix])

In [None]:
# Embed the encoder word indices.
x = embedding_enc(enc_input)

In [None]:
# Bidirectional encoder LSTM; return_sequences=True keeps one output per input
# position, which the attention step needs.
enc_lstm = Bidirectional(LSTM(LSTM_UNITS_ENC,return_sequences = True,dropout = 0.5))

In [None]:
# h: encoder outputs for every input position; attended over by the decoder.
h = enc_lstm(x)

**Decoder**

In [None]:
# Decoder (teacher-forcing) input: one padded sequence of German word indices
# per sample. shape has to be a tuple; (ger_max_seq_length) without the
# trailing comma is a plain int.
dec_input = Input(shape=(ger_max_seq_length,))

In [None]:
# Decoder embedding for German tokens; no pre-trained weights are passed here.
dec_embedding = Embedding(num_ger_words,EMBEDDING_DIM)

In [None]:
# Embed the whole decoder input sequence at once; single time steps are sliced
# out of this tensor in the decoding loop.
dec_after_embedding = dec_embedding(dec_input)

**Attention**

In [None]:
# Repeats the decoder state once per encoder position so it can be
# concatenated with the encoder outputs.
repeat_vector = RepeatVector(eng_max_seq_length)

In [None]:
# Joins the repeated decoder state and the encoder outputs along the last axis.
concat_layer_alpha = Concatenate(axis=-1)

In [None]:
# First layer of the small network that scores each encoder position.
dense_alpha_1 = Dense(10,activation='tanh')

In [None]:
# Second scoring layer: one value per encoder position, normalised into
# attention weights. The Dense output has shape (batch, positions, 1), so the
# softmax has to run over axis 1 (the positions); over the last axis it would
# see a single value per position and always return 1.
dense_alpha_2 = Dense(1,activation=lambda x: softmax(x,axis=1))

In [None]:
# Contracts axis 1 of its two inputs; used to sum the encoder outputs weighted
# by the attention weights.
dot = Dot(axes=1)

In [None]:
def attention_once(s,h):
    """Compute the attention context for one decoder step.

    s is the previous decoder state and h the encoder outputs. The state is
    repeated for every encoder position, concatenated with h and passed
    through the two scoring layers; the resulting weights are combined with h
    by the shared Dot layer.

    Returns the context tensor produced by that Dot layer.
    """
    s_repeated = repeat_vector(s)
    scores = dense_alpha_1(concat_layer_alpha([s_repeated,h]))
    weights = dense_alpha_2(scores)
    return dot([weights,h])

**Teacher forcing and wiring the layers together**

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

In [None]:
# Initial hidden (s) and cell (c) state of the decoder LSTM, supplied as model
# inputs. shape has to be a tuple, hence the trailing comma.
input_s = Input(shape=(LSTM_UNITS_DEC,))
input_c = Input(shape=(LSTM_UNITS_DEC,))

In [None]:
# Joins the attention context with the current word embedding along the last axis.
concat_context_word = Concatenate(axis=-1)

In [None]:
# Decoder LSTM; with return_state=True each call yields the output plus the
# updated hidden and cell states.
dec_lstm = LSTM(LSTM_UNITS_DEC,return_state = True)

In [None]:
# Maps the decoder output to a softmax distribution over num_ger_words entries.
dec_dense = Dense(num_ger_words,activation="softmax")

In [None]:
# Unroll the decoder for ger_max_seq_length steps with teacher forcing: at each
# step the attention context is concatenated with the embedding of the decoder
# input word for that step and fed through the shared decoder LSTM, whose
# states are carried over to the next step.
outputs = []
s = input_s
c = input_c
for t in range(ger_max_seq_length):
    context_vector = attention_once(s,h)
    # Bind the current t as a default argument. A plain closure looks t up
    # whenever the function runs, and Keras may call it again when the model
    # is executed - by then the loop has finished and every step would slice
    # the same (last) position.
    word_choose = Lambda(lambda x, t=t: x[:,t:t+1])
    word_input = word_choose(dec_after_embedding)
    concat_dec_input = concat_context_word([context_vector,word_input])
    dec_out, s,c = dec_lstm(concat_dec_input,initial_state=[s,c])
    dec_dense_out = dec_dense(dec_out)
    outputs.append(dec_dense_out)

In [None]:
# Notebook echo: show the list of per-step output tensors.
outputs

**Reshaping the output layer: (ger_max_seq_length,None,num_ger_words) to (None,ger_max_seq_length,num_ger_words)**

In [None]:
def fix_outputs_shape(outputs):
    """Stack the per-step outputs and swap the first two axes.

    outputs is the list of per-time-step tensors. Stacking puts the time axis
    first; the permutation (1, 0, 2) then moves the batch axis to the front.
    """
    return K.permute_dimensions(K.stack(outputs),(1,0,2))

In [None]:
# Wrap the reshaping function in a Lambda layer so it can be part of the model graph.
shape_fixer = Lambda(fix_outputs_shape)

In [None]:
# Merge the list of per-step outputs into one tensor with the batch axis first.
dec_output = shape_fixer(outputs)

In [None]:
# Training model. Inputs: English indices, decoder-input German indices and the
# two initial decoder states. Output: the stacked per-step word distributions.
model = Model([enc_input,dec_input,input_s,input_c],dec_output)

In [None]:
# Print the layer-by-layer summary of the training model.
model.summary()

In [None]:
# Draw the training model's layer graph.
plot_model(model)

**Compiling and fitting the model**

In [None]:
# categorical_crossentropy pairs with the one-hot encoded targets in one_hot_ger.
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics = ['accuracy'])

In [None]:
# All-zero initial decoder states, one row per training sentence. The row count
# follows the data actually loaded, which can be smaller than NUM_SAMPLES.
initial_s = np.zeros([len(english_texts),LSTM_UNITS_DEC])
initial_c = np.zeros([len(english_texts),LSTM_UNITS_DEC])

In [None]:
# Train on the full data set; epochs and batch_size are not specified, so the
# Keras defaults apply.
model.fit([english_texts,german_texts_in,initial_s,initial_c],one_hot_ger)

### Creating the sampling model

**Encoder**

In [None]:
# Inference-time encoder: maps padded English indices to the encoder outputs h,
# reusing the layers of the training model.
encoder_model = Model(enc_input,h)

**Decoder**

In [None]:
# At inference time the decoder receives a single word index per call.
# shape has to be a tuple; (1) without the trailing comma is a plain int.
decoder_input = Input(shape=(1,))
decoder_embedding_out = dec_embedding(decoder_input)

In [None]:
# Input that receives the encoder outputs at inference time; the last dimension
# is 2*LSTM_UNITS_ENC because the encoder LSTM is wrapped in Bidirectional.
encoder_out_to_attention = Input(shape=(eng_max_seq_length,2*LSTM_UNITS_ENC))

In [None]:
# Attention context for a single decoding step, computed from the state input.
context = attention_once(input_s,encoder_out_to_attention)

In [None]:
# Same context/word concatenation layer as in the training loop, applied to one step.
decoder_inputs_single = concat_context_word([context,decoder_embedding_out])

In [None]:
# One decoder LSTM step; s and c are the updated states exposed as model outputs.
decoder_lstm_out,s,c = dec_lstm(decoder_inputs_single,initial_state=[input_s,input_c])

In [None]:
# Word distribution for this single step.
decoder_dense_out = dec_dense(decoder_lstm_out)

In [None]:
# Single-step decoder. Inputs: encoder outputs, previous word index and the
# previous states. Outputs: word distribution and the updated states.
decoder_model = Model([encoder_out_to_attention,decoder_input,input_s,input_c],[decoder_dense_out,s,c])

In [None]:
# Lookup tables for the German vocabulary.
# index -> word: the tokenizer's word_index inverted.
index2word_ger = {index: word for word, index in ger_tokenizer.word_index.items()}
# word -> index: word_index already has this orientation, so a plain copy is
# the correct table (inverting it would just duplicate index2word_ger).
word2index = dict(ger_tokenizer.word_index)

**Predict function**

In [None]:
def predict(input_seq):
    """Translate one English sentence into German by greedy decoding.

    Args:
        input_seq: English sentence as a plain string.

    Returns:
        The predicted German words joined by single spaces. The string is
        empty when the very first predicted token already ends the sentence.
    """
    input_seq = eng_tokenizer.texts_to_sequences([input_seq])
    input_seq = pad_sequences(input_seq,eng_max_seq_length)

    # Look the special tokens up once instead of on every decoding step.
    sos_index = ger_tokenizer.word_index["<sos>"]
    eos_index = ger_tokenizer.word_index["<eos>"]

    predictions = []

    # The decoder starts from all-zero hidden and cell states.
    s = np.zeros((1,LSTM_UNITS_DEC))
    c = np.zeros((1,LSTM_UNITS_DEC))

    encoder_out = encoder_model.predict(input_seq)
    next_word = np.array([[sos_index]])

    for _ in range(ger_max_seq_length):
        decoder_out,s,c = decoder_model.predict([encoder_out,next_word,s,c])

        # Greedy choice: the most probable word of this step.
        word_index = int(np.argmax(decoder_out[0]))

        # Stop on <eos>. An index without an entry in index2word_ger (index 0,
        # the padding value) is treated as end of sentence as well instead of
        # raising KeyError.
        if word_index == eos_index or word_index not in index2word_ger:
            break

        predictions.append(index2word_ger[word_index])
        # Feed the chosen word back in as the next decoder input.
        next_word = np.array([[word_index]])

    return ' '.join(predictions)
    

In [None]:
# Try the sampling model on a short English sentence.
predict("hey good morning")