# Word Level English to Hindi Machine Translation using Encoder-Decoder LSTMs

This is the working implementation of Natural Machine Translation Project

In [1]:
import pandas as pd
import numpy as np

from keras.activations import softmax
from keras.layers import Bidirectional, Concatenate, Dot, Reshape, RepeatVector, Lambda
from keras.layers import Embedding, Dropout
# TODO check whether gpu is available with keras.backend.tensorflow_backend._get_available_gpus()
from keras.layers import Input, Dense, CuDNNLSTM as LSTM
from keras.models import Model
from keras.utils.vis_utils import plot_model

#data preprocessing imports
from nltk.corpus import IndianCorpusReader
from keras.preprocessing.text import text_to_word_sequence
from collections import Counter, OrderedDict, defaultdict

from keras.losses import sparse_categorical_crossentropy
from keras.metrics import sparse_categorical_accuracy
import keras.backend as K
from keras.callbacks import CSVLogger

Using TensorFlow backend.


Set CPU and GPU for optimal usage

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

Read the Parallel corpus provided by IIT-Bombay.
http://www.cfilt.iitb.ac.in/iitb_parallel/

In [3]:
en_file = open("/home/users/akumar/data/IITB.en-hi.en", "r")
en_lines = en_file.readlines()

hi_file = IndianCorpusReader(root = "/home/users/akumar/data", fileids = "IITB.en-hi.hi")
hi_line = hi_file.sents()

Choose sentences with length less than or equal to 65 first in English corpus followed by Hindi Corpus.

In [4]:
en_line = [text_to_word_sequence(x.rstrip("\n")) for x in  en_lines]

en_sent_length = [len(line) for line in en_line]
en_sent_length = np.array(en_sent_length)

ind = np.where(en_sent_length <= 65)[0]

en_line_100 = [en_line[i] for i in ind]
hi_line_100 = [hi_line[i] for i in ind]

hi_sent_length = [len(line) for line in hi_line_100]
hi_sent_length = np.array(hi_sent_length)

hi_ind = np.where(hi_sent_length <= 65)[0]

en_line = [en_line_100[i] for i in hi_ind]
hi_line = [hi_line_100[i] for i in hi_ind]

Add Prefix and suffix for the target sentences which will act as a pointer while inferring the model.

In [5]:
[line.insert(0,"START_") for line in hi_line]
[line.append("_END") for line in hi_line]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

Split train and test data.

In [6]:
# split test and train data sets

test_ind = np.random.choice(len(en_line), int(0.05*len(en_line)), replace=False)

train_ind = set(np.arange(0,len(en_line))) - set(test_ind)

en_test = [en_line[i] for i in test_ind]
hi_test = [hi_line[i] for i in test_ind]

en_train = [en_line[i] for i in train_ind]
hi_train = [hi_line[i] for i in train_ind]

del(en_line, hi_line)

Gathering only the most frequently occurring 80000 words in both the corpus.

In [7]:
# considering only most common 80k words for weights both in english and hindi

hi_tokens = [item for sublist in hi_train for item in sublist]
en_tokens = [item for sublist in en_train for item in sublist]

hi_vocab = list(OrderedDict(Counter(hi_tokens).most_common(80000)).keys())
en_vocab = list(OrderedDict(Counter(en_tokens).most_common(80000)).keys())

Reading the downloaded pretrained embedding for both English and Hindi.
https://fasttext.cc/docs/en/english-vectors.html

In [8]:
# prepare pre trained embeddings
def read_embeddings(file_path):
    file = open(file_path, encoding="utf-8")
    lines = file.readlines()
    embeddings_index = {}
    for line in lines:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    file.close()
    return(embeddings_index)

In [9]:
# read english embeddings
en_embed = read_embeddings("/home/users/akumar/data/embeddings/cc.en.300.vec")

# read hindi embeddings
hi_embed = read_embeddings("/home/users/akumar/data/embeddings/cc.hi.300.vec")

Convert the embedding vectors to a matrix representation

In [10]:
en_embedding_matrix = np.zeros((len(en_vocab)+1, 300))

for word in en_vocab:
    i = 1
    embedding_vector = en_embed.get(word)
    if embedding_vector is not None:
        en_embedding_matrix[i] = embedding_vector
    i += 1

In [11]:
hi_embedding_matrix = np.zeros((len(hi_vocab)+1, 300))

for word in hi_vocab:
    i = 1
    embedding_vector = hi_embed.get(word)
    if embedding_vector is not None:
        hi_embedding_matrix[i] = embedding_vector
    i += 1

Calculating the maximum sentence length in both the corpora(should be around 65).

In [12]:
lenght_list=[]
for l in en_train:
    lenght_list.append(len(l))
max_length_en = np.max(lenght_list)

# Max Length of target sequence
lenght_list=[]
for l in hi_train:
    lenght_list.append(len(l))
max_length_hi = np.max(lenght_list)

# Calculate Vocab size for both source and target
num_encoder_tokens = len(en_vocab)+1
num_decoder_tokens = len(hi_vocab)+1

Preparing index representation data for training the model. index 0 is given to all those words that are not present in the vocabulary

In [14]:
#preparing data

hi_token_index = defaultdict(int,[(word, i+1) for i, word in enumerate(hi_vocab)])
en_token_index = defaultdict(int,[(word, i+1) for i, word in enumerate(en_vocab)])

In [15]:
encoder_input_data = np.zeros((len(en_train), max_length_en), dtype='int32')
decoder_input_data = np.zeros((len(hi_train), max_length_hi), dtype='int32')
decoder_target_data = np.zeros((len(hi_train), max_length_hi),dtype='int32')

for i, sent in enumerate(en_train):
    for t, word in enumerate(sent):
        encoder_input_data[i, t] = en_token_index[word]
for i, sent in enumerate(hi_train):
    for t, word in enumerate(sent):
        decoder_input_data[i, t] = hi_token_index[word]
        if t > 0:
            decoder_target_data[i, t-1] = hi_token_index[word]

Configuring the Encoder network with latent space as 512, 2 LSTM layers and the english embedding matrix

In [16]:
latent_dim = 512
EMBEDDING_DIM = 300

encoder_inputs = Input(shape=(None,), name="Encoder_input")
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, name='Encoder_lstm1') 
encoder_lstm2 = LSTM(latent_dim, return_state=True, name='Encoder_lstm2') 

encoder_embedding = Embedding(num_encoder_tokens,
                            EMBEDDING_DIM,
                            weights=[en_embedding_matrix],
                            trainable=False)(encoder_inputs)
encoder_outputs, state_h1, state_c1 = encoder_lstm1(encoder_embedding)
_,state_h2, state_c2 = encoder_lstm2(encoder_outputs)
encoder_states = [state_h1, state_c1, state_h2, state_c2]


Configuring the Decoder with similar architecture of the encoder followed by a dense soft-max layer.

In [17]:

decoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, name="Decoder_lstm1")
decoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, name="Decoder_lstm2")
decoder_inputs = Input(shape=(None,), name="Decoder_input")
decoder_embedding = Embedding(num_decoder_tokens,
                            EMBEDDING_DIM,
                            weights=[hi_embedding_matrix],
                            trainable=False)(decoder_inputs)
decoder_outputs, de_h1, de_c1 = decoder_lstm1(decoder_embedding, initial_state=[state_h1,state_c1]) 
decoder_final_output, de_h2, de_c2 = decoder_lstm2(decoder_outputs, initial_state=[state_h2, state_c2])
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name="Dense_layer") 
decoder_outputs = decoder_dense(decoder_final_output) 


Customizing the loss and accuracy fuctions as the input data is not one-hot encoding but index representation. One-hot encoding consumed to much space which led to ResourceExhausterError

In [19]:
def reshape_outputs(y_true, y_pred, vocab_size= 80001):
    # y_true: (batch_size, max_len)             -> (batch_size*max_len,)
    # y_pred: (batch_size, max_len, vocab_size) -> (batch_size*max_len, vocab_size)
    return K.reshape(y_true, (-1,)), K.reshape(y_pred, (-1, vocab_size))


def loss(y_true, y_pred):
    return K.mean(sparse_categorical_crossentropy(*reshape_outputs(y_true, y_pred)))


def acc(y_true, y_pred):
    return K.mean(sparse_categorical_accuracy(*reshape_outputs(y_true, y_pred)))

In [20]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer = 'rmsprop', loss = loss, metrics = [acc])

Setting up the inference model for later decoding the sequence for extracting the sentence in the target language, Hindi.

In [21]:


encoder_model = Model(encoder_inputs, encoder_states) 

decoder_state_input_h1 = Input(shape=(latent_dim,), name="H_state_input1") 
decoder_state_input_c1 = Input(shape=(latent_dim,), name="C_state_input1") 
decoder_state_input_h2 = Input(shape=(latent_dim,), name="H_state_input2") 
decoder_state_input_c2 = Input(shape=(latent_dim,), name="C_state_input2") 
decoder_state_inputs = [decoder_state_input_h1, decoder_state_input_c1,decoder_state_input_h2, decoder_state_input_c2] 
decoder_outputs1, final_h1, final_c1 = decoder_lstm1(decoder_embedding, initial_state=decoder_state_inputs[:2]) 
decoder_outputs2, final_h2, final_c2 = decoder_lstm2(decoder_outputs1, initial_state=decoder_state_inputs[-2:]) 
de_states = [final_h1, final_c1, final_h2, final_c2] 
de_outputs = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + decoder_state_inputs, [de_outputs] + de_states)


Encoder Model Summary

In [22]:
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder_input (InputLayer)   (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 300)         24000300  
_________________________________________________________________
Encoder_lstm1 (CuDNNLSTM)    [(None, None, 512), (None 1667072   
_________________________________________________________________
Encoder_lstm2 (CuDNNLSTM)    [(None, 512), (None, 512) 2101248   
Total params: 27,768,620
Trainable params: 3,768,320
Non-trainable params: 24,000,300
_________________________________________________________________


Decoder Model Summary

In [23]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    24000300    Decoder_input[0][0]              
__________________________________________________________________________________________________
H_state_input1 (InputLayer)     (None, 512)          0                                            
__________________________________________________________________________________________________
C_state_input1 (InputLayer)     (None, 512)          0                                            
__________________________________________________________________________________________________
Decoder_ls

Setting up CSVLogger to save the training and validation metrics. 

In [37]:
csv_logger = CSVLogger('training3.log')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=32,
          epochs=5,
          validation_split=0.20,
          callbacks=[csv_logger])

Train on 1100252 samples, validate on 275063 samples
Epoch 1/5
Epoch 2/5
   5120/1100252 [..............................] - ETA: 3:08:59 - loss: 1.3118 - acc: 0.8318

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 178464/1100252 [===>..........................] - ETA: 2:39:02 - loss: 1.3427 - acc: 0.8292

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 105344/1100252 [=>............................] - ETA: 2:52:08 - loss: 1.3907 - acc: 0.8288

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 4/5
  24896/1100252 [..............................] - ETA: 3:05:31 - loss: 1.3661 - acc: 0.8309

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 198272/1100252 [====>.........................] - ETA: 2:35:37 - loss: 1.3784 - acc: 0.8297

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 5/5


<keras.callbacks.History at 0x7f79c60f7eb8>

Generating a reverse dictionary to get words from indices.

In [24]:
reverse_en_char_index = dict((i, char) for char, i in en_token_index.items())
reverse_hi_char_index = dict((i, char) for char, i in hi_token_index.items())

In [25]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = hi_token_index['START_']

# Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h1, c1, h2, c2 = decoder_model.predict([target_seq] + states_value)

# Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_hi_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

# Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 52):
            stop_condition = True

# Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

# Update states
        states_value = [h1, c1, h2, c2]

    return decoded_sentence

In [42]:
def generate_encoder_seq(sent):
    sent_seq = np.zeros((300),'int32')
    for i, word in enumerate(sent.split(' ')):
        print(word)
        sent_seq[i] = en_token_index.get(word)
    return sent_seq

Testing the model with few random sentences to get an overview of how the learning is.

In [45]:
sentence = "he goes to school"

decode_sequence(generate_encoder_seq(sentence))

he
goes
to
school


' । जाएगा बर्राना / हैं , , ही , ही ही , , , , , , , ,'

In [46]:
sentence = "she is a student"

decode_sequence(generate_encoder_seq(sentence))

she
is
a
student


' । जाएगा बर्राना / हैं , , ही , ही ही , , , , , , , ,'

In [47]:
sentence = "the baby is crying"

decode_sequence(generate_encoder_seq(sentence))

the
baby
is
crying


' हैं अरबॉर्गCity अरबॉर्गCity अरबॉर्गCity अरबॉर्गCity अरबॉर्गCity'

In [51]:
sentence = "he craves for apples"

decode_sequence(generate_encoder_seq(sentence))

he
craves
for
apples


' । जाएगा बर्राना / हैं , , ही , ही ही , , , , , , , ,'