In [1]:
#Packages
import pandas as pd
import re
import string
from string import digits
import numpy as np
from sklearn.utils import shuffle
from keras.layers import Input, LSTM, Embedding, Dense,Dropout,TimeDistributed
from keras.models import Model
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [5]:
lines= pd.read_table('hindi_copy.txt', names =['source', 'target','comments'])
lines.head()

Unnamed: 0,source,target
General,सामान्य,
"Any recent chills, night sweats or fever? किसी भी हाल में ठंड लगना, रात को पसीना या बुखार?",,
Have you gained or lost weight without trying?,क्या आपने बिना प्रयास किए वजन घटाया या प्राप्त...,
Have you been tired?,क्या आप थक गए हैं?,
Skin,त्वचा,


In [6]:
# convert source and target text to Lowercase 
lines.source=lines.source.apply(lambda x: x.lower())
lines.target=lines.target.apply(lambda x: x.lower())# Remove quotes from source and target text
lines.source=lines.source.apply(lambda x: re.sub("'", '', x))
lines.target=lines.target.apply(lambda x: re.sub("'", '', x))# create a set of all special characters
special_characters= set(string.punctuation)# Remove all the special characters
lines.source = lines.source.apply(lambda x: ''.join(char1 for char1 in x if char1 not in special_characters))
lines.target = lines.target.apply(lambda x: ''.join(char1 for char1 in x if char1 not in special_characters))# Remove digits from source and target sentences
num_digits= str.maketrans('','', digits)
lines.source=lines.source.apply(lambda x: x.translate(num_digits))
lines.target= lines.target.apply(lambda x: x.translate(num_digits))# Remove extra spaces
lines.source=lines.source.apply(lambda x: x.strip())
lines.target=lines.target.apply(lambda x: x.strip())
lines.source=lines.source.apply(lambda x: re.sub(" +", " ", x))
lines.target=lines.target.apply(lambda x: re.sub(" +", " ", x))

AttributeError: 'float' object has no attribute 'lower'

In [83]:
# Add start and end tokens to target sequences
lines.target = lines.target.apply(lambda x : 'START_ '+ x + ' _END')
lines.sample(6)

Unnamed: 0,source,target,comments
5380,have a good time,START_ START_ START_ मजा कर _END _END _END,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
35262,can you play the guitar yes i can,START_ START_ START_ तुला गिटार वाजवता येते का...,CC-BY 2.0 (France) Attribution: tatoeba.org #7...
35507,the french flag is blue white and red,START_ START_ START_ फ्रेंच झेंडा निळा पांढरा ...,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
32487,his sister is not going to america,START_ START_ START_ त्याची बहीण अमेरिकेला जात...,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
23684,how many were on the plane,START_ START_ START_ विमानावर किती जण होते _EN...,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1263,can you read,START_ START_ START_ तुम्हाला वाचता येतं का _E...,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [84]:
# Find all the source and target words and sort them
# Vocabulary of Source language
all_source_words=set()
for source in lines.source:
    for word in source.split():
        if word not in all_source_words:
            all_source_words.add(word)# Vocabulary of Target 
all_target_words=set()
for target in lines.target:
    for word in target.split():
        if word not in all_target_words:
            all_target_words.add(word)
# sort all unique source and target words
source_words= sorted(list(all_source_words))
target_words=sorted(list(all_target_words))

In [85]:
#Find maximum sentence length in the source and target data
source_length_list=[]
for l in lines.source:
 source_length_list.append(len(l.split(' ')))
max_source_length= max(source_length_list)
print("Max length of the source sentence",max_source_length)
target_length_list=[]
for l in lines.target:
 target_length_list.append(len(l.split(' ')))
max_target_length= max(target_length_list)
print("Max length of the target sentence",max_target_length)

Max length of the source sentence 34
Max length of the target sentence 41


In [86]:
# creating a word to index(word2idx) for source and target
source_word2idx= dict([(word, i+1) for i,word in enumerate(source_words)])
target_word2idx=dict([(word, i+1) for i, word in enumerate(target_words)])

In [87]:
#creating a dictionary for index to word for source and target vocabulary
source_idx2word= dict([(i, word) for word, i in source_word2idx.items()])
print(source_idx2word)
target_idx2word =dict([(i, word) for word, i in target_word2idx.items()])



In [88]:
#Shuffle the data
lines = shuffle(lines)

In [89]:
# Train - Test Split
X, y = lines.source, lines.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train.shape, X_test.shape

((34826,), (3870,))

In [90]:
# Input tokens for encoder
num_encoder_tokens=len(source_words)# Input tokens for decoder zero padded
num_decoder_tokens=len(target_words) +1

In [91]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
 ''' Generate a batch of data '''
 while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_source_length),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_target_length),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_target_length, num_decoder_tokens),dtype='float32')
        for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
            for t, word in enumerate(input_text.split()):
                encoder_input_data[i, t] = source_word2idx[word] 
            for t, word in enumerate(target_text.split()):
                if t<len(target_text.split())-1:
                    decoder_input_data[i, t] = target_word2idx[word] # decoder input seq
                    if t>0:
 # decoder target sequence (one hot encoded)
 # does not include the START_ token
 # Offset by one timestep
 #print(word)
                        decoder_target_data[i, t - 1, target_word2idx[word]] = 1.
        yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [92]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50
latent_dim=256

In [94]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [95]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
 initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [96]:
# Encode the input sequence to get the "Context vectors"
encoder_model = Model(encoder_inputs, encoder_states)# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_state_input)
decoder_states2 = [state_h2, state_c2]
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)# Final decoder model
decoder_model = Model(
 [decoder_inputs] + decoder_state_input,
 [decoder_outputs2] + decoder_states2)

In [108]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of 
    #target sequence with the start character.
    target_seq[0, 0] = target_word2idx['START_']
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word =target_idx2word[sampled_token_index]
        decoded_sentence += ' '+ sampled_word# Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == '_END' or
            len(decoded_sentence) > 50):
            stop_condition = True# Update the target sequence (of length 1).
            target_seq = np.zeros((1,1))
            target_seq[0, 0] = sampled_token_index# Update states
            states_value = [h, c]
    return decoded_sentence

In [106]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence as per data:', X_train[k:k+1].values[0])
print('Actual Marathi Translation as per data:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation predicted by model:', decoded_sentence[:-4])

Input English sentence as per data: whats going on today
Actual Marathi Translation as per data:  START_ START_ आज काय चाललंय _END _END 
Predicted Marathi Translation predicted by model:  बुद्धी बुद्धी बुद्धी बुद्धी बुद्धी बुद्धी बुद्धी बु


In [107]:
test_gen = generate_batch(X_test, y_test, batch_size = 1)
k=10
k+=1
(input_seq, actual_output), _ = next(test_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Source sentence:', X_test[k:k+1].values[0])
print('Actual Target Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Target Translation:', decoded_sentence[:-4])

Input Source sentence: we can paint your room any color you want
Actual Target Translation:  START_ START_ आम्ही तुझ्या खोलीला हवा तो रंग मारू शकतो _END _END 
Predicted Target Translation:  तीसपर्यंत तीसपर्यंत तीसपर्यंत तीसपर्यंत तीसपर्यंत तीसपर
