In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

# print(os.listdir("../input"))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

# Any results you write to the current directory are saved as output.

  pd.set_option('display.max_colwidth', -1)


In [2]:
lines=pd.read_csv("Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [3]:
lines['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [4]:
lines=lines[lines['source']=='ted']

In [5]:
lines.head(20)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what needs to be done.,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ."
1,ted,"I'd like to tell you about one such child,","मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,"
3,ted,what we really mean is that they're bad at not paying attention.,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,So there is some sort of justice,तो वहाँ न्याय है
23,ted,This changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced.,उत्पन्न नहीं कि जाती थी.
30,ted,"And you can see, this LED is going to glow.","और जैसा आप देख रहे है, ये एल.ई.डी. जल उठेगी।"
32,ted,"to turn on the lights or to bring him a glass of water,","लाईट जलाने के लिए या उनके लिए पानी लाने के लिए,"
35,ted,Can you imagine saying that?,क्या आप ये कल्पना कर सकते है


In [6]:
pd.isnull(lines).sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [7]:
lines=lines[~pd.isnull(lines['english_sentence'])]

In [8]:
lines.drop_duplicates(inplace=True)

* ### Let us pick any 25000 rows from the dataset.

In [9]:
lines=lines.sample(n=25000,random_state=42)
lines.shape

(25000, 3)

In [10]:
# Lowercase all characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.lower())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.lower())

In [11]:
# Remove quotes
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [12]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [13]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.strip())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.strip())
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))


In [14]:
# Add start and end tokens to target sequences
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [15]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन हैं वह कौन है _END
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END
122330,ted,and its not as hard as you think integrate climate solutions into all of your innovations,START_ और जितना आपको लगता है यह उतना कठिन नहीं हैअपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें _END


In [16]:
### Get English and Hindi Vocabulary
all_eng_words=set()
for eng in lines['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in lines['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [17]:
len(all_eng_words)

14030

In [18]:
len(all_hindi_words)

17540

In [19]:
lines['length_eng_sentence']=lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_hin_sentence']=lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [20]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन हैं वह कौन है _END,11,16
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END,2,5
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END,7,8
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END,4,6
122330,ted,and its not as hard as you think integrate climate solutions into all of your innovations,START_ और जितना आपको लगता है यह उतना कठिन नहीं हैअपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें _END,16,20


In [21]:
lines[lines['length_eng_sentence']>30].shape

(0, 5)

In [22]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_hin_sentence']<=20]

In [23]:
lines.shape

(24774, 5)

In [24]:
print("maximum length of Hindi Sentence ",max(lines['length_hin_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [25]:
max_length_src=max(lines['length_hin_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [26]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(14030, 17540)

In [27]:
# num_decoder_tokens += 1 #for zero padding
num_decoder_tokens = num_decoder_tokens + 1 #for zero padding
num_encoder_tokens = num_encoder_tokens + 1 

In [28]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [29]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [30]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
89012,ted,and how far it is away from its parent star,START_ और यह अपने तारे से कितने दूर है _END,10,10
57842,ted,over children have gone through these night schools,START_ बच्चों से ज्यादा रात को पढ चुके है _END,8,10
107947,ted,think of models of terrorism,START_ सोचें जरा आतंकवाद के नमूने को _END,5,8
127346,ted,wandering the sinai desert,START_ सिनाई मरूभूमि में भटकते हुए गुज़ारा _END,4,8
43334,ted,than just how much light you receive overall,START_ अलावा और भी बाते है जानने के लिए _END,8,10
121466,ted,that transformed the lone nut into a leader,START_ जिसने एक अकेले सनकी को नेता में तब्दील कर दिया _END,8,12
71792,ted,the last time china had a quake of that magnitude,START_ पिछली बार जब चीन में इस तीव्रता का भूकंप आया था _END,10,13
124920,ted,so we ran another group of babies,START_ तो हमने बच्चों के एक और समूह पर प्रयोग किया _END,7,12
99338,ted,that theyre going to face,START_ का उन्हें सामना करना पड़ेगा _END,5,7
37161,ted,and i think i was fourteen,START_ उस समय शायद मैं चौदह साल का रहा हूँगा । _END,6,12


### Split the data into train and test

In [31]:
X, y = lines['english_sentence'], lines['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((19819,), (4955,))

### Let us save this data

In [32]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')


In [33]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder-Decoder Architecture

In [34]:
latent_dim=300

In [35]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [36]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [37]:
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 300)            4209300   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 300)            5262300   ['input_2[0][0]']             
                                                                                              

In [38]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [39]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 300)            4209300   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 300)            5262300   ['input_2[0][0]']             
                                                                                              

In [40]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

In [41]:
# model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
#                     steps_per_epoch = train_samples//batch_size,
#                     epochs=epochs,
#                     validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
#                     validation_steps = val_samples//batch_size)

In [42]:
# model.save_weights('nmt_weights.h5')
model.load_weights('nmt_weights.h5')


In [43]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [44]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [45]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)

k=-1


In [46]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: now the reason these microentrepreneurs
Actual Hindi Translation:  अब वह कारण जिससे यह छोटे उद्यमी 
Predicted Hindi Translation:  अब वह कारण क्या है यह छोटे उद्यमी 


In [47]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: that we were seeing a shift in zeitgeist
Actual Hindi Translation:  और ये कि हम सामाजिक चेतना को जागृत कर रहे हैं 
Predicted Hindi Translation:  और ये कि हम सामाजिक चेतना को जागृत कर रहे हैं 


In [48]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: getting out of poverty
Actual Hindi Translation:  गरीबी से बाहर आना। 
Predicted Hindi Translation:  गरीबी से बाहर आना। 


In [49]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: and this woman right here um ahmad
Actual Hindi Translation:  और उम अहमद नाम की इस महिला 
Predicted Hindi Translation:  और फ़िर इस महिला को पूरा कर जाए 


In [51]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

# Initialize lists to store actual and predicted translations
actual_translations = []
predicted_translations = []
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=0

for k in range(20):  
    print(f"Sentence no. {k}")
    (input_seq, actual_output), _ = next(train_gen)
    decoded_sentence = decode_sequence(input_seq)
    actual_translations.append(y_train[k:k+1].values[0][6:-4])  # Actual translation without START_ and _END
    predicted_translations.append(decoded_sentence[:-4])  # Predicted translation without trailing _END
    print(f"English Sentence: {X_train[k:k+1].values[0]}  \nActual Translation: {y_train[k:k+1].values[0][6:-4]} \nPredicted Translation: {decoded_sentence[:-4]}")
    k=k+1
   
bleu_score = corpus_bleu([[reference.split()] for reference in actual_translations],
                         [candidate.split() for candidate in predicted_translations])

# Print the BLEU score
print('BLEU Score:', bleu_score)


Sentence no. 0
English Sentence: now the reason these microentrepreneurs  
Actual Translation:  अब वह कारण जिससे यह छोटे उद्यमी  
Predicted Translation:  अब वह कारण क्या है यह छोटे उद्यमी 
Sentence no. 1
English Sentence: that we were seeing a shift in zeitgeist  
Actual Translation:  और ये कि हम सामाजिक चेतना को जागृत कर रहे हैं  
Predicted Translation:  और ये कि हम सामाजिक चेतना को जागृत कर रहे हैं 
Sentence no. 2
English Sentence: getting out of poverty  
Actual Translation:  गरीबी से बाहर आना।  
Predicted Translation:  गरीबी से बाहर आना। 
Sentence no. 3
English Sentence: and this woman right here um ahmad  
Actual Translation:  और उम अहमद नाम की इस महिला  
Predicted Translation:  और फ़िर इस महिला को पूरा कर जाए 
Sentence no. 4
English Sentence: and this is my drawing pen  
Actual Translation:  और ये मेरा पेन है  
Predicted Translation:  और यह मेरी कहानी अपनी कल्पना धन्यवाद के बारे में
Sentence no. 5
English Sentence: “aah im home” because we all know where home is  
Actual Translat

English Sentence: with my wife and kids  
Actual Translation:  रात्रि भोजन के लिए घर जाऊं  
Predicted Translation:  मेरे और बच्चों को 
Sentence no. 12
English Sentence: because i didnt get caught  
Actual Translation:  कि मैं पकड़ा नहीं गया  
Predicted Translation:  क्योंकि मैंने उन्हें नहीं पता था 
Sentence no. 13
English Sentence: that was detected by the radar stations of great britain  
Actual Translation:  जिसका पता लगा था ग्रेट ब्रिटेन के रडार स्टेसन के द्वारा  
Predicted Translation:  जिसका पता लगा था ग्रेट ब्रिटेन के रडार स्टेसन के द्
Sentence no. 14
English Sentence: when i had learned this phrase  
Actual Translation:  जब मैने यह वाक्यांश याद किया था  
Predicted Translation:  जब मैने यह वाक्यांश याद किया था 
Sentence no. 15
English Sentence: as a boy  
Actual Translation:  बचपन में  
Predicted Translation:  बचपन में 
Sentence no. 16
English Sentence: and it moves up with mao tsetung getting health  
Actual Translation:  और ये माओत्सेसंग के स्वास्थ के साथ ऊपर उठता है  
Predict