## Import Dataset

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("dataset/Hindi_English_Truncated_Corpus.csv", encoding='utf-8')
df = df[df['source'] == 'ted']

df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,So there is some sort of justice,तो वहाँ न्याय है
23,ted,This changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced.,उत्पन्न नहीं कि जाती थी.
30,ted,"And you can see, this LED is going to glow.","और जैसा आप देख रहे है, ये एल.ई.डी. जल उठेगी।"
32,ted,to turn on the lights or to bring him a glass ...,"लाईट जलाने के लिए या उनके लिए पानी लाने के लिए,"
35,ted,Can you imagine saying that?,क्या आप ये कल्पना कर सकते है


In [5]:
print(df["english_sentence"][30])

And you can see, this LED is going to glow.


## Data Preparation, Data Transformation

In [6]:
len(df[df['english_sentence'].isna()])

0

In [7]:
len(df[df['hindi_sentence'].isna()])

0

### Here, Remove NaN

In [8]:
df.dropna(subset="english_sentence", inplace=True)

In [9]:
### Do convert into lowercase
df['english_sentence'] = df['english_sentence'].apply(lambda x: x.lower())
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: x.lower())

df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"i'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"and who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,so there is some sort of justice,तो वहाँ न्याय है
23,ted,this changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced.,उत्पन्न नहीं कि जाती थी.
30,ted,"and you can see, this led is going to glow.","और जैसा आप देख रहे है, ये एल.ई.डी. जल उठेगी।"
32,ted,to turn on the lights or to bring him a glass ...,"लाईट जलाने के लिए या उनके लिए पानी लाने के लिए,"
35,ted,can you imagine saying that?,क्या आप ये कल्पना कर सकते है


In [10]:
### Remove Single quote
import re

df['english_sentence'] = df['english_sentence'].apply(lambda x: re.sub("'", "", x))
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: re.sub("'", "", x))

df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"id like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,ted,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"and who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,so there is some sort of justice,तो वहाँ न्याय है
23,ted,this changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced.,उत्पन्न नहीं कि जाती थी.
30,ted,"and you can see, this led is going to glow.","और जैसा आप देख रहे है, ये एल.ई.डी. जल उठेगी।"
32,ted,to turn on the lights or to bring him a glass ...,"लाईट जलाने के लिए या उनके लिए पानी लाने के लिए,"
35,ted,can you imagine saying that?,क्या आप ये कल्पना कर सकते है


In [11]:
### Remove Special character
import string

print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [12]:
df['english_sentence'] = df['english_sentence'].apply(lambda x: ''.join(wd for wd in x if wd not in string.punctuation))
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: ''.join(wd for wd in x if wd not in string.punctuation))

df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...
1,ted,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी
3,ted,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,and who are we to say even that they are wrong,और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,so there is some sort of justice,तो वहाँ न्याय है
23,ted,this changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced,उत्पन्न नहीं कि जाती थी
30,ted,and you can see this led is going to glow,और जैसा आप देख रहे है ये एलईडी जल उठेगी।
32,ted,to turn on the lights or to bring him a glass ...,लाईट जलाने के लिए या उनके लिए पानी लाने के लिए
35,ted,can you imagine saying that,क्या आप ये कल्पना कर सकते है


In [13]:
### Remove Digits
df['english_sentence'] = df['english_sentence'].apply(lambda x: x.translate(str.maketrans('', '', string.digits)))
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: x.translate(str.maketrans('', '', string.digits)))

df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...
1,ted,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी
3,ted,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,and who are we to say even that they are wrong,और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,so there is some sort of justice,तो वहाँ न्याय है
23,ted,this changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced,उत्पन्न नहीं कि जाती थी
30,ted,and you can see this led is going to glow,और जैसा आप देख रहे है ये एलईडी जल उठेगी।
32,ted,to turn on the lights or to bring him a glass ...,लाईट जलाने के लिए या उनके लिए पानी लाने के लिए
35,ted,can you imagine saying that,क्या आप ये कल्पना कर सकते है


In [14]:
### Remove extra spaces
df['english_sentence'] = df['english_sentence'].apply(lambda x: x.strip())
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: x.strip())

df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...
1,ted,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी
3,ted,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,and who are we to say even that they are wrong,और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,so there is some sort of justice,तो वहाँ न्याय है
23,ted,this changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced,उत्पन्न नहीं कि जाती थी
30,ted,and you can see this led is going to glow,और जैसा आप देख रहे है ये एलईडी जल उठेगी।
32,ted,to turn on the lights or to bring him a glass ...,लाईट जलाने के लिए या उनके लिए पानी लाने के लिए
35,ted,can you imagine saying that,क्या आप ये कल्पना कर सकते है


In [15]:
### Add Start & End Tokens
#df['english_sentence'] = df['english_sentence'].apply(lambda x: 'SOF ' + x + ' EOF')
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: 'SOF ' + x + ' EOF')

df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,SOF राजनीतिज्ञों के पास जो कार्य करना चाहिए वह...
1,ted,id like to tell you about one such child,SOF मई आपको ऐसे ही एक बच्चे के बारे में बताना ...
3,ted,what we really mean is that theyre bad at not ...,SOF हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे ...
7,ted,and who are we to say even that they are wrong,SOF और हम होते कौन हैं यह कहने भी वाले कि वे ग...
13,ted,so there is some sort of justice,SOF तो वहाँ न्याय है EOF
23,ted,this changed slowly,SOF धीरे धीरे ये सब बदला EOF
26,ted,were being produced,SOF उत्पन्न नहीं कि जाती थी EOF
30,ted,and you can see this led is going to glow,SOF और जैसा आप देख रहे है ये एलईडी जल उठेगी। EOF
32,ted,to turn on the lights or to bring him a glass ...,SOF लाईट जलाने के लिए या उनके लिए पानी लाने के...
35,ted,can you imagine saying that,SOF क्या आप ये कल्पना कर सकते है EOF


In [16]:
### Add Length of English Word, and Hindi word to df variable
from nltk import word_tokenize

df['len_english_sentence'] = df['english_sentence'].apply(lambda x: len(word_tokenize(x)))
df['len_hindi_sentence'] = df['hindi_sentence'].apply(lambda x: len(word_tokenize(x)))




In [17]:
df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,len_english_sentence,len_hindi_sentence
0,ted,politicians do not have permission to do what ...,SOF राजनीतिज्ञों के पास जो कार्य करना चाहिए वह...,12,15
1,ted,id like to tell you about one such child,SOF मई आपको ऐसे ही एक बच्चे के बारे में बताना ...,9,13
3,ted,what we really mean is that theyre bad at not ...,SOF हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे ...,12,13
7,ted,and who are we to say even that they are wrong,SOF और हम होते कौन हैं यह कहने भी वाले कि वे ग...,11,15
13,ted,so there is some sort of justice,SOF तो वहाँ न्याय है EOF,7,6
23,ted,this changed slowly,SOF धीरे धीरे ये सब बदला EOF,3,7
26,ted,were being produced,SOF उत्पन्न नहीं कि जाती थी EOF,3,7
30,ted,and you can see this led is going to glow,SOF और जैसा आप देख रहे है ये एलईडी जल उठेगी। EOF,10,12
32,ted,to turn on the lights or to bring him a glass ...,SOF लाईट जलाने के लिए या उनके लिए पानी लाने के...,13,13
35,ted,can you imagine saying that,SOF क्या आप ये कल्पना कर सकते है EOF,5,9


In [18]:
df[df['len_english_sentence'] > 20].shape

(4, 5)

In [19]:
df = df[df['len_english_sentence'] <= 20]
df = df[df['len_hindi_sentence'] <= 20]

In [20]:
print("Max length of English Sentence: ", max(df['len_english_sentence']))
print("Max length of Hindi Sentence: ", max(df['len_hindi_sentence']))

Max length of English Sentence:  20
Max length of Hindi Sentence:  20


In [21]:
df.shape

(39516, 5)

In [22]:
df['english_sentence'][0]

'politicians do not have permission to do what needs to be done'

In [23]:
allEngWord = set()
for engSent in df['english_sentence']:
    for word in engSent.split():
        if word not in allEngWord:
            allEngWord.add(word)
            
print(len(allEngWord))

17200


In [24]:
allHinWord = set()
for hinSent in df['hindi_sentence']:
    for word in hinSent.split():
        if word not in allHinWord:
            allHinWord.add(word)
            
print(len(allHinWord))

22043


In [25]:
engWordSorted = sorted(list(allEngWord))
hinWordSorted = sorted(list(allHinWord))

num_encoder_tokens = len(allEngWord)
num_decoder_tokens = len(allHinWord)

num_encoder_tokens, num_decoder_tokens

(17200, 22043)

In [26]:
max_length_src = max(df["len_english_sentence"])
max_length_tar = max(df["len_hindi_sentence"])

print("Max. length in English: ", max_length_src)
print("Max. length in Hindi: ", max_length_tar)

Max. length in English:  20
Max. length in Hindi:  20


In [27]:
num_decoder_tokens += 1 # For zero padding

In [28]:
english_token_index = dict(
    [(word, (i+1)) for i, word in enumerate(engWordSorted)]
)

print(english_token_index)



In [29]:
hindi_token_index = dict(
    [(word, (i+1)) for i, word in enumerate(hinWordSorted)]
)

In [30]:
reverse_english_token_index = dict(
    (i, word) for word, i in english_token_index.items()
)

reverse_hindi_token_index = dict(
    (i, word) for word, i in hindi_token_index.items()
)

In [31]:
df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,len_english_sentence,len_hindi_sentence
0,ted,politicians do not have permission to do what ...,SOF राजनीतिज्ञों के पास जो कार्य करना चाहिए वह...,12,15
1,ted,id like to tell you about one such child,SOF मई आपको ऐसे ही एक बच्चे के बारे में बताना ...,9,13
3,ted,what we really mean is that theyre bad at not ...,SOF हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे ...,12,13
7,ted,and who are we to say even that they are wrong,SOF और हम होते कौन हैं यह कहने भी वाले कि वे ग...,11,15
13,ted,so there is some sort of justice,SOF तो वहाँ न्याय है EOF,7,6
23,ted,this changed slowly,SOF धीरे धीरे ये सब बदला EOF,3,7
26,ted,were being produced,SOF उत्पन्न नहीं कि जाती थी EOF,3,7
30,ted,and you can see this led is going to glow,SOF और जैसा आप देख रहे है ये एलईडी जल उठेगी। EOF,10,12
32,ted,to turn on the lights or to bring him a glass ...,SOF लाईट जलाने के लिए या उनके लिए पानी लाने के...,13,13
35,ted,can you imagine saying that,SOF क्या आप ये कल्पना कर सकते है EOF,5,9


In [32]:
from sklearn.model_selection import train_test_split

x, y = df['english_sentence'], df['hindi_sentence']
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=42)

xTrain.shape, xTest.shape

((31612,), (7904,))

In [33]:
def generate_batch(x=xTrain, y=yTrain, batch_size=128):
    #print("Hello")
    while True:
        for j in range(0, len(x), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype="float32")
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype="float32")
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype="float32")
            
            for i, (input_text, target_text) in enumerate(zip(x[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = english_token_index[word] # encoder input seq
                    
                for t, word in enumerate(target_text.split()):
                    if t < len(target_text.split()) - 1:
                        decoder_input_data[i, t] = hindi_token_index[word] # decoder input seq
                        
                    if t > 0:
                        # decoder target sequence (one hot encoded)
                        # does not include the SOF_ token
                        # offset by one timestep
                        decoder_target_data[i, t-1, hindi_token_index[word]] = 1.
            
            #print([encoder_input_data, decoder_input_data], " : " , decoder_target_data)
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder-Decoder Architecture

In [34]:
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

2022-09-26 13:09:22.285620: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-26 13:09:25.698961: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-26 13:09:33.170816: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-26 13:09:33.171199: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or dire

In [35]:
latent_dim = 300

In [36]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# We discard 'encoder_outputs' and only keep the states.
encoder_states = [state_h, state_c]

2022-09-26 13:09:44.199660: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-09-26 13:09:44.199765: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (cskushal18): /proc/driver/nvidia/version does not exist
2022-09-26 13:09:44.201075: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-26 13:09:44.575389: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 20640000 exceeds 10% of free system memory.
2022-09-26 13:09:44.704791: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 20640000 exceeds 10% of free system memory.
2022-09-26 13:09:44

In [37]:
# Set up the decoder, using 'encoder_states' as inital state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn 'encoder_input_data' & 'decoder_input_data' into 'decoder_target_data'
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

2022-09-26 13:10:01.829223: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 26452800 exceeds 10% of free system memory.
2022-09-26 13:10:01.872094: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 26452800 exceeds 10% of free system memory.


In [38]:
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

In [39]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 300)    5160000     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 300)    6613200     ['input_2[0][0]']                
                                                                                              

In [40]:
train_samples = len(xTrain)
test_samples = len(xTest)
batch_size = 128
epochs = 15

In [41]:
model.fit(xTrain, yTrain, batch_size=batch_size, epochs=epochs)

Epoch 1/15


ValueError: in user code:

    File "/home/cskushal18/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/home/cskushal18/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/cskushal18/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/home/cskushal18/.local/lib/python3.10/site-packages/keras/engine/training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "/home/cskushal18/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/cskushal18/.local/lib/python3.10/site-packages/keras/engine/input_spec.py", line 216, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 1) dtype=string>]
