In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np
import tensorflow as tf
import pandas as pd
import re

In [3]:
batch_size = 64
epochs = 100
latent_dim = 256
num_samples = 10000

In [4]:
data_path = 'dataset/eng_to_hindi.xlsx'

In [5]:

DF = pd.read_excel(data_path)


In [6]:
# Randomly sample 10,000 rows
#df = DF.sample(n=10000)

# Select the first 10,000 rows
df = DF.head(15000)

# Select the last 10,000 rows
#df = DF.tail(10000)



In [7]:
input_text = df['English words/sentences']
input_text= input_text.astype(str)
input_text= input_text.apply(lambda x: x.lower())
input_text= input_text.apply(lambda x: re.sub("[^A-Za-z\s]","",x))
input_text= input_text.apply(lambda x: x.replace("\s+"," ")) # used for replacing one or more consecutive whitespace characters in a string with a single space.
input_text

0                      hi
1                     run
2                     run
3                     who
4                     wow
               ...       
14995    i want to resign
14996    i want to resign
14997    i want to retire
14998    i want to retire
14999    i want to retire
Name: English words/sentences, Length: 15000, dtype: object

In [8]:
target_text= df['Unnamed: 1'].astype(str)
# Convert text to lowercase (though Hindi is case-insensitive)
target_text= target_text.apply(lambda x: x.lower())
# Removing one or more consecutive whitespace
target_text= target_text.apply(lambda x: x.replace("\s+"," "))
 # Remove punctuation and special characters
target_text = target_text.apply(lambda x:re.sub(r'[^\w\s\u0900-\u097F]', '', x))  # Unicode range for Devanagari script
# Remove numbers
target_text = target_text.apply(lambda x:re.sub(r'\d+', '', x))
# Remove english alphabets
target_text = target_text.apply(lambda x: re.sub(r'[a-zA-Z]', '', x))
target_text= "\t "+target_text+" \n"
target_text

0                           \t नमस्ते। \n
1                            \t दौड़ना \n
2                            \t दौड़ना \n
3                               \t कौन \n
4                          \t बहुत खूब \n
                       ...               
14995    \t मैं इस्तीफा देना चाहता हूं \n
14996    \t मैं इस्तीफा देना चाहता हूं \n
14997     \t मैं रिटायर होना चाहता हूं \n
14998     \t मैं रिटायर होना चाहता हूं \n
14999     \t मैं रिटायर होना चाहता हूं \n
Name: Unnamed: 1, Length: 15000, dtype: object

In [9]:
# Creating lists of input and target lines
input_characters = set()
target_characters = set()
input_texts=[]
target_texts=[]

for lines in input_text:
    input_texts.append(lines)
    for char in lines:
        if char not in input_characters:
            input_characters.add(char)
    
for lines in target_text:
    target_texts.append(lines)
    for char in lines:
        if char not in target_characters:
            target_characters.add(char)

In [10]:
target_characters

{'\t',
 '\n',
 ' ',
 'ँ',
 'ं',
 'ः',
 'अ',
 'आ',
 'इ',
 'ई',
 'उ',
 'ऊ',
 'ऋ',
 'ए',
 'ऐ',
 'ऑ',
 'ओ',
 'औ',
 'क',
 'ख',
 'ग',
 'घ',
 'च',
 'छ',
 'ज',
 'झ',
 'ञ',
 'ट',
 'ठ',
 'ड',
 'ढ',
 'ण',
 'त',
 'थ',
 'द',
 'ध',
 'न',
 'प',
 'फ',
 'ब',
 'भ',
 'म',
 'य',
 'र',
 'ल',
 'व',
 'श',
 'ष',
 'स',
 'ह',
 '़',
 'ा',
 'ि',
 'ी',
 'ु',
 'ू',
 'ृ',
 'ॅ',
 'े',
 'ै',
 'ॉ',
 'ो',
 'ौ',
 '्',
 '।'}

In [11]:
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_len = max([len(txt) for txt in input_texts])
max_decoder_seq_len = max([len(txt) for txt in target_texts])

In [12]:
print('num of samples:', len(input_texts))
print('num of unique input tokens:', num_encoder_tokens)
print('num of unique output tokens:', num_decoder_tokens)
print('max seq len for inputs:', max_encoder_seq_len)
print('max seq len for outputs:', max_decoder_seq_len)

num of samples: 15000
num of unique input tokens: 27
num of unique output tokens: 65
max seq len for inputs: 16
max seq len for outputs: 48


In [13]:
input_texts

['hi',
 'run',
 'run',
 'who',
 'wow',
 'fire',
 'help',
 'jump',
 'stop',
 'stop',
 'stop',
 'wait',
 'wait',
 'go on',
 'go on',
 'go on',
 'hello',
 'hello',
 'i see',
 'i try',
 'i won',
 'i won',
 'i won',
 'oh no',
 'attack',
 'attack',
 'cheers',
 'cheers',
 'cheers',
 'cheers',
 'get up',
 'go now',
 'go now',
 'go now',
 'got it',
 'got it',
 'got it',
 'got it',
 'got it',
 'hop in',
 'hop in',
 'hug me',
 'hug me',
 'i fell',
 'i fell',
 'i know',
 'i left',
 'i left',
 'i lied',
 'i lost',
 'i paid',
 'im ',
 'im ok',
 'im ok',
 'listen',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'really',
 'really',
 'really',
 'thanks',
 'we try',
 'we won',
 'we won',
 'we won',
 'we won',
 'ask tom',
 'awesome',
 'be calm',
 'be calm',
 'be calm',
 'be cool',
 'be fair',
 'be fair',
 'be fair',
 'be fair',
 'be fair',
 'be fair',
 'be kind',
 'be nice',
 'be nice',
 'be nice',
 'be nice',
 'be nice',
 'be nice',
 'beat it',
 'cal

In [14]:
target_texts

['\t नमस्ते। \n',
 '\t दौड़ना \n',
 '\t दौड़ना \n',
 '\t कौन \n',
 '\t बहुत खूब \n',
 '\t आग \n',
 '\t मदद करना \n',
 '\t कूदना। \n',
 '\t रुकना \n',
 '\t रुकना \n',
 '\t रुकना \n',
 '\t इंतज़ार \n',
 '\t इंतज़ार \n',
 '\t जारी रखें। \n',
 '\t जारी रखें। \n',
 '\t जारी रखें। \n',
 '\t नमस्ते \n',
 '\t नमस्ते \n',
 '\t अच्छा ऐसा है। \n',
 '\t मैं कोशिश करता हूँ। \n',
 '\t मैं जीत गया \n',
 '\t मैं जीत गया \n',
 '\t मैं जीत गया \n',
 '\t अरे नहीं \n',
 '\t आक्रमण करना \n',
 '\t आक्रमण करना \n',
 '\t प्रोत्साहित करना \n',
 '\t प्रोत्साहित करना \n',
 '\t प्रोत्साहित करना \n',
 '\t प्रोत्साहित करना \n',
 '\t उठना। \n',
 '\t अब जाओ। \n',
 '\t अब जाओ। \n',
 '\t अब जाओ। \n',
 '\t समझ गया \n',
 '\t समझ गया \n',
 '\t समझ गया \n',
 '\t समझ गया \n',
 '\t समझ गया \n',
 '\t अंदर कूदो \n',
 '\t अंदर कूदो \n',
 '\t मुझे गले लगाओ। \n',
 '\t मुझे गले लगाओ। \n',
 '\t मैं गिर गया। \n',
 '\t मैं गिर गया। \n',
 '\t मुझे पता है। \n',
 '\t मैंने। \n',
 '\t मैंने। \n',
 '\t मैंने झूठ बोला \n',
 '\t मेरी हार हु

In [15]:
input_token_index = dict(
    [(char,i) for i, char in enumerate(input_characters)]
)
target_token_index = dict(
    [(char,i) for i, char in enumerate(target_characters)]
)

In [16]:
encoder_input_data = np.zeros(
    (len(input_texts),max_encoder_seq_len, num_encoder_tokens),
    dtype = 'float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_len, num_decoder_tokens),    
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_len, num_decoder_tokens),
    dtype = 'float32')

In [17]:
encoder_input_data.shape

(15000, 16, 27)

In [18]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1
    encoder_input_data[i, t+1:, input_token_index[' ']] = 1
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]]=1
        if t>0:
            decoder_target_data[i, t-1, target_token_index[char]]=1
    decoder_input_data[i,t+1:, target_token_index[' ']] = 1
    decoder_target_data[i, t:, target_token_index[' ']]=1

In [19]:
encoder_input_data[0].shape

(16, 27)

# Model

In [20]:
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, TimeDistributed, Attention

In [21]:
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

2024-08-23 12:36:45.808115: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9474 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5


In [25]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs,_,_ = decoder_lstm(decoder_inputs, initial_state = encoder_states)

# Attention
attention = Attention()
context_vector = attention([decoder_outputs, encoder_outputs])

# Concatenate context vector and LSTM output
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])

decoder_dense = Dense(num_decoder_tokens, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [26]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [27]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [28]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, 
         batch_size = batch_size,
         epochs=epochs,
         validation_split=0.2)

Epoch 1/100


2024-08-23 12:37:56.810919: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.6699 - loss: 1.6824 - val_accuracy: 0.6649 - val_loss: 1.3524
Epoch 2/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7100 - loss: 1.1748 - val_accuracy: 0.7231 - val_loss: 1.0908
Epoch 3/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7665 - loss: 0.9178 - val_accuracy: 0.7534 - val_loss: 0.9305
Epoch 4/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7932 - loss: 0.7914 - val_accuracy: 0.7750 - val_loss: 0.8531
Epoch 5/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8069 - loss: 0.7238 - val_accuracy: 0.7829 - val_loss: 0.8000
Epoch 6/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8176 - loss: 0.6771 - val_accuracy: 0.7917 - val_loss: 0.7619
Epoch 7/100
[1m188/188[0m 

<keras.src.callbacks.history.History at 0x7bff0d2a6440>

In [29]:
encoder_model = Model(encoder_inputs,encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h,decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state= decoder_states_inputs)
decoder_states=[state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs]+decoder_states_inputs, [decoder_outputs]+decoder_states)


In [30]:
reverse_input_char_index= dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index= dict((i, char) for char, i in target_token_index.items())

In [34]:
def decode_sequence(input_seq):
    
    states_value = encoder_model.predict(input_seq, verbose=0)
    
    target_seq = np.zeros((1,1, num_decoder_tokens))
    
    target_seq[0, 0, target_token_index['\t']]=1.
    
    stop_condition= False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h,c = decoder_model.predict([target_seq]+states_value, verbose=0)
        sampled_token_index = np.argmax(output_tokens[0,-1,:])
       
        sampled_char = reverse_target_char_index[sampled_token_index]
        
        decoded_sentence += sampled_char
        if (sampled_char=='\n' or len(decoded_sentence)>max_decoder_seq_len):
            stop_condition=True

        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0,0, sampled_token_index]=1.
        
        states_value = [h, c]
    return decoded_sentence




In [36]:
for seq_index in range(500,600,1):
    input_seq = encoder_input_data[seq_index:seq_index+1]
    decode_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence  :', input_texts[seq_index])
    print('Decoded Sentence:', decode_sentence)

-
Input sentence  : catch tom
Decoded Sentence:  टॉम को पकड़ो 

-
Input sentence  : catch tom
Decoded Sentence:  टॉम को पकड़ो 

-
Input sentence  : catch him
Decoded Sentence:  इसे दीखों 

-
Input sentence  : chill out
Decoded Sentence:  मज़े करें। 

-
Input sentence  : come back
Decoded Sentence:  वापस आओ। 

-
Input sentence  : come back
Decoded Sentence:  वापस आओ। 

-
Input sentence  : come here
Decoded Sentence:  यहाँ आओ। 

-
Input sentence  : come here
Decoded Sentence:  यहाँ आओ। 

-
Input sentence  : come over
Decoded Sentence:  मिलने आना। 

-
Input sentence  : come over
Decoded Sentence:  मिलने आना। 

-
Input sentence  : come over
Decoded Sentence:  मिलने आना। 

-
Input sentence  : come over
Decoded Sentence:  मिलने आना। 

-
Input sentence  : come over
Decoded Sentence:  मिलने आना। 

-
Input sentence  : come over
Decoded Sentence:  मिलने आना। 

-
Input sentence  : come over
Decoded Sentence:  मिलने आना। 

-
Input sentence  : come soon
Decoded Sentence:  जल्दी आओ। 

-
Input senten