In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Attention, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

2024-08-30 15:43:01.941592: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-30 15:43:02.320417: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-30 15:43:02.416492: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-30 15:43:03.169841: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
latent_dim = 256
num_samples = 10000

In [3]:
# Step 1: Load and preprocess the data
# Load dataset (replace with actual path)
data_path = 'dataset/eng_to_hindi.xlsx'  # Replace with actual path
df= pd.read_excel(data_path)

In [4]:
# Randomly sample 10,000 rows
#df = DF.sample(n=10000)

# Select the first 10,000 rows
#df = DF.head(15000)

# Select the last 10,000 rows
#df = DF.tail(15000)

In [5]:
english_sentences = df['English words/sentences'].tolist()
hindi_sentences = df['Unnamed: 1'].tolist()


In [7]:
# Preprocess data

# Add start and end tokens to Hindi sentences
hindi_sentences = ['\t ' + sentence + ' \n' for sentence in hindi_sentences]

# Tokenization and sequence padding
num_words = 10000  # Maximum number of words to keep in tokenizer
max_sequence_length = 50  # Maximum sequence length for padding

In [8]:
# English tokenizer
english_tokenizer = Tokenizer(num_words=num_words, filters=' ')
english_tokenizer.fit_on_texts(english_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
english_sequences_padded = pad_sequences(english_sequences, maxlen=max_sequence_length, padding='post')

In [9]:
# Hindi tokenizer
hindi_tokenizer = Tokenizer(num_words=num_words, filters='')
hindi_tokenizer.fit_on_texts(hindi_sentences)
hindi_sequences = hindi_tokenizer.texts_to_sequences(hindi_sentences)
hindi_sequences_padded = pad_sequences(hindi_sequences, maxlen=max_sequence_length, padding='post')

In [10]:
hindi_sequences_padded.shape

(175621, 50)

In [11]:
hindi_sequences_padded[0]

array([   1, 8954,    2,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [12]:
# Get the total number of unique tokens
num_encoder_tokens = len(english_tokenizer.word_index) + 1
num_decoder_tokens = len(hindi_tokenizer.word_index) + 1

# Define input and target data
encoder_input_data = english_sequences_padded
decoder_input_data = hindi_sequences_padded[:, :-1]
decoder_target_data = hindi_sequences_padded[:, 1:]

# Split data into training and validation sets
encoder_input_train, encoder_input_val, decoder_input_train, decoder_input_val, decoder_target_train, decoder_target_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.2
)

In [13]:
# Step 2: Define the model with bidirectional LSTM and attention

from tensorflow.keras.layers import Dot, Activation

# Encoder
latent_dim = 256
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
encoder_bi_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_bi_lstm(encoder_embedding)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_decoder_tokens, latent_dim * 2)(decoder_inputs)
decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Luong attention mechanism
score_dot_product = Dot(axes=[2, 2])([decoder_lstm_outputs, encoder_outputs])  # Calculate the dot-product score
attention_weights = Activation('softmax')(score_dot_product)  # Softmax normalization to get attention weights
context_vector = Dot(axes=[2, 1])([attention_weights, encoder_outputs])  # Context vector is a weighted sum of encoder outputs

# Concatenate context vector with decoder LSTM outputs
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_lstm_outputs])

# Dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined_context)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary to see the structure
model.summary()


2024-08-30 15:44:10.959796: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9462 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5


In [None]:
# Step 3: Train the model
batch_size = 64
epochs = 50

model.fit(
    [encoder_input_train, decoder_input_train],
    decoder_target_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([encoder_input_val, decoder_input_val], decoder_target_val)
)

Epoch 1/50


2024-08-30 15:45:07.315263: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m342s[0m 153ms/step - accuracy: 0.8532 - loss: 1.0776 - val_accuracy: 0.8724 - val_loss: 0.7989
Epoch 2/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 154ms/step - accuracy: 0.8785 - loss: 0.7575 - val_accuracy: 0.8854 - val_loss: 0.6869
Epoch 3/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 153ms/step - accuracy: 0.8920 - loss: 0.6501 - val_accuracy: 0.8951 - val_loss: 0.6113
Epoch 4/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 154ms/step - accuracy: 0.9014 - loss: 0.5741 - val_accuracy: 0.9051 - val_loss: 0.5473
Epoch 5/50
[1m  87/2196[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:05[0m 145ms/step - accuracy: 0.9080 - loss: 0.5266

In [18]:
# Step 4: Define inference models for prediction

# Encoder model for inference
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Decoder model for inference
decoder_state_input_h = Input(shape=(latent_dim * 2,))
decoder_state_input_c = Input(shape=(latent_dim * 2,))
decoder_hidden_state_input = Input(shape=(None, latent_dim * 2))

decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c]
)

context_vector = attention([decoder_lstm_outputs, decoder_hidden_state_input])
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_lstm_outputs])
decoder_outputs = decoder_dense(decoder_combined_context)

decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs] + [state_h, state_c]
)

In [19]:
print(hindi_tokenizer.word_index)

{'\t': 1, '\n': 2, 'नहीं': 3, 'मैं': 4, 'मुझे': 5, 'कि': 6, 'आप': 7, 'क्या': 8, 'है.': 9, 'में': 10, 'है।': 11, 'के': 12, 'से': 13, 'वह': 14, 'टॉम': 15, 'एक': 16, 'यह': 17, 'की': 18, 'है': 19, 'को': 20, 'कर': 21, 'पर': 22, 'का': 23, 'लिए': 24, 'है?': 25, 'मैंने': 26, 'बहुत': 27, 'हैं।': 28, 'हैं.': 29, 'हैं?': 30, 'आपको': 31, 'कुछ': 32, 'रहा': 33, 'हो': 34, 'ने': 35, 'हम': 36, 'कोई': 37, 'मेरे': 38, 'भी': 39, 'करना': 40, 'करने': 41, 'तुम्हें': 42, 'ऐसा': 43, 'और': 44, 'तुम': 45, 'था।': 46, 'रहे': 47, 'अपने': 48, 'पास': 49, 'लगता': 50, 'तो': 51, 'हूं.': 52, 'चाहता': 53, 'ही': 54, 'उसने': 55, 'आपके': 56, 'इस': 57, 'उसे': 58, 'था.': 59, 'समय': 60, 'बात': 61, 'हूं।': 62, 'सकता': 63, 'इसे': 64, 'पसंद': 65, 'वे': 66, 'मेरी': 67, 'सकते': 68, 'साथ': 69, 'जो': 70, 'अपनी': 71, 'आपने': 72, 'काम': 73, 'हूं': 74, 'हैं': 75, 'आपकी': 76, 'तक': 77, 'हमें': 78, 'गया': 79, 'बारे': 80, 'किसी': 81, 'उन्होंने': 82, 'क्यों': 83, 'पता': 84, 'अब': 85, 'मेरा': 86, 'था': 87, 'घर': 88, 'अपना': 89, 'कभी': 90, 'कर

In [26]:
# Step 5: Function for decoding sequences
def decode_sequence(input_seq):
    # Encode the input as state vectors
    encoder_output, h, c = encoder_model.predict(input_seq,verbose=0)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hindi_tokenizer.word_index['\t']

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [encoder_output, h, c],verbose=0)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = hindi_tokenizer.index_word[sampled_token_index]
        decoded_sentence += sampled_char

        # Add a space if the current character is not a space and the next character is not a punctuation
        if sampled_char not in (' ', '\n') and len(decoded_sentence) > 1 and not decoded_sentence[-2] in (' ', '\n'):
            decoded_sentence += ' '  
        # Exit condition: either hit max length or find stop character
        if (sampled_char == '\n' or len(decoded_sentence) > max_sequence_length):
            stop_condition = True

        # Update the target sequence (length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        h, c = [h, c]

    return decoded_sentence

In [None]:
 'I spoke slowly so that they could understand me.',
 'I started to learn French in junior high school.',
 "I still haven't gotten over what happened to me.",
 'I suggested that we bring the meeting to an end.',
 'I suggested that we bring the meeting to an end.',
 "I suppose you're already packed and ready to go.",
 "I suppose you're already packed and ready to go.",
 "I suppose you're already packed and ready to go.",
 "I suppose you're already packed and ready to go.",
 "I suppose you're already packed and ready to go.",

In [27]:
# Test the model
test_sentence = "I spoke slowly so that they could understand me."
test_sentence_seq = english_tokenizer.texts_to_sequences([test_sentence])
test_sentence_padded = pad_sequences(test_sentence_seq, maxlen=max_sequence_length, padding='post')
decoded_sentence = decode_sequence(test_sentence_padded)
print('Decoded Sentence:', decoded_sentence)

Decoded Sentence: मैं धीरे-धीरे बोला ताकि वे मेरी बात समझ सकें. 

