<a href="https://www.kaggle.com/code/ainurrohmanbwx/english-to-hindi-machine-translation-bigru-att?scriptVersionId=145084310" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Introduction

In this project, I have developed a machine translation model designed to translate text from English to Hindi. The model utilizes a Bidirectional GRU (Gated Recurrent Unit) with an integrated Attention Mechanism, is optimized using the Adam optimizer, and employs categorical crossentropy as the loss function. The primary goal of this model is to generate accurate and efficient translations from English to Hindi.

# Import Data

In [None]:
# Disable warning

import warnings

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
import pandas as pd

# Read the English–Hindi parallel corpus from the Kaggle input directory
df = pd.read_csv(filepath_or_buffer='/kaggle/input/english-to-hindi-parallel-dataset/newdata.csv')
# Preview the first five rows
df.head(5)

In [None]:
# (rows, columns) of the dataset as loaded, shown before any sampling or cleaning
df.shape

# Data Preprocessing

In [None]:
# Work on a reproducible random subset of at most 20,000 rows; capping at len(df)
# keeps the cell runnable on frames that have fewer rows than that.
df = df.sample(n=min(20000, len(df)), random_state=42)
# Drop rows missing either side of the sentence pair: the cleaning cells below
# call string methods on both columns and would fail on a NaN value.
df = df.dropna(subset=['english_sentence', 'hindi_sentence'])
# Remove duplicate rows (returns a new frame rather than mutating a filtered one in place)
df = df.drop_duplicates()

In [None]:
# Convert English and Hindi sentences to lowercase using the vectorized string
# accessor instead of a Python-level lambda per row.
df['english_sentence'] = df['english_sentence'].str.lower()
df['hindi_sentence'] = df['hindi_sentence'].str.lower()

In [None]:
import re
# Remove single quotes (apostrophes) from English and Hindi sentences.
# regex=False: the pattern is a literal character, so no regex engine is needed.
df['english_sentence'] = df['english_sentence'].str.replace("'", '', regex=False)
df['hindi_sentence'] = df['hindi_sentence'].str.replace("'", '', regex=False)

In [None]:
import string

exclude = set(string.punctuation)  # Set of ASCII punctuation characters to strip
# NOTE(review): string.punctuation is ASCII-only, so Devanagari marks such as the
# danda are left in the Hindi text — confirm that this is intended.

# Delete every excluded character in a single translate pass per sentence
_punctuation_table = str.maketrans('', '', ''.join(sorted(exclude)))
df['english_sentence'] = df['english_sentence'].str.translate(_punctuation_table)
df['hindi_sentence'] = df['hindi_sentence'].str.translate(_punctuation_table)

In [None]:
from string import digits

remove_digits = str.maketrans('', '', digits)  # Translation table that deletes the ASCII digits 0-9

# English: drop ASCII digits, trim the ends, then collapse runs of spaces.
df['english_sentence'] = (
    df['english_sentence']
    .str.translate(remove_digits)
    .str.strip()
    .str.replace(" +", " ", regex=True)
)

# Hindi: same steps, plus removal of the Devanagari digits (the character class
# below lists all ten of them). regex=True is explicit because the default of
# Series.str.replace differs between pandas versions.
df['hindi_sentence'] = (
    df['hindi_sentence']
    .str.translate(remove_digits)
    .str.replace("[२३०८१५७९४६]", "", regex=True)
    .str.strip()
    .str.replace(" +", " ", regex=True)
)

# Wrap every target sentence in start / end markers for the decoder
df['hindi_sentence'] = 'START_ ' + df['hindi_sentence'] + ' _END'

In [None]:
# Source vocabulary: every distinct whitespace-separated token in the English column
all_eng_words = {token for sentence in df['english_sentence'] for token in sentence.split()}

# Target vocabulary: every distinct whitespace-separated token in the Hindi column
all_hindi_words = {token for sentence in df['hindi_sentence'] for token in sentence.split()}

# Sentence lengths in words. Splitting on a single space yields one more piece
# than there are spaces, so the count is computed directly from the spaces.
df['length_eng_sentence'] = [sentence.count(" ") + 1 for sentence in df['english_sentence']]
df['length_hin_sentence'] = [sentence.count(" ") + 1 for sentence in df['hindi_sentence']]

In [None]:
from sklearn.utils import shuffle

# Keep only sentence pairs where both sides have at most 20 words
df = df[(df['length_eng_sentence'] <= 20) & (df['length_hin_sentence'] <= 20)]

# Longest remaining sequence on each side. English is the source language and
# Hindi the target, so each maximum is read from the matching length column.
max_length_src = int(df['length_eng_sentence'].max())  # Maximum length of English (source) sentence
max_length_tar = int(df['length_hin_sentence'].max())  # Maximum length of Hindi (target) sentence

# NOTE(review): all_eng_words / all_hindi_words were collected before the length
# filter above, so they may contain words that no longer occur in df.
input_words = sorted(all_eng_words)  # Sorted list of unique English words
target_words = sorted(all_hindi_words)  # Sorted list of unique Hindi words

# Token ids start at 1 because id 0 is reserved for zero padding, so both
# vocabulary sizes need one extra slot; otherwise the largest id falls outside
# the embedding table / one-hot axis sized by these counts.
num_encoder_tokens = len(all_eng_words) + 1
num_decoder_tokens = len(all_hindi_words) + 1

input_token_index = {word: i for i, word in enumerate(input_words, start=1)}  # English word -> id
target_token_index = {word: i for i, word in enumerate(target_words, start=1)}  # Hindi word -> id
reverse_input_char_index = {i: word for word, i in input_token_index.items()}  # id -> English word
reverse_target_char_index = {i: word for word, i in target_token_index.items()}  # id -> Hindi word

# Shuffle the rows with a fixed seed so later position-based steps see the same order on every run
df = shuffle(df, random_state=42)

# Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Source (English) and target (Hindi) sentence columns
X = df['english_sentence']
y = df['hindi_sentence']

# Hold out 10% of the sentence pairs; the split itself uses a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Persist the English side of both splits to the Kaggle working directory
for _series, _path in ((X_train, '/kaggle/working/X_train.pkl'), (X_test, '/kaggle/working/X_test.pkl')):
    _series.to_pickle(_path)

In [None]:
import numpy as np

def generate_batch(X=None, y=None, batch_size=128):
    '''Yield teacher-forcing batches for the encoder-decoder model, cycling forever.

    Relies on the module-level ``max_length_src``, ``max_length_tar``,
    ``num_decoder_tokens``, ``input_token_index`` and ``target_token_index``.

    Args:
        X: Sliceable collection of source sentences (whitespace-separated words).
            Defaults to the module-level ``X_train``, looked up when iteration
            starts so a re-run of the split cell is picked up.
        y: Sliceable collection of target sentences, aligned with ``X``.
            Defaults to the module-level ``y_train``.
        batch_size: Maximum number of sentence pairs per batch.

    Yields:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)`` with
        float32 arrays of shape ``(n, max_length_src)``, ``(n, max_length_tar)``
        and ``(n, max_length_tar, num_decoder_tokens)``, where ``n`` is the
        number of pairs actually in the batch (the last batch of a pass may be
        smaller than ``batch_size``). Unused positions stay 0 (padding).

    Raises:
        KeyError: If a word is missing from the token index dictionaries.
    '''
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    while True:
        for start in range(0, len(X), batch_size):
            batch_inputs = list(X[start:start + batch_size])
            batch_targets = list(y[start:start + batch_size])
            # Size the arrays to the real batch so the final, shorter batch
            # does not carry all-zero filler rows.
            n_rows = min(len(batch_inputs), len(batch_targets))

            encoder_input_data = np.zeros((n_rows, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((n_rows, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((n_rows, max_length_tar, num_decoder_tokens), dtype='float32')

            for i, (input_text, target_text) in enumerate(zip(batch_inputs, batch_targets)):
                source_ids = [input_token_index[word] for word in input_text.split()]
                target_ids = [target_token_index[word] for word in target_text.split()]

                for t, token_id in enumerate(source_ids[:max_length_src]):
                    encoder_input_data[i, t] = token_id

                # Teacher forcing: the decoder reads every target token except the
                # last one and must predict the sequence shifted left by one step.
                # Both views are truncated to max_length_tar, so an over-long
                # target can no longer index past the end of the arrays.
                for t, token_id in enumerate(target_ids[:-1][:max_length_tar]):
                    decoder_input_data[i, t] = token_id
                for t, token_id in enumerate(target_ids[1:][:max_length_tar]):
                    decoder_target_data[i, t, token_id] = 1.

            yield ([encoder_input_data, decoder_input_data], decoder_target_data)

In [None]:
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Bidirectional, GRU, Embedding, Dense, Concatenate, Attention

# Encoder-decoder translation model with attention, built with the Keras functional API.
latent_dim = 300  # Size of the word embeddings and of each GRU direction's hidden state
encoder_inputs = Input(shape=(None,))  # Variable-length sequence of source token ids
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)  # mask_zero=True: id 0 is treated as padding and masked in later layers
encoder_gru = Bidirectional(GRU(latent_dim, return_sequences=True, return_state=True))  # Bidirectional GRU layer
encoder_outputs, forward_h, backward_h = encoder_gru(enc_emb)  # Per-timestep outputs plus the final state of each direction

# The bidirectional wrapper already returns the forward and backward final
# states separately; they are passed on as the decoder's initial states.
encoder_states = [forward_h, backward_h]  # Encoder states

decoder_inputs = Input(shape=(None,))  # Variable-length sequence of target token ids
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)  # Decoder embedding layer (kept as a layer object so it can be reused)
dec_emb = dec_emb_layer(decoder_inputs)  # Embedded decoder input sequence

# NOTE(review): the decoder is also Bidirectional, so its backward direction reads
# the decoder input from the end and has already seen later target tokens when it
# produces the output for an earlier step. That may inflate the reported accuracy
# and has no equivalent in token-by-token decoding — consider a unidirectional
# decoder. TODO confirm against the inference code.
decoder_gru = Bidirectional(GRU(latent_dim, return_sequences=True, return_state=True))  # Bidirectional GRU layer
decoder_outputs, _, _ = decoder_gru(dec_emb, initial_state=encoder_states)  # Per-timestep decoder outputs; the final states are discarded here

# Attention with the decoder outputs as queries and the encoder outputs as values.
# Despite its name, `attention_layer` holds the layer's output tensor, not the layer.
attention_layer = Attention()([decoder_outputs, encoder_outputs])

# Concatenate attention output with decoder outputs
decoder_concat = Concatenate(axis=-1)([decoder_outputs, attention_layer])

# Project each timestep onto the target vocabulary; `decoder_outputs` is rebound
# to the softmax probabilities from here on.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)  # Maps (source ids, decoder input ids) to per-timestep word probabilities
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['acc'])  # categorical_crossentropy expects one-hot targets
model.summary()  # Print model summary

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

train_samples = len(X_train)  # Number of training samples
val_samples = len(X_test)  # Number of validation samples
batch_size = 128  # Batch size
epochs = 100  # Upper bound on epochs; early stopping may end training sooner

# The generators cycle endlessly, so the step counts decide where an epoch ends.
# Rounding up makes one epoch (and one validation run) cover one full pass over
# the data instead of drifting to a different window each epoch, and it keeps
# validation_steps above zero when there are fewer samples than one batch.
steps_per_epoch = (train_samples + batch_size - 1) // batch_size
validation_steps = (val_samples + batch_size - 1) // batch_size

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5, restore_best_weights=True)

# Save the model whenever val_loss improves. verbose=1 prints Keras' built-in
# progress message; the callback has no option for a custom message text.
model_checkpoint = ModelCheckpoint(
    '/kaggle/working/best_model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Train with both callbacks. Model.fit accepts Python generators directly;
# fit_generator is deprecated and absent from newer Keras releases.
history = model.fit(
    generate_batch(X_train, y_train, batch_size=batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
    validation_steps=validation_steps,
    callbacks=[early_stopping, model_checkpoint],
)

# Save the entire model, including architecture and weights, if needed
model.save('/kaggle/working/final_model.h5')

In [None]:
import matplotlib.pyplot as plt

# Draw one figure per tracked metric, each with its training and validation curve.
# Every figure ends with plt.show() so it is rendered explicitly and the cell
# does not leave a Legend repr as its output.
for _metric, _label in (('loss', 'Loss'), ('acc', 'Accuracy')):
    fig, ax = plt.subplots()
    ax.plot(history.history[_metric], label='Training ' + _label)
    ax.plot(history.history['val_' + _metric], label='Validation ' + _label)
    ax.set_title('Training and Validation ' + _label)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(_label)
    ax.legend()
    plt.show()