In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
for root, dirs, files in os.walk('/content/drive/MyDrive'):
    for file in files:
        if file.endswith('.zip'):
            print(os.path.join(root, file))


/content/drive/MyDrive/train (2).zip
/content/drive/MyDrive/test (2).zip
/content/drive/MyDrive/valid (2).zip
/content/drive/MyDrive/recognition (2).zip
/content/drive/MyDrive/archive (2).zip
/content/drive/MyDrive/train_data/train.zip
/content/drive/MyDrive/valid_data/valid.zip
/content/drive/MyDrive/test_data/test.zip


In [5]:
from zipfile import ZipFile

zip_path = '/content/drive/MyDrive/archive (2).zip'  # Replace with correct path
extract_to = '/content/drive/MyDrive/extracted_data'
os.makedirs(extract_to, exist_ok=True)

with ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("Extracted files:", os.listdir(extract_to))


Extracted files: ['hindi_english_parallel.csv']


#  Load the CSV Data with Pandas


In [6]:
import pandas as pd

csv_path = '/content/drive/MyDrive/extracted_data/hindi_english_parallel.csv'
df = pd.read_csv(csv_path)
df = df.sample(n=1000, random_state=42)  # Use 2000 or lower if RAM is an issue
print(df.head())


                                                     hindi  \
957248   बडे पैमाने पर सुनामी से प्रभावीत जापान में 4 द...   
1072034                           वर्ग का पूर्णा क्या था?    
1195844                         मैं अपना काम कर चुका हूँ।    
1123517                  राष्ट्रीय मनः स्वास्थ्य कार्यक्रम   
933515                                           क्रियावली   

                                                   english  
957248   4 days after the massive tsunami struck Japan,...  
1072034                    What was completing the square?  
1195844                       I have already done my work.  
1123517                   National Mental Health Programme  
933515                                                menu  


#  Data Cleaning & Preparation

In [7]:
import re

def clean_text(text):
    text = str(text).strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s<>]', '', text)
    return text

input_texts = df['hindi'].apply(clean_text).tolist()
target_texts = df['english'].apply(clean_text).tolist()
target_texts = ['<start> ' + t + ' <end>' for t in target_texts]

print(input_texts[:3])
print(target_texts[:3])


['बड पमन पर सनम स परभवत जपन म 4 दन बद कई अभ तक जद हन क आशए लपत ह रह थ', 'वरग क परण कय थ', 'म अपन कम कर चक ह']
['<start> 4 days after the massive tsunami struck Japan hopes of finding anyone still alive were fading <end>', '<start> What was completing the square <end>', '<start> I have already done my work <end>']


# filter by length

In [8]:
min_len, max_len = 3, 50
filtered = [(inp, tgt) for inp, tgt in zip(input_texts, target_texts)
            if min_len <= len(inp.split()) <= max_len and min_len <= len(tgt.split()) <= max_len]
input_texts, target_texts = zip(*filtered)


#  Tokenization and Padding


In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_texts)
input_seqs = input_tokenizer.texts_to_sequences(input_texts)

target_tokenizer = Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_texts)
target_seqs = target_tokenizer.texts_to_sequences(target_texts)

max_input_len = max(len(seq) for seq in input_seqs)
max_target_len = max(len(seq) for seq in target_seqs)
input_seqs = pad_sequences(input_seqs, maxlen=max_input_len, padding='post')
target_seqs = pad_sequences(target_seqs, maxlen=max_target_len, padding='post')

print(f'Input sequences shape: {input_seqs.shape}')
print(f'Target sequences shape: {target_seqs.shape}')


Input sequences shape: (652, 50)
Target sequences shape: (652, 50)


In [10]:
import tensorflow as tf
from tensorflow.keras import layers


# encoder with LSTM

In [11]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.lstm = layers.LSTM(enc_units, return_sequences=True, return_state=True)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state_h, state_c = self.lstm(x, initial_state=hidden)
        return output, state_h, state_c

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))]


# Bahdanau Attention

In [12]:
class BahdanauAttention(layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = layers.Dense(units)
        self.W2 = layers.Dense(units)
        self.V = layers.Dense(1)

    def call(self, query, values):
        # query: Decoder hidden state (batch, units)
        # values: Encoder output (batch, seq_len, units)
        query_with_time_axis = tf.expand_dims(query, 1)  # (batch, 1, units)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)  # (batch, seq_len, 1)
        context_vector = attention_weights * values  # (batch, seq_len, units)
        context_vector = tf.reduce_sum(context_vector, axis=1)  # (batch, units)
        return context_vector, attention_weights


# decoder

In [13]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.lstm = layers.LSTM(dec_units, return_sequences=True, return_state=True)
        self.fc = layers.Dense(vocab_size)
        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden[0], enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x, initial_state=hidden)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state_h, state_c, attention_weights


# Build Model Utility

In [14]:
def build_model(input_vocab_size, target_vocab_size, embedding_dim=256, units=512, batch_size=64):
    encoder = Encoder(input_vocab_size, embedding_dim, units, batch_size)
    decoder = Decoder(target_vocab_size, embedding_dim, units, batch_size)
    return encoder, decoder


In [15]:
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1


# Prepare Batches for Training

In [16]:
import tensorflow as tf

BATCH_SIZE = 64
BUFFER_SIZE = len(input_seqs)  # usually dataset size

dataset = tf.data.Dataset.from_tensor_slices((input_seqs, target_seqs))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
steps_per_epoch = len(input_seqs) // BATCH_SIZE

print(f"Steps per epoch: {steps_per_epoch}")


Steps per epoch: 10


# loss function

In [17]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


# Training Step Function

# Training Loop


In [18]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden_h, enc_hidden_c = encoder(inp, enc_hidden)
        dec_hidden = [enc_hidden_h, enc_hidden_c]
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * inp.shape[0], 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden_h, dec_hidden_c, _ = decoder(dec_input, dec_hidden, enc_output)
            dec_hidden = [dec_hidden_h, dec_hidden_c]
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss


In [19]:
encoder, decoder = build_model(input_vocab_size, target_vocab_size, embedding_dim=256, units=512, batch_size=BATCH_SIZE)


In [20]:
optimizer = tf.keras.optimizers.Adam()


In [21]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)


In [22]:
EPOCHS = 2

for epoch in range(EPOCHS):
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 10 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    # Save checkpoint every epoch
    checkpoint.save(file_prefix=checkpoint_prefix)
    print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')


Epoch 1 Batch 0 Loss 2.7001
Epoch 1 Loss 2.2318
Epoch 2 Batch 0 Loss 1.6724
Epoch 2 Loss 2.0296


# Translate Function

In [23]:
def translate(sentence, encoder, decoder, input_tokenizer, target_tokenizer, max_length=None):
    if max_length is None:
        max_length = max(len(seq) for seq in input_seqs)
    # Tokenize and pad input sentence
    seq = input_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_length, padding='post')
    seq = tf.convert_to_tensor(seq)

    # Encode
    result = ''
    # Use a batch size of 1 for inference
    hidden = [tf.zeros((1, encoder.enc_units)), tf.zeros((1, encoder.enc_units))]
    enc_out, enc_hidden_h, enc_hidden_c = encoder(seq, hidden)
    dec_hidden = [enc_hidden_h, enc_hidden_c]
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)

    for i in range(max_length):
        predictions, dec_hidden_h, dec_hidden_c, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        dec_hidden = [dec_hidden_h, dec_hidden_c]
        predicted_id = tf.argmax(predictions[0]).numpy()
        word = target_tokenizer.index_word.get(predicted_id, '')
        if word == '<end>':
            break
        result += word + ' '
        dec_input = tf.expand_dims([predicted_id], 0)
        # dec_hidden = [dec_hidden_h, dec_hidden_c] # This line is redundant

    return result.strip()

# Test Example Translations

In [41]:
def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden[0], enc_output)
    x = self.embedding(x)  # (batch, 1, embedding_dim)
    context_vector = tf.expand_dims(context_vector, 1)  # (batch, 1, units)
    x = tf.concat([context_vector, x], axis=-1)  # (batch, 1, embedding_dim + units)
    output, state_h, state_c = self.lstm(x, initial_state=hidden)
    output = tf.reshape(output, (-1, output.shape[2]))  # (batch, units)
    x = self.fc(output)  # (batch, vocab)
    return x, state_h, state_c, attention_weights


In [43]:
# Re-create models for inference
embedding_dim = 256   # use your training setting
units = 512           # use your training setting

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

inference_encoder = Encoder(input_vocab_size, embedding_dim, units, batch_sz=1)
inference_decoder = Decoder(target_vocab_size, embedding_dim, units, batch_sz=1)

# Restore weights if you saved checkpoints:
# checkpoint = tf.train.Checkpoint(encoder=inference_encoder, decoder=inference_decoder)
# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))


In [44]:
test_sentence = input_texts[0]
max_length = input_seqs.shape[1]

seq = input_tokenizer.texts_to_sequences([test_sentence])
seq = pad_sequences(seq, maxlen=max_length, padding='post')
seq = tf.convert_to_tensor(seq)

enc_hidden = inference_encoder.initialize_hidden_state()  # batch=1
enc_out, enc_hidden_h, enc_hidden_c = inference_encoder(seq, enc_hidden)
dec_hidden = [enc_hidden_h, enc_hidden_c]
dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)

output_sentence = ""
for t in range(max_length):
    predictions, dec_hidden_h, dec_hidden_c, attention_weights = inference_decoder(dec_input, dec_hidden, enc_out)
    predicted_id = tf.argmax(predictions[0]).numpy().item()
    word = target_tokenizer.index_word.get(predicted_id, "")
    if word == '<end>':
        break
    output_sentence += word + " "
    dec_input = tf.expand_dims([predicted_id], 0)
    dec_hidden = [dec_hidden_h, dec_hidden_c]

print("Hindi:", test_sentence)
print("Predicted English:", output_sentence.strip())


Hindi: बड पमन पर सनम स परभवत जपन म 4 दन बद कई अभ तक जद हन क आशए लपत ह रह थ
Predicted English: negative drum occupying drum mexico smooth recline recline recline deprive nutrients nutrients entrusted nutrients entrusted inhabiting inhabiting inhabiting inhabiting safeguards hell safeguards cheap year form capable unjust families entrepreneurship consider farm farm true tar tar tar female tar female tar female independence tar betray anti read read insolence root utilize


In [45]:
def translate(sentence):
    seq = input_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_length, padding='post')
    seq = tf.convert_to_tensor(seq)
    enc_hidden = inference_encoder.initialize_hidden_state()
    enc_out, enc_hidden_h, enc_hidden_c = inference_encoder(seq, enc_hidden)
    dec_hidden = [enc_hidden_h, enc_hidden_c]
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    output_sentence = ""
    for t in range(max_length):
        predictions, dec_hidden_h, dec_hidden_c, attention_weights = inference_decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy().item()
        word = target_tokenizer.index_word.get(predicted_id, "")
        if word == '<end>':
            break
        output_sentence += word + " "
        dec_input = tf.expand_dims([predicted_id], 0)
        dec_hidden = [dec_hidden_h, dec_hidden_c]
    return output_sentence.strip()


In [46]:
# Test translation on a few validation/test sample sentences
for idx in range(5):
    hnd = input_texts[idx]
    print('Hindi:', hnd)
    print('Predicted English:', translate(hnd))
    print('-'*40)


Hindi: बड पमन पर सनम स परभवत जपन म 4 दन बद कई अभ तक जद हन क आशए लपत ह रह थ
Predicted English: negative drum occupying drum mexico smooth recline recline recline deprive nutrients nutrients entrusted nutrients entrusted inhabiting inhabiting inhabiting inhabiting safeguards hell safeguards cheap year form capable unjust families entrepreneurship consider farm farm true tar tar tar female tar female tar female independence tar betray anti read read insolence root utilize
----------------------------------------
Hindi: वरग क परण कय थ
Predicted English: negative bring bring gorget gorget raga consider research vidya consider vaishnava netaji netaji comes vichitrita junk gorget gorget gorget junput terms junk gorget gorget gorget junput terms pratahkal junk gorget gorget gorget junput terms pratahkal junk gorget gorget gorget junput terms pratahkal junk gorget gorget gorget junput terms pratahkal junk
----------------------------------------
Hindi: म अपन कम कर चक ह
Predicted English: negati

In [49]:
while True:
    hnd = input("Enter a Hindi sentence (or 'exit'): ")
    if hnd.lower() == 'exit':
        break
    print('Predicted English:', translate(hnd))
    print('-'*40)


Enter a Hindi sentence (or 'exit'): रषटरय मन सवसथय करयकरम
Predicted English: negative bring bring gorget gorget gorget raga consider consider vaishnava netaji vichitrita vaishnava senior owned smiling smiling smiling vidya vidya consider vidya consider vaishnava consider vaishnava netaji netaji 1932 1932 netaji semblance semblance transform off 1932 1932 1932 1932 semblance starts setting off marriage college s s s s s
----------------------------------------


KeyboardInterrupt: Interrupted by user