In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [2]:
data_file = r"C:\Users\krish\Downloads\archive (9)\hin.txt"  

english_sentences = []
hindi_sentences = []

with open(data_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("\t")  # Tab-separated values
        if len(parts) >= 2:
            english_sentences.append(parts[0])  # English text
            hindi_sentences.append(parts[1])  # Hindi text 

In [3]:
eng_tokenizer = Tokenizer(filters='', oov_token="<OOV>")
hin_tokenizer = Tokenizer(filters='', oov_token="<OOV>")

eng_tokenizer.fit_on_texts(english_sentences)
hin_tokenizer.fit_on_texts(hindi_sentences)

eng_vocab_size = len(eng_tokenizer.word_index) + 1
hin_vocab_size = len(hin_tokenizer.word_index) + 1

eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
hin_sequences = hin_tokenizer.texts_to_sequences(hindi_sentences)

max_eng_length = max(len(seq) for seq in eng_sequences)
max_hin_length = max(len(seq) for seq in hin_sequences)

eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_length, padding="post")
hin_padded = pad_sequences(hin_sequences, maxlen=max_hin_length, padding="post")

hin_padded_cat = np.array([to_categorical(seq, num_classes=hin_vocab_size) for seq in hin_padded])

In [4]:
latent_dim = 256

In [5]:
def build_encoder():
    encoder_inputs = Input(shape=(max_eng_length,))
    enc_emb = Embedding(eng_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
    encoder_lstm = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
    encoder_states = [state_h, state_c]
    
    return Model(encoder_inputs, encoder_states), encoder_inputs, encoder_states

In [6]:
def build_decoder(encoder_states):
    decoder_inputs = Input(shape=(max_hin_length,))
    
    # Embedding layer for decoder
    dec_emb_layer = Embedding(hin_vocab_size, latent_dim, mask_zero=True)
    dec_emb = dec_emb_layer(decoder_inputs)

    # LSTM layer
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, state_h, state_c = decoder_lstm(dec_emb, initial_state=encoder_states)

    # Dense output layer
    decoder_dense = Dense(hin_vocab_size, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)

    # Training model (outputs full sequence)
    decoder_model_train = Model([decoder_inputs] + encoder_states, decoder_outputs)

    # Inference model for decoding one step at a time
    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_hidden_state_input = Input(shape=(max_eng_length, latent_dim))  # Encoder output

    dec_emb_inf = dec_emb_layer(decoder_inputs)
    decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
        dec_emb_inf, initial_state=[decoder_state_input_h, decoder_state_input_c]
    )
    decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

    decoder_model_inf = Model(
        [decoder_inputs] + [decoder_state_input_h, decoder_state_input_c, decoder_hidden_state_input],
        [decoder_outputs_inf, state_h_inf, state_c_inf]
    )

    return decoder_model_train, decoder_model_inf, decoder_inputs


In [7]:
encoder_model, encoder_inputs, encoder_states = build_encoder()
decoder_model_train, decoder_model_inf, decoder_inputs = build_decoder(encoder_states)
model = Model([encoder_inputs, decoder_inputs], decoder_model_train([decoder_inputs] + encoder_states))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit([eng_padded, hin_padded], hin_padded_cat, batch_size=64, epochs=40, validation_split=0.1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x271db1eeb50>

In [8]:
model.save("eng_hin_translation.h5")
import pickle
with open("eng_tokenizer.pkl", "wb") as f:
    pickle.dump(eng_tokenizer, f)
with open("hin_tokenizer.pkl", "wb") as f:
    pickle.dump(hin_tokenizer, f)

  saving_api.save_model(


In [13]:
loss, accuracy = model.evaluate([eng_padded, hin_padded], hin_padded_cat)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Test Loss: 0.26686930656433105, Test Accuracy: 0.9733326435089111
