In [59]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, Dense,Dropout
from tensorflow.keras.models import Model

In [60]:
import tensorflow as tf

# Load data (with Devanagari → Latin → freq columns)
def load_data(file_path, num_samples=None):
    input_texts = []
    target_texts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if num_samples and i >= num_samples:
                break
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                latin_input = parts[1]  # Latin script is column 1
                devanagari_output = '\t' + parts[0] + '\n'  # Column 0; add start/end tokens
                input_texts.append(latin_input)
                target_texts.append(devanagari_output)
    return input_texts, target_texts

# Tokenize character-wise
def tokenize_char(sequences):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(sequences)
    sequences_tensor = tokenizer.texts_to_sequences(sequences)
    return sequences_tensor, tokenizer

# Pad sequences
def pad_sequences(sequences, maxlen=None):
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen, padding='post')

# Full pipeline
def preprocess_dataset(file_path, num_samples=None):
    input_texts, target_texts = load_data(file_path, num_samples)

    input_tensor_raw, inp_tokenizer = tokenize_char(input_texts)
    target_tensor_raw, targ_tokenizer = tokenize_char(target_texts)

    max_input_len = max(len(seq) for seq in input_tensor_raw)
    max_target_len = max(len(seq) for seq in target_tensor_raw)

    input_tensor = pad_sequences(input_tensor_raw, max_input_len)
    target_tensor = pad_sequences(target_tensor_raw, max_target_len)

    return (input_tensor, target_tensor,
            inp_tokenizer, targ_tokenizer,
            max_input_len, max_target_len)


In [61]:
file_path = '/content/hi.translit.sampled.train.tsv'

input_tensor, target_tensor, inp_tokenizer, targ_tokenizer, max_in_len, max_tar_len = preprocess_dataset(
    file_path, num_samples=10000)

print(f"Input shape: {input_tensor.shape}")
print(f"Target shape: {target_tensor.shape}")
print(f"Input vocab size: {len(inp_tokenizer.word_index) + 1}")
print(f"Target vocab size: {len(targ_tokenizer.word_index) + 1}")

# View a sample
# idx = 0
# decoded_input = ''.join(inp_tokenizer.index_word.get(i, '') for i in input_tensor[idx] if i != 0)
# decoded_target = ''.join(targ_tokenizer.index_word.get(i, '') for i in target_tensor[idx] if i != 0)

# print(f"Latin Input     : {decoded_input}")
# print(f"Devanagari Target: {decoded_target}")

for idx in range(10):  # Change range to view more or fewer samples
    decoded_input = ''.join(inp_tokenizer.index_word.get(i, '') for i in input_tensor[idx] if i != 0)
    decoded_target = ''.join(targ_tokenizer.index_word.get(i, '') for i in target_tensor[idx] if i != 0)

    print(f"Latin Input     : {decoded_input}")
    print(f"Devanagari Target: {decoded_target}")
    print('-' * 40)


Input shape: (10000, 18)
Target shape: (10000, 20)
Input vocab size: 27
Target vocab size: 64
Latin Input     : an
Devanagari Target: 	अं

----------------------------------------
Latin Input     : ankganit
Devanagari Target: 	अंकगणित

----------------------------------------
Latin Input     : uncle
Devanagari Target: 	अंकल

----------------------------------------
Latin Input     : ankur
Devanagari Target: 	अंकुर

----------------------------------------
Latin Input     : ankuran
Devanagari Target: 	अंकुरण

----------------------------------------
Latin Input     : ankurit
Devanagari Target: 	अंकुरित

----------------------------------------
Latin Input     : aankush
Devanagari Target: 	अंकुश

----------------------------------------
Latin Input     : ankush
Devanagari Target: 	अंकुश

----------------------------------------
Latin Input     : ang
Devanagari Target: 	अंग

----------------------------------------
Latin Input     : anga
Devanagari Target: 	अंग

--------------------------

In [62]:
# File paths for train, validation, and test datasets
train_file = "/content/hi.translit.sampled.train.tsv"
val_file = "/content/hi.translit.sampled.dev.tsv"
test_file = "/content/hi.translit.sampled.test.tsv"

# Load and preprocess training, validation, and test data
train_input, train_target, inp_tokenizer, targ_tokenizer, _, _ = preprocess_dataset(train_file, num_samples=10000)
val_input, val_target, _, _, _, _ = preprocess_dataset(val_file, num_samples=1000)
test_input, test_target, _, _, _, _ = preprocess_dataset(test_file, num_samples=1000)


In [63]:
def create_one_hot_data(input_tensor, target_tensor, num_encoder_tokens, num_decoder_tokens):
    num_samples = len(input_tensor)
    max_input_len = input_tensor.shape[1]
    max_target_len = target_tensor.shape[1]

    encoder_input_data = np.zeros((num_samples, max_input_len, num_encoder_tokens), dtype="float32")
    decoder_input_data = np.zeros((num_samples, max_target_len, num_decoder_tokens), dtype="float32")
    decoder_target_data = np.zeros((num_samples, max_target_len, num_decoder_tokens), dtype="float32")

    for i in range(num_samples):
        for t in range(max_input_len):
            encoder_input_data[i, t, input_tensor[i, t]] = 1.0
        for t in range(max_target_len):
            decoder_input_data[i, t, target_tensor[i, t]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_tensor[i, t]] = 1.0

    return encoder_input_data, decoder_input_data, decoder_target_data


In [64]:
# Create one-hot encoded data for training, validation, and test sets
train_encoder, train_decoder_in, train_decoder_out = create_one_hot_data(
    train_input, train_target, len(inp_tokenizer.word_index)+1, len(targ_tokenizer.word_index)+1
)

val_encoder, val_decoder_in, val_decoder_out = create_one_hot_data(
    val_input, val_target, len(inp_tokenizer.word_index)+1, len(targ_tokenizer.word_index)+1
)

test_encoder_input, test_decoder_in, test_decoder_out = create_one_hot_data(
    test_input, test_target, len(inp_tokenizer.word_index)+1, len(targ_tokenizer.word_index)+1
)



In [65]:
# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True,dropout=0.3)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.3)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [66]:
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [train_encoder, train_decoder_in], train_decoder_out,
    batch_size=64,
    epochs=100,
    validation_split=0.2,
)
# Save model
model.save("s2s_model.keras")

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.6186 - loss: 1.8596 - val_accuracy: 0.6626 - val_loss: 1.3850
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6773 - loss: 1.2859 - val_accuracy: 0.6940 - val_loss: 1.2804
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6917 - loss: 1.1960 - val_accuracy: 0.6946 - val_loss: 1.2615
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6984 - loss: 1.1567 - val_accuracy: 0.7003 - val_loss: 1.2911
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.7067 - loss: 1.1169 - val_accuracy: 0.7081 - val_loss: 1.2756
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7125 - loss: 1.0803 - val_accuracy: 0.7058 - val_loss: 1.2511
Epoch 7/100
[1m125

In [68]:
model = keras.models.load_model("s2s_model.keras")

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in inp_tokenizer.word_index.items())
reverse_target_char_index = dict((i, char) for char, i in targ_tokenizer.word_index.items())

target_token_index = targ_tokenizer.word_index
max_decoder_seq_length = max_tar_len

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        if sampled_token_index == 0:
            sampled_char = ''  # Or any placeholder you prefer, like ' '
        else:
            sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [69]:
for seq_index in range(5):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = train_encoder[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    input_text = ''.join([reverse_input_char_index.get(idx, '') for idx in train_input[seq_index] if idx != 0])
    print("Input sentence:", input_text)
    print("Decoded sentence:", decoded_sentence)

-
Input sentence: an
Decoded sentence: एन

-
Input sentence: ankganit
Decoded sentence: अंगग्ित

-
Input sentence: uncle
Decoded sentence: एर्क

-
Input sentence: ankur
Decoded sentence: अंकर

-
Input sentence: ankuran
Decoded sentence: अनुदानों



In [71]:
for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = test_encoder_input[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    input_text = ''.join([reverse_input_char_index.get(idx, '') for idx in test_input[seq_index] if idx != 0])
    print("Input sentence:", input_text)
    print("Decoded sentence:", decoded_sentence)

-
Input sentence: ank
Decoded sentence: अन्

-
Input sentence: anka
Decoded sentence: अंका

-
Input sentence: ankir
Decoded sentence: अनिर्र

-
Input sentence: anaksn
Decoded sentence: अन्नान

-
Input sentence: anktsn
Decoded sentence: अंक्तें

-
Input sentence: anksn
Decoded sentence: अंक्

-
Input sentence: anpksh
Decoded sentence: अप्काष

-
Input sentence: anksh
Decoded sentence: आकक्ष

-
Input sentence: anpaahak
Decoded sentence: अन्पाहही

-
Input sentence: anpahak
Decoded sentence: अन्याकक

