In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dakshina/dakshina_dataset_v1.0/README.md
/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.test.tsv
/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.train.tsv
/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.dev.tsv
/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/native_script_wikipedia/mr.wiki-filt.train.text.sorted.tsv/wiki-filt.train.text.sorted.tsv
/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/native_script_wikipedia/mr.wiki-full.nonblock.sections.list.txt/wiki-full.nonblock.sections.list.txt
/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/native_script_wikipedia/mr.wiki-filt.valid.text.shuf.txt/wiki-filt.valid.text.shuf.txt
/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/native_script_wikipedia/mr.wiki-full.urls.tsv/wiki-full.urls.tsv
/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/native_script_wikipedia/mr.wiki-full.omit_pages.txt/wiki-full.omit_pages.txt
/kaggle/input/dakshina/dak

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, LSTM, SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import os
import unicodedata
import string
import re

In [3]:
def load_data(path):
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [line.split('\t') for line in lines if '\t' in line]
    source_texts = [pair[1] for pair in pairs]
    target_texts = [pair[0] for pair in pairs]
    return source_texts, target_texts

In [4]:
def preprocess_data(source_texts, target_texts):
    target_texts = ['\t' + text + '\n' for text in target_texts]

    source_vocab = sorted(set(''.join(source_texts)))
    target_vocab = sorted(set(''.join(target_texts)))

    source_token_index = {char: i+1 for i, char in enumerate(source_vocab)}
    target_token_index = {char: i+1 for i, char in enumerate(target_vocab)}

    max_encoder_seq_length = max(len(txt) for txt in source_texts)
    max_decoder_seq_length = max(len(txt) for txt in target_texts)

    encoder_input_data = [[source_token_index.get(c, 0) for c in text] for text in source_texts]
    decoder_input_data = [[target_token_index.get(c, 0) for c in text] for text in target_texts]
    decoder_target_data = [seq[1:] + [0] for seq in decoder_input_data]

    encoder_input_data = pad_sequences(encoder_input_data, maxlen=max_encoder_seq_length, padding='post')
    decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_decoder_seq_length, padding='post')
    decoder_target_data = pad_sequences(decoder_target_data, maxlen=max_decoder_seq_length, padding='post')

    return (np.array(encoder_input_data), np.array(decoder_input_data), np.array(decoder_target_data),
            source_token_index, target_token_index, max_encoder_seq_length, max_decoder_seq_length,
            len(source_vocab)+1, len(target_vocab)+1)

In [5]:
def build_seq2seq_model(cell_type='LSTM', embedding_dim=64, hidden_dim=128, 
                        num_layers=1, input_vocab_size=100, target_vocab_size=100,
                        max_encoder_len=20, max_decoder_len=20):

    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    enc_emb = Embedding(input_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)

    encoder_output = enc_emb
    encoder_states = []
    for _ in range(num_layers):
        if cell_type == 'LSTM':
            encoder_output, state_h, state_c = LSTM(hidden_dim, return_state=True, return_sequences=False)(encoder_output)
            encoder_states = [state_h, state_c]
        elif cell_type == 'GRU':
            encoder_output, state_h = GRU(hidden_dim, return_state=True, return_sequences=False)(encoder_output)
            encoder_states = [state_h]
        else:
            encoder_output, state_h = SimpleRNN(hidden_dim, return_state=True, return_sequences=False)(encoder_output)
            encoder_states = [state_h]

    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    dec_emb = Embedding(target_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
    
    decoder_output = dec_emb
    for _ in range(num_layers):
        if cell_type == 'LSTM':
            decoder_output, _, _ = LSTM(hidden_dim, return_sequences=True, return_state=True)(decoder_output, initial_state=encoder_states)
        elif cell_type == 'GRU':
            decoder_output, _ = GRU(hidden_dim, return_sequences=True, return_state=True)(decoder_output, initial_state=encoder_states)
        else:
            decoder_output, _ = SimpleRNN(hidden_dim, return_sequences=True, return_state=True)(decoder_output, initial_state=encoder_states)

    decoder_dense = Dense(target_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_output)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

In [6]:
def evaluate_model(model, encoder_input_data, decoder_input_data, decoder_target_data):
    predictions = model.predict([encoder_input_data, decoder_input_data])
    pred_seq = np.argmax(predictions, axis=-1)
    actual_seq = decoder_target_data
    total = 0
    correct = 0
    for i in range(len(pred_seq)):
        for j in range(len(pred_seq[i])):
            if actual_seq[i][j] == 0:
                continue
            total += 1
            if pred_seq[i][j] == actual_seq[i][j]:
                correct += 1
    acc = correct / total
    print(f"Character-level Accuracy: {acc*100:.2f}%")
    return acc

In [7]:
def display_samples(model, encoder_input_data, decoder_input_data, source_index, target_index):
    reverse_source_index = {v: k for k, v in source_index.items()}
    reverse_target_index = {v: k for k, v in target_index.items()}
    predictions = model.predict([encoder_input_data, decoder_input_data])
    pred_seq = np.argmax(predictions, axis=-1)

    for i in range(5):
        source = ''.join([reverse_source_index.get(c, '') for c in encoder_input_data[i] if c > 0])
        pred = ''.join([reverse_target_index.get(c, '') for c in pred_seq[i] if c > 0])
        print(f"Input (Latin): {source}\nPredicted (Devanagari): {pred}\n")


In [10]:
def complete_exec():
    train_path = '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv'
    dev_path = '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv'
    test_path = '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv'

    source_train, target_train = load_data(train_path)
    source_dev, target_dev = load_data(dev_path)
    source_test, target_test = load_data(test_path)

    source_texts = source_train + source_dev
    target_texts = target_train + target_dev

    data = preprocess_data(source_texts, target_texts)
    encoder_input, decoder_input, decoder_target, src_idx, tgt_idx, max_len_src, max_len_tgt, vocab_src, vocab_tgt = data

    test_data = preprocess_data(source_test, target_test)
    test_encoder_input, test_decoder_input, test_decoder_target, *_ = test_data
    results = {}

    for cell_type in ['SimpleRNN', 'LSTM']:
        print(f"\nTraining model with cell type: {cell_type}")
        model = build_seq2seq_model(cell_type=cell_type, embedding_dim=64, hidden_dim=128, num_layers=1,
                                    input_vocab_size=vocab_src, target_vocab_size=vocab_tgt,
                                    max_encoder_len=max_len_src, max_decoder_len=max_len_tgt)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
        model.fit([encoder_input, decoder_input], decoder_target[..., np.newaxis], batch_size=64, epochs=10, verbose=1)

        acc = evaluate_model(model, test_encoder_input, test_decoder_input, test_decoder_target)
        results[cell_type] = acc

    best_model = max(results, key=results.get)
    print("\nModel Comparison:")
    for model_name, accuracy in results.items():
        print(f"{model_name}: {accuracy*100:.2f}%")

    print(f"\nBest Model: {best_model}")

    model = build_seq2seq_model(cell_type=best_model, embedding_dim=64, hidden_dim=128, num_layers=1,
                                input_vocab_size=vocab_src, target_vocab_size=vocab_tgt,
                                max_encoder_len=max_len_src, max_decoder_len=max_len_tgt)    
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    print(model.summary())
    model.fit([encoder_input, decoder_input], decoder_target[..., np.newaxis], batch_size=64, epochs=10, verbose=1)

    evaluate_model(model, test_encoder_input, test_decoder_input, test_decoder_target)
    display_samples(model, test_encoder_input, test_decoder_input, src_idx, tgt_idx)

complete_exec()


Training model with cell type: SimpleRNN
Epoch 1/10


W0000 00:00:1745078584.410762      70 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m746/759[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 2.5209

W0000 00:00:1745078589.070390      71 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - loss: 2.5138
Epoch 2/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 1.5895
Epoch 3/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 1.2740
Epoch 4/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 1.0977
Epoch 5/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.9957
Epoch 6/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.9354
Epoch 7/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.8823
Epoch 8/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.8387
Epoch 9/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.8117
Epoch 10/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.7795

None
Epoch 1/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - loss: 2.6040
Epoch 2/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 1.0887
Epoch 3/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.7232
Epoch 4/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.5868
Epoch 5/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.5016
Epoch 6/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.4518
Epoch 7/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.4103
Epoch 8/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.3760
Epoch 9/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.3503
Epoch 10/10
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step

In [24]:
    train_path = '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv'
    dev_path = '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv'
    test_path = '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv'

    source_train, target_train = load_data(train_path)
    source_dev, target_dev = load_data(dev_path)
    source_test, target_test = load_data(test_path)

    source_texts = source_train + source_dev
    target_texts = target_train + target_dev

    data = preprocess_data(source_texts, target_texts)
    encoder_input, decoder_input, decoder_target, src_idx, tgt_idx, max_len_src, max_len_tgt, vocab_src, vocab_tgt = data
    

In [27]:
vocab_src

27

In [23]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_data(path):
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [line.split('\t') for line in lines if '\t' in line]
    source_texts = [pair[1] for pair in pairs]
    target_texts = [pair[0] for pair in pairs]
    return source_texts, target_texts

def prepare_data(source_texts, target_texts, max_len=20):
    source_vocab = sorted(set(''.join(source_texts)))
    target_vocab = sorted(set(''.join(target_texts)))

    source_token_index = {char: i+1 for i, char in enumerate(source_vocab)}
    target_token_index = {char: i+1 for i, char in enumerate(target_vocab)}

    encoder_input_data = [[source_token_index.get(c, 0) for c in text] for text in source_texts]
    encoder_input_data = pad_sequences(encoder_input_data, maxlen=max_len, padding='post')

    # Output: first character only
    target_output_data = [target_token_index.get(text[0], 0) if text else 0 for text in target_texts]

    return (
        np.array(encoder_input_data),
        np.array(target_output_data),
        source_token_index,
        target_token_index,
        len(source_token_index) + 1,
        len(target_token_index) + 1
    )

# 3. Load and preprocess
source_texts, target_texts = load_data('/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv')
X, y, source_token_index, target_token_index, input_vocab_size, output_vocab_size = prepare_data(source_texts, target_texts)

# 4. Build and train GRU model
model = Sequential([
    Embedding(input_vocab_size, 64, input_length=X.shape[1], mask_zero=True),
    GRU(128),
    Dense(output_vocab_size, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, batch_size=64, epochs=5)

# 5. Predict and display results
def predict_first_character(model, inputs, source_token_index, target_token_index):
    reverse_target_index = {v: k for k, v in target_token_index.items()}
    predictions = model.predict(inputs)
    predicted_indices = np.argmax(predictions, axis=1)

    for i in range(5):
        input_seq = inputs[i]
        input_text = ''.join([k for idx in input_seq if idx > 0 for k, v in source_token_index.items() if v == idx])
        predicted_char = reverse_target_index.get(predicted_indices[i], '?')
        print(f"Input : {input_text[0]}  output: {predicted_char}")

predict_first_character(model, X, source_token_index, target_token_index)


Epoch 1/5
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.3134 - loss: 2.5929
Epoch 2/5
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9130 - loss: 0.2880
Epoch 3/5
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9292 - loss: 0.2196
Epoch 4/5
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9343 - loss: 0.1935
Epoch 5/5
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9371 - loss: 0.1793
[1m1382/1382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
Input : a  output: ए
Input : a  output: अ
Input : u  output: य
Input : a  output: अ
Input : a  output: अ


In [28]:
def total_computations(N, T, m, k, V):
    computations = (2 * N * T * m) + (2 * T * 4 * k * (k + m)) + (T * k * V)
    return computations

N = 32  
T = 100  
m = 64  
k = 128  
V = 27 

total_ops = total_computations(N, T, m, k, V)
print(f"Total computations done by the network: {total_ops}")


Total computations done by the network: 20416000
