In [95]:
!nvidia-smi

Tue Jun 11 10:15:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.146.02             Driver Version: 535.146.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 ...    Off | 00000000:03:00.0 Off |                  N/A |
| 27%   32C    P8               1W / 250W |      1MiB /  8192MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 ...    Off | 00000000:0E:00.0 Off |  

In [96]:
import os
# Assign GPU number
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [97]:
import json
import os
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
import numpy as np

In [98]:
def load_json(file_path):
    with open(file_path, 'r') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            data = []
            f.seek(0)
            for line in f:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON in line: {line}\nError: {e}")
    return data

In [99]:
train_file_path = 'hin_train.json'

In [100]:
if os.path.exists(train_file_path):
    
    train_data = load_json(train_file_path)

    def print_summary(data, name):
        print(f"{name} Data Loaded:")
        print(f"Number of records: {len(data)}")
        if isinstance(data, list) and len(data) > 0:
            print("First record:", data[0])
        elif isinstance(data, dict):
            print("Keys:", list(data.keys()))

    print_summary(train_data, "Train")

Train Data Loaded:
Number of records: 1299155
First record: {'unique_identifier': 'hin1', 'native word': 'जन्मदिवस', 'english word': 'janamdivas', 'source': 'Dakshina', 'score': None}


In [101]:
keys_to_keep = ['native word', 'english word']

def filter_keys(data, keys_to_keep):
    filtered_data = [{key: item[key] for key in keys_to_keep if key in item} for item in data]
    return filtered_data

# def print_sample(data, name, sample_size=5):
#     print(f"Filtered {name} Data Sample:")
#     for item in data[:sample_size]:
#         print(item)

train_data_filtered = filter_keys(train_data, keys_to_keep)
train = list(train_data_filtered)

# print_sample(train, "Train")

In [102]:
train = pd.DataFrame(train)

train.rename(columns={'native word': 'hindi', 'english word': 'english'}, inplace=True)
train.head()

Unnamed: 0,hindi,english
0,जन्मदिवस,janamdivas
1,रक्खा,rakha
2,मिलीजुली,milijuli
3,जांचों,jaanchon
4,चमकता,chamkata


In [103]:
train.shape

(1299155, 2)

In [104]:
train['hindi'] = train['hindi'].astype(str)
train['english'] = train['english'].astype(str)

train['hindi'] = train['hindi'].str.strip()
train['english'] = train['english'].str.strip().str.lower()

train = train[~train['english'].str.contains('\.')]
train = train[~train['english'].str.contains(',')]

In [105]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

train = train[train['english'].apply(isEnglish)]

sos = '^'
eos = '$'
train['english'] = sos + train['english'].astype(str) + eos
train['hindi'] = sos + train['hindi'].astype(str) + eos

print(train.head())

        hindi       english
0  ^जन्मदिवस$  ^janamdivas$
1     ^रक्खा$       ^rakha$
2  ^मिलीजुली$    ^milijuli$
3    ^जांचों$    ^jaanchon$
4     ^चमकता$    ^chamkata$


In [106]:
english_text = train['english']
hindi_text = train['hindi']

print('english text shape :', english_text.shape)
print(english_text[0])
print('hindi text shape :', hindi_text.shape)
print(hindi_text[0])

english text shape : (1299155,)
^janamdivas$
hindi text shape : (1299155,)
^जन्मदिवस$


In [107]:
train_english_text, val_english_text, train_hindi_text, val_hindi_text = train_test_split(english_text, hindi_text, test_size=0.2, random_state=42)

In [108]:
# Calculate max input and target lengths
max_input_len = max([len(text) for text in english_text])
print(max_input_len)
max_target_len = max([len(text) for text in hindi_text])
print(max_target_len)

34
30


In [109]:
input_chars = sorted(list(set(''.join(english_text))))
print(input_chars)
target_chars = sorted(list(set(''.join(hindi_text))))
print(target_chars)

num_encoder_tokens = len(input_chars)
print(num_encoder_tokens)
num_decoder_tokens = len(target_chars)
print(num_decoder_tokens)

['$', '^', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['$', '^', 'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऍ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह', '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॅ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ', '्', 'ॐ', 'ॠ', '॥']
28
71


In [110]:
input_char_index = dict((char, i) for i, char in enumerate(input_chars))
print(input_char_index)
target_char_index = dict((char, i) for i, char in enumerate(target_chars))
print(target_char_index)

{'$': 0, '^': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27}
{'$': 0, '^': 1, 'ँ': 2, 'ं': 3, 'ः': 4, 'अ': 5, 'आ': 6, 'इ': 7, 'ई': 8, 'उ': 9, 'ऊ': 10, 'ऋ': 11, 'ऍ': 12, 'ए': 13, 'ऐ': 14, 'ऑ': 15, 'ओ': 16, 'औ': 17, 'क': 18, 'ख': 19, 'ग': 20, 'घ': 21, 'ङ': 22, 'च': 23, 'छ': 24, 'ज': 25, 'झ': 26, 'ञ': 27, 'ट': 28, 'ठ': 29, 'ड': 30, 'ढ': 31, 'ण': 32, 'त': 33, 'थ': 34, 'द': 35, 'ध': 36, 'न': 37, 'प': 38, 'फ': 39, 'ब': 40, 'भ': 41, 'म': 42, 'य': 43, 'र': 44, 'ल': 45, 'ळ': 46, 'व': 47, 'श': 48, 'ष': 49, 'स': 50, 'ह': 51, '़': 52, 'ऽ': 53, 'ा': 54, 'ि': 55, 'ी': 56, 'ु': 57, 'ू': 58, 'ृ': 59, 'ॅ': 60, 'े': 61, 'ै': 62, 'ॉ': 63, 'ॊ': 64, 'ो': 65, 'ौ': 66, '्': 67, 'ॐ': 68, 'ॠ': 69, '॥': 70}


In [111]:
encoder_input_data = np.zeros((len(train_english_text), max_input_len, num_encoder_tokens), dtype='float32')
decoder_input_data = np.zeros((len(train_hindi_text), max_target_len, num_decoder_tokens), dtype='float32')
decoder_target_data = np.zeros((len(train_hindi_text), max_target_len, num_decoder_tokens), dtype='float32')

for i, (input_text, target_text) in enumerate(zip(train_english_text, train_hindi_text)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_char_index[char]] = 1
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_char_index[char]] = 1
        if t > 0:
            decoder_target_data[i, t - 1, target_char_index[char]] = 1

In [112]:
def custom_accuracy(y_true, y_pred):
    y_true_argmax = tf.argmax(y_true, axis=-1)
    y_pred_argmax = tf.argmax(y_pred, axis=-1)
    mask = tf.cast(tf.not_equal(y_true_argmax, 0), dtype=tf.float32)
    matches = tf.cast(tf.equal(y_true_argmax, y_pred_argmax), dtype=tf.float32)
    masked_matches = matches * mask
    accuracy = tf.reduce_sum(masked_matches) / tf.reduce_sum(mask)
    return accuracy

def custom_loss(y_true, y_pred):
    mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, axis=-1), 0), dtype=tf.float32)
    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    masked_loss = loss * mask
    return tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)


In [113]:
model = load_model('D1engHin_model.h5', custom_objects={'custom_loss': custom_loss, 'custom_accuracy':custom_accuracy})










In [114]:
# Build the model
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(256, return_state=True, dropout=0.3, recurrent_dropout=0.3)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True, dropout=0.3, recurrent_dropout=0.3)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss=custom_loss, metrics=[custom_accuracy])

model.summary()









Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, None, 28)]   0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, None, 71)]   0                                            
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, 256), (None, 291840      input_9[0][0]                    
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, None, 256),  335872      input_10[0][0]                   
                                                                 lstm_4[0][1]               

In [115]:
# history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
#           batch_size=128,
#           epochs=50,
#           validation_split=0.2)

In [116]:
model.load_weights('D1engHin_model.h5')

In [117]:
# import tensorflow as tf

# #load h5 module
# model=tf.keras.models.load_model('D1engHin_model.h5', custom_objects={'custom_loss': custom_loss, 'custom_accuracy':custom_accuracy})
# tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)

# #convert
# tflite_model = tflite_converter.convert()
# open("D1engHin_model.tflite", "wb").write(tflite_model)

In [118]:
val_encoder_input_data = np.zeros((len(val_english_text), max_input_len, num_encoder_tokens), dtype='float32')
val_decoder_input_data = np.zeros((len(val_hindi_text), max_target_len, num_decoder_tokens), dtype='float32')
val_decoder_target_data = np.zeros((len(val_hindi_text), max_target_len, num_decoder_tokens), dtype='float32')

for i, (input_text, target_text) in enumerate(zip(val_english_text, val_hindi_text)):
    for t, char in enumerate(input_text):
        val_encoder_input_data[i, t, input_char_index[char]] = 1
    for t, char in enumerate(target_text):
        val_decoder_input_data[i, t, target_char_index[char]] = 1
        if t > 0:
            val_decoder_target_data[i, t - 1, target_char_index[char]] = 1

In [119]:
scores = model.evaluate([val_encoder_input_data, val_decoder_input_data], val_decoder_target_data, verbose=1)
print('Val loss:', scores[0])
print('Val accuracy:', scores[1])

Val loss: 0.26612356305122375
Val accuracy: 0.9022973775863647


In [120]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [121]:
import numpy as np
import time

def beam_search_decode(input_seq, beam_width=2):
    states_value = encoder_model.predict(input_seq)
    
    start_token_index = target_char_index['^']
    end_token_index = target_char_index['$']
    sequences = [[list(), 0.0, states_value]]  # list of [sequence, score, states]
    
    for _ in range(max_target_len):
        all_candidates = []
        for seq, score, states in sequences:
            target_seq = np.zeros((1, 1, num_decoder_tokens))
            if len(seq) > 0:
                last_token_index = seq[-1]
            else:
                last_token_index = start_token_index
            target_seq[0, 0, last_token_index] = 1.0
            
            output_tokens, h, c = decoder_model.predict([target_seq] + states)
            states_value = [h, c]
            
            top_k_indices = np.argsort(output_tokens[0, -1, :])[-beam_width:]
            for token_index in top_k_indices:
                candidate = [seq + [token_index], score - np.log(output_tokens[0, -1, token_index] + 1e-10), states_value]
                all_candidates.append(candidate)
        
        ordered = sorted(all_candidates, key=lambda tup: tup[1])
        sequences = ordered[:beam_width]
    
    # Convert all top sequences to decoded sentences and calculate their probabilities
    decoded_sentences = []
    for seq, score, _ in sequences:
        decoded_sentence = ''.join([target_chars[idx] for idx in seq])
        if decoded_sentence.endswith('$'):
            decoded_sentence = decoded_sentence.split('$', 1)[0]  # Stop at first end token
        probability = np.exp(-score)  # Convert negative log probability back to probability
        
        decoded_sentences.append((decoded_sentence, f'{probability:.4f}'))
    
    return decoded_sentences


In [133]:
def predict_output(input_text, beam_width=2):
    total_latency = 0
    word_count = 0
    results = []
    final_sentence = []
    words = input_text.split(' ')
    for word in words:
        # Add start and end symbols to the input word
        word = '^' + word.lower() + '$'
        
        input_seq = np.zeros((1, max_input_len, len(input_chars)), dtype='float32')
        for t, char in enumerate(word):
            if char in input_char_index:
                input_seq[0, t, input_char_index[char]] = 1
        
        start_time = time.time()  # Start measuring time for the current word
        decoded_sentences = beam_search_decode(input_seq, beam_width)
        end_time = time.time()  # End measuring time for the current word
        latency = end_time - start_time

        results.append(decoded_sentences)  # Collect all beam search results
        total_latency += latency
        word_count += 1

        # Select the highest probability transliteration
        best_transliteration = max(decoded_sentences, key=lambda x: float(x[1]))
        final_sentence.append(best_transliteration[0])
    
    # Join the final sentence words
    final_sentence_str = ' '.join(final_sentence)
    
    average_latency = total_latency / word_count if word_count > 0 else 0
    print(f'Average latency: {average_latency:.4f} seconds')
    
    return final_sentence_str, results


In [134]:
text = 'Shubhkamna'
print('English text:',text)
print('Transliterated Hindi text:',predict_output(text))

English text: Shubhkamna
Average latency: 1.4573 seconds
Transliterated Hindi text: ('शुभकामना', [[('शुभकामना', '0.2648'), ('शुभकम्न', '0.1149')]])


In [124]:
text = 'Rishabh'
print('English text:',text)
print('Transliterated Hindi text:',predict_output(text))

English text: Rishabh
Average latency: 1.3863 seconds
Transliterated Hindi text: ('ऋषभ', [[('ऋषभ', '0.4465'), ('रिशभ', '0.0413')]])


In [125]:
text = 'Akshat'
print('English text:',text)
print('Transliterated Hindi text:',predict_output(text))

English text: Akshat
Average latency: 1.3936 seconds
Transliterated Hindi text: ('अक्षत', [[('अक्षत', '0.6551'), ('अक्षत', '0.0172')]])


In [126]:
text = 'Aaj Hum sab ghumne jaayenge'
print('English text:',text)
print('Transliterated Hindi text:',predict_output(text))

English text: Aaj Hum sab ghumne jaayenge
Average latency: 1.3996 seconds
Transliterated Hindi text: ('आज हम सब घुमने जायेंगे', [[('आज', '0.6285'), ('आज', '0.0022')], [('हम', '0.0018'), ('हुम', '0.0016')], [('सब', '0.0055'), ('सब', '0.0050')], [('घुमने', '0.3715'), ('घुमने', '0.2759')], [('जायेंगे', '0.2963'), ('जाएँगे', '0.2188')]])


In [127]:
text = 'Abhinav'
print('English text:',text)
print('Transliterated Hindi text:',predict_output(text))

English text: Abhinav
Average latency: 1.8303 seconds
Transliterated Hindi text: ('अभिनाव', [[('अभिनाव', '0.3359'), ('अभिनव', '0.3025')]])


In [128]:
text = 'Hindustan'
print('English text:',text)
print('Transliterated Hindi text:',predict_output(text))

English text: Hindustan
Average latency: 1.4069 seconds
Transliterated Hindi text: ('हिंदुस्तान', [[('हिंदुस्तान', '0.2825'), ('हिन्दुस्तान', '0.2779')]])


In [129]:
text = 'Aman'
print('English text:',text)
print('Transliterated Hindi text:',predict_output(text))

English text: Aman
Average latency: 1.3987 seconds
Transliterated Hindi text: ('अमान', [[('अमान', '0.0108'), ('अमन', '0.0041')]])
