In [1]:
%cd /content/drive/MyDrive/Colab/indicate/

/content/drive/MyDrive/Colab/indicate


In [2]:
import os
import json

import pandas as pd
import tensorflow as tf

In [3]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    ##-------- LSTM layer in Encoder ------- ##
    self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')



  def call(self, x, hidden, training=None):
    x = self.embedding(x)
    output, h, c = self.lstm_layer(x, initial_state = hidden)
    return output, h, c

  def initialize_hidden_state(self):
    return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))]

In [4]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz,
                 max_length_input, max_length_output, attention_type='luong'):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention_type = attention_type
        self.max_length_input = max_length_input
        self.max_length_output = max_length_output

        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # Final Dense layer on which softmax will be applied
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)
        self.decoder_rnn = tf.keras.layers.RNN(self.decoder_rnn_cell, return_sequences=True, return_state=True)

        # Create attention mechanism
        # For Luong attention: project the query to match the encoder output's dimension (dec_units)
        if self.attention_type == 'luong':
            self.query_layer = tf.keras.layers.Dense(dec_units)

        self.attention_mechanism = self.build_attention_mechanism(None)

    def build_attention_mechanism(self, memory):
        if self.attention_type == 'bahdanau':
            return tf.keras.layers.AdditiveAttention()
        else:
            return tf.keras.layers.Attention()

    def call(self, inputs, initial_state, encoder_outputs):
        # Get the embeddings of the inputs
        x = self.embedding(inputs)

        if self.attention_type == 'luong':
          query = self.query_layer(x)
        else:
          query = x

        # Feature dimensions to the attention layer
        attention_output, attention_weights = self.attention_mechanism([query, encoder_outputs], return_attention_scores=True)

        # Concatenate the attention output with the LSTM Cell output
        lstm_input = tf.concat([attention_output, x], axis=-1)

        # Process through the LSTM cell
        outputs, state_h, state_c = self.decoder_rnn(lstm_input, initial_state)

        # Pass through the final dense layer
        outputs = self.fc(outputs)

        return outputs, (state_h, state_c), attention_weights

    def build_initial_state(self, batch_sz, encoder_state):
        hidden_state, cell_state = encoder_state
        return [hidden_state, cell_state]

# Hyperparameters

In [35]:
embedding_dim = 256
units = 1024
BATCH_SIZE = 64
BUFFER_SIZE = 120000

max_length_input = 47
max_length_output = 173

# Load Vocab

In [6]:
%ls

[0m[01;34mdata[0m/       encoder.pt           Inference_checkpoints_latest.ipynb  Train_tf_latest.ipynb
data.ipynb  english_tokens.json  train.ipynb                         train_w_lstm.ipynb
decoder.pt  hindi_tokens.json    train_old_data.ipynb                train_w_transformers.ipynb


In [7]:
with open('hindi_tokens.json') as f:
    data = json.load(f)
    input_lang_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

with open('english_tokens.json') as f:
    data = json.load(f)
    target_lang_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

vocab_inp_size = len(input_lang_tokenizer.word_index)+1
vocab_tar_size = len(target_lang_tokenizer.word_index)+1

In [36]:
print(f"input vocab size - {vocab_inp_size}")
print(f"target vocab size - {vocab_tar_size}")

print(f"max length of input - {max_length_input}")
print(f"max length of target - {max_length_output}")

input vocab size - 200
target vocab size - 137
max length of input - 47
max length of target - 173


# Encoder, Decoder, optimizer

In [71]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE,
                  max_length_input, max_length_output, 'luong')
optimizer = tf.keras.optimizers.Adam()

In [72]:
encoder.embedding.variables

[]

In [73]:
decoder.embedding.variables

[]

In [74]:
# sample input
example_input_batch = tf.random.uniform(shape=[64,47])
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (64, 47, 1024)
Encoder h vecotr shape: (batch size, units) (64, 1024)
Encoder c vector shape: (batch size, units) (64, 1024)


In [75]:
encoder.embedding.variables

[<Variable path=encoder_3/embedding_6/embeddings, shape=(200, 256), dtype=float32, value=[[-0.00686686 -0.0010526   0.03595546 ...  0.04294099 -0.01272945
    0.00456731]
  [ 0.00348008 -0.03233007 -0.03155342 ...  0.02613237 -0.00144179
    0.02792479]
  [-0.00382755 -0.03357872 -0.0355198  ...  0.02060625  0.04350809
   -0.03944601]
  ...
  [ 0.02380589  0.04296638  0.01664678 ... -0.03105517  0.02877896
    0.03039264]
  [ 0.03506743 -0.00562978 -0.04282135 ... -0.01053991  0.0236184
    0.04497857]
  [ 0.0213498   0.01751263  0.02058559 ... -0.03325355  0.03340849
   -0.04312873]]>]

In [76]:
# Test decoder stack
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c])


sample_decoder_outputs = decoder(sample_x, initial_state, sample_output)
logits, new_state_h, new_state_c = sample_decoder_outputs

print("Decoder Outputs Shape: ", sample_decoder_outputs[0].shape)

Decoder Outputs Shape:  (64, 173, 137)


In [77]:
decoder.embedding.variables

[<Variable path=decoder_3/embedding_7/embeddings, shape=(137, 256), dtype=float32, value=[[-0.00802499  0.03800768 -0.04571027 ... -0.04793583  0.01777229
    0.00134502]
  [ 0.02571804 -0.0299448  -0.00607223 ... -0.02471362  0.00100492
   -0.0391422 ]
  [-0.04521506 -0.015232   -0.01456056 ...  0.02043522  0.04228887
   -0.01479077]
  ...
  [-0.00173284  0.00543803 -0.02407183 ... -0.04090531 -0.0238791
   -0.00550655]
  [ 0.0098604  -0.01415495  0.00713523 ...  0.03206344  0.04017801
   -0.04943197]
  [-0.0247746   0.01139496 -0.0258055  ... -0.02702575  0.03122982
    0.03643204]]>]

# Restore the latest checkpoint

In [78]:
checkpoint_dir = './data/training_checkpoints'
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [79]:
tf.train.latest_checkpoint(checkpoint_dir)

'./data/training_checkpoints/ckpt-5'

In [80]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f18bef3dfd0>

In [81]:
encoder.embedding.variables

[<Variable path=encoder_3/embedding_6/embeddings, shape=(200, 256), dtype=float32, value=[[ 0.26681638  0.2754885  -0.03031236 ...  0.22623836  0.27917588
    0.297866  ]
  [-0.04418039  0.02793723  0.11009125 ...  0.00912727 -0.23427705
    0.12682076]
  [ 0.0630974  -0.01531512  0.04576264 ...  0.12889968  0.08963458
    0.15908036]
  ...
  [ 0.06958606  0.07699304  0.01560043 ... -0.01147923 -0.00868758
    0.04236265]
  [ 0.05035917 -0.08790015 -0.06968533 ...  0.0825654   0.11627398
   -0.05767357]
  [-0.0197444  -0.02649814  0.03845655 ...  0.03345997  0.03370145
   -0.03743879]]>]

In [82]:
decoder.embedding.variables

[<Variable path=decoder_3/embedding_7/embeddings, shape=(137, 256), dtype=float32, value=[[-0.03412235 -0.00510821  0.04777941 ...  0.03236498  0.01061462
    0.00176196]
  [ 0.14992759 -0.07939371 -0.24454711 ... -0.29890257  0.04763342
    0.10742956]
  [ 0.0089919  -0.0043212   0.0303094  ...  0.00179068  0.00056501
   -0.04161235]
  ...
  [ 0.03066176 -0.05368591  0.0289726  ... -0.04912771  0.06342489
   -0.02452039]
  [-0.10702807  0.1304991  -0.04743847 ... -0.00367014 -0.04885023
    0.02426663]
  [ 0.07773516 -0.00087182 -0.04214528 ... -0.05326953 -0.01027754
    0.0061043 ]]>]

In [49]:
encoder.summary()

In [50]:
def sequence_to_chars(tokenizer, sequence):
    """Convert a sequence of indices back to characters."""
    word_index = tokenizer.word_index
    reverse_map = {val: key for key, val in word_index.items()}
    retext = ''

    # Convert tensor to numpy array if it's a tensor
    if tf.is_tensor(sequence):
        sequence = sequence.numpy()

    for q in sequence:
        q_int = int(q)  # Convert to integer
        if q_int != 0:  # Skip padding token
            retext += reverse_map[q_int]
    return retext

def evaluate_sentence(sentence, units, input_lang_tokenizer, target_lang_tokenizer,
                     encoder, decoder, max_length_input):
    """Evaluate/translate a single sentence."""
    # Convert input sentence to token indices
    inputs = [input_lang_tokenizer.word_index[i] for i in sentence]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_input,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    # Use the same batch size as decoder.batch_sz for consistency
    inference_batch_size = inputs.shape[0]

    inputs = tf.tile(inputs, [inference_batch_size, 1])  # Replicate input to match batch size

    # Initialize encoder state
    enc_start_state = [tf.zeros((inference_batch_size, units)),
                      tf.zeros((inference_batch_size, units))]

    # Get encoder output
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    # Initialize decoder with encoder's final state
    dec_state = [enc_h, enc_c]

    # Prepare decoder input with start token
    dec_input = tf.fill([inference_batch_size, 1],
                       target_lang_tokenizer.word_index['^'])

    outputs = []

    # Decoding loop
    for t in range(decoder.max_length_output):
        # Call decoder
        predictions, dec_state, _ = decoder(dec_input, dec_state, enc_out)

        # Get predicted token (use first batch item since all are the same)
        predicted_id = tf.argmax(predictions, axis=-1).numpy()[0, -1]

        # Append prediction
        outputs.append(predicted_id)

        # Break if end token is predicted
        if predicted_id == target_lang_tokenizer.word_index['$']:
            break

        # Update decoder input
        dec_input = tf.fill([inference_batch_size, 1], predicted_id)

    return tf.convert_to_tensor(outputs)


def translate(sentence, units, input_lang_tokenizer, target_lang_tokenizer,
             encoder, decoder, max_length_input):
    """Translate a sentence from source to target language."""
    result = evaluate_sentence(sentence,
                             units,
                             input_lang_tokenizer,
                             target_lang_tokenizer,
                             encoder,
                             decoder,
                             max_length_input)

    # Convert the output tokens back to characters
    translated_text = sequence_to_chars(target_lang_tokenizer, result)

    # Remove the end token and return
    return translated_text.strip('$')

In [51]:
%pwd

'/content/drive/MyDrive/Colab/indicate'

In [52]:
encoder.save_weights('./data/final_model/encoder.weights.h5')

In [53]:
decoder.save_weights('./data/final_model/decoder.weights.h5')

In [54]:
!du -sh ./data/final_model/

58M	./data/final_model/


# Basic Decoder

In [83]:
test_input = 'राजशेखर'
translate(test_input, units, input_lang_tokenizer, target_lang_tokenizer,
          encoder, decoder, max_length_input)

'rajshekhar'

In [84]:
test_input = 'चिंतालपति'
translate(test_input, units, input_lang_tokenizer, target_lang_tokenizer,
          encoder, decoder, max_length_input)

'chintalpati'

In [85]:
test_input = 'गौरव'
translate(test_input, units, input_lang_tokenizer, target_lang_tokenizer,
          encoder, decoder, max_length_input)

'gaurav'

In [86]:
test_input = 'सूद'
translate(test_input, units, input_lang_tokenizer, target_lang_tokenizer,
          encoder, decoder, max_length_input)

'sood'

In [88]:
# load h5 weights
encoder_h = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder_h = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE,
                  max_length_input, max_length_output, 'luong')

In [89]:
example_input_batch = tf.random.uniform(shape=[64,47])
sample_hidden = encoder_h.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder_h(example_input_batch, sample_hidden)

encoder_h.load_weights('./data/final_model/encoder.weights.h5')

In [90]:
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
initial_state = decoder_h.build_initial_state(BATCH_SIZE, [sample_h, sample_c])

sample_decoder_outputs = decoder_h(sample_x, initial_state, sample_output)
logits, new_state_h, new_state_c = sample_decoder_outputs

decoder_h.load_weights('./data/final_model/decoder.weights.h5')

In [91]:
test_inputs = ["राजशेखर","चिंतालपति","गौरव","सूद"]
for test_input in test_inputs:
  print(translate(test_input, units, input_lang_tokenizer, target_lang_tokenizer,
          encoder_h, decoder_h, max_length_input))

rajshekhar
chintalpati
gaurav
sood


In [92]:
# lets test above with df with columns as names from test inputs
test_df = pd.DataFrame(test_inputs, columns=['hindi'])
test_df

Unnamed: 0,hindi
0,राजशेखर
1,चिंतालपति
2,गौरव
3,सूद


In [93]:
# add new column english to test_df
test_df['english'] = test_df['hindi'].apply(lambda x: translate(x, units, input_lang_tokenizer, target_lang_tokenizer,
          encoder_h, decoder_h, max_length_input))

In [94]:
test_df

Unnamed: 0,hindi,english
0,राजशेखर,rajshekhar
1,चिंतालपति,chintalpati
2,गौरव,gaurav
3,सूद,sood


# Validation on Dakshina Test Dataset

In [None]:
dakshina_test_df = pd.read_csv('./data/hi.translit.sampled.test.tsv', sep='\t', names=["hindi", "english", "freq"])

In [None]:
from tqdm import tqdm
tqdm.pandas()

predicted = []

def predict(hin):
  predicted.append(translate(hin, units, input_lang_tokenizer, target_lang_tokenizer,
          encoder, decoder, max_length_input))

dakshina_test_df.progress_apply(lambda x: predict(x['hindi']), axis=1)

100%|██████████| 4502/4502 [09:04<00:00,  8.27it/s]


Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
4497,
4498,
4499,
4500,


In [None]:
dakshina_test_df.head()

Unnamed: 0,hindi,english,freq
0,अंक,ank,5
1,अंक,anka,1
2,अंकित,ankit,3
3,अंकों,anakon,1
4,अंकों,ankhon,1


In [None]:
def edit_distance(s1, s2):
  t1 = tf.sparse.from_dense(tf.expand_dims(tf.strings.bytes_split(s1), axis=0))
  t2 = tf.sparse.from_dense(tf.expand_dims(tf.strings.bytes_split(s2), axis=0))
  distance = tf.edit_distance(t1, t2)
  return distance[0].numpy()

model_edit_dist = []

def edit_dist(x):
  model_edit_dist.append(edit_distance(x['model_predicted'], x['english']))

dakshina_test_df['model_predicted'] = predicted
dakshina_test_df.progress_apply(lambda x: edit_dist(x), axis=1)
dakshina_test_df['model_edit_distance'] = model_edit_dist

model_edit_dedup = dakshina_test_df.groupby(['hindi'])['model_edit_distance'].min().reset_index()
model_acc_percent = len(model_edit_dedup[model_edit_dedup['model_edit_distance'] == 0])/len(model_edit_dedup)
print(f"Out of {len(model_edit_dedup)}, model predicted {model_acc_percent * 100}% correctly")

100%|██████████| 4502/4502 [01:07<00:00, 66.45it/s]

Out of 2500, model predicted 78.44% correctly



