In [None]:
%cd /content/drive/MyDrive/Colab/indicate/

/content/drive/MyDrive/Colab/indicate


In [None]:
import json
import os
import time

import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [None]:
class Encoder(tf.keras.Model):
  """Sequence encoder: an embedding layer followed by a single LSTM.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: width of each embedding vector.
    enc_units: size of the LSTM hidden/cell state.
    batch_sz: batch size used when building the all-zero initial state.
  """

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # The LSTM returns the per-step outputs as well as its final (h, c) state.
    self.lstm_layer = tf.keras.layers.LSTM(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, x, hidden, training=None):
    """Embed `x`, run the LSTM starting from `hidden`, return (outputs, h, c).

    `training` is accepted for API compatibility but is not forwarded to any layer.
    """
    embedded = self.embedding(x)
    seq_output, final_h, final_c = self.lstm_layer(embedded, initial_state=hidden)
    return seq_output, final_h, final_c

  def initialize_hidden_state(self):
    """Return a fresh [h, c] pair of zeros, each shaped (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return [tf.zeros(state_shape) for _ in range(2)]

In [None]:
class Decoder(tf.keras.Model):
    """Attention decoder: embedding -> attention over encoder outputs -> LSTM -> dense logits.

    Args:
        vocab_size: size of the target vocabulary (embedding rows and logit width).
        embedding_dim: width of the target embedding vectors.
        dec_units: LSTM state size; the attention query is projected to this width.
        batch_sz: batch size (stored; not otherwise used by this class).
        max_length_input: stored for callers; not used by this class.
        max_length_output: stored for callers (greedy decoding reads it as a step limit).
        attention_type: 'bahdanau' selects additive attention; any other value
            (default 'luong') selects dot-product attention.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz,
                 max_length_input, max_length_output, attention_type='luong'):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention_type = attention_type
        self.max_length_input = max_length_input
        self.max_length_output = max_length_output

        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # Final Dense layer producing the logits on which softmax will be applied
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)
        self.decoder_rnn = tf.keras.layers.RNN(self.decoder_rnn_cell, return_sequences=True, return_state=True)

        # Query projection. Both attention layers combine the query with the encoder
        # outputs feature-wise, so their last dimensions must agree (this assumes the
        # encoder outputs are dec_units wide, as in this notebook).
        # 'luong' always projects, exactly as before. Other attention types only get a
        # projection when the embedding width differs from dec_units -- the one
        # configuration that previously failed with a shape mismatch -- so every
        # configuration that already worked keeps the same variables.
        if self.attention_type == 'luong' or embedding_dim != dec_units:
            self.query_layer = tf.keras.layers.Dense(dec_units)
        else:
            self.query_layer = None

        self.attention_mechanism = self.build_attention_mechanism(None)

    def build_attention_mechanism(self, memory):
        """Return the attention layer for `self.attention_type`; `memory` is unused."""
        if self.attention_type == 'bahdanau':
            return tf.keras.layers.AdditiveAttention()
        else:
            return tf.keras.layers.Attention()

    def call(self, inputs, initial_state, encoder_outputs):
        """Decode `inputs` (token ids) while attending over `encoder_outputs`.

        Returns:
            (logits, (state_h, state_c), attention_weights)
        """
        # Get the embeddings of the inputs
        x = self.embedding(inputs)

        # Bring the query to the encoder-output width when a projection was built
        query = x if self.query_layer is None else self.query_layer(x)

        # Attend over the encoder outputs (used as both value and key)
        attention_output, attention_weights = self.attention_mechanism([query, encoder_outputs], return_attention_scores=True)

        # Concatenate the attention context with the input embeddings
        lstm_input = tf.concat([attention_output, x], axis=-1)

        # Run the whole sequence through the LSTM, starting from the given state
        outputs, state_h, state_c = self.decoder_rnn(lstm_input, initial_state)

        # Pass through the final dense layer
        outputs = self.fc(outputs)

        return outputs, (state_h, state_c), attention_weights

    def build_initial_state(self, batch_sz, encoder_state):
        """Repackage the encoder's (h, c) state as the decoder's initial state list.

        `batch_sz` is accepted for interface compatibility and is not used.
        """
        hidden_state, cell_state = encoder_state
        return [hidden_state, cell_state]

In [None]:
# Generate text from sequence (mapping back)
def sequence_to_chars(tokenizer, sequence):
    """Map a sequence of token ids back to text using `tokenizer.word_index`.

    Args:
        tokenizer: object exposing `word_index` (token -> id mapping).
        sequence: iterable of ids; elements may be Python ints or any scalar
            that supports `int()` (e.g. 0-d arrays or scalar tensors).

    Returns:
        The concatenated tokens, with id 0 (padding) skipped.

    Raises:
        KeyError: if a non-zero id is not present in the tokenizer's vocabulary.
    """
    reverse_map = {index: token for token, index in tokenizer.word_index.items()}
    # Coerce to plain ints first: array/tensor scalars are not usable as dict keys.
    token_ids = (int(q) for q in sequence)
    return ''.join(reverse_map[i] for i in token_ids if i != 0)

def evaluate_sentence(sentence, units, input_lang_tokenizer, target_lang_tokenizer,
                      encoder, decoder, max_length_input):
    """Greedily decode `sentence` and return the predicted target token ids.

    Args:
        sentence: source string.
        units: encoder state size, used to build the zero initial state.
        input_lang_tokenizer: fitted tokenizer for the source side.
        target_lang_tokenizer: fitted tokenizer for the target side; must contain
            the '^' (start) and '$' (end) tokens.
        encoder, decoder: the models to run.
        max_length_input: length the source ids are padded/truncated to.

    Returns:
        An integer tensor of shape (batch, steps) with the argmax id chosen at each
        step. Decoding stops once every row has produced the end token, or after
        `decoder.max_length_output` steps.
    """
    # Encode through the tokenizer itself so the same preprocessing as at training
    # time is applied; characters it has never seen are dropped instead of raising.
    token_ids = input_lang_tokenizer.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(token_ids,
                                                            maxlen=max_length_input,
                                                            padding='post')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]

    enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size, units))]
    enc_out, dec_h, dec_c = encoder(inputs, enc_start_state)

    start_token = target_lang_tokenizer.word_index['^']
    end_token = target_lang_tokenizer.word_index['$']

    # The decoder expects (batch, time) ids, so the start token is fed as (batch, 1).
    input_dec = tf.fill([inference_batch_size, 1], start_token)
    finished = tf.zeros([inference_batch_size], dtype=tf.bool)

    # Use a simple greedy decoding method: feed back the argmax of each step.
    outputs = []
    for _step in range(decoder.max_length_output):
        predictions, (dec_h, dec_c), _attention = decoder(input_dec, (dec_h, dec_c), enc_out)

        # Pick the most likely token at the last (only) time step
        sample_ids = tf.argmax(predictions[:, -1, :], axis=-1, output_type=tf.int32)
        outputs.append(sample_ids)

        finished = tf.logical_or(finished, tf.equal(sample_ids, end_token))
        if bool(tf.reduce_all(finished)):
            break

        # The current prediction becomes the next step's input
        input_dec = tf.expand_dims(sample_ids, 1)

    # Stack along the time axis so each row is one decoded sequence.
    return tf.stack(outputs, axis=1)

def translate(sentence, units, input_lang_tokenizer, target_lang_tokenizer, encoder, decoder, max_length_input):
    """Transliterate `sentence` and return the decoded text up to (excluding) the end token."""
    result = evaluate_sentence(sentence, units, input_lang_tokenizer, target_lang_tokenizer, encoder, decoder, max_length_input)
    # Plain Python ints are valid dictionary keys for the reverse vocabulary lookup.
    token_ids = result[0].numpy().tolist()

    # Anything produced after the end token is not part of the answer.
    end_token = target_lang_tokenizer.word_index['$']
    if end_token in token_ids:
        token_ids = token_ids[:token_ids.index(end_token)]

    return sequence_to_chars(target_lang_tokenizer, token_ids)

In [None]:
print(tf.__version__)

2.18.0


# Load Dataset

In [None]:
# Load the source/target pairs; later cells read the 'hindi' and 'english' columns.
input_target_df = pd.read_csv('./data/hindi.csv')

# Prepare Data for training

## Append start and end of sequence for target

In [None]:
# Wrap every target string in start-of-sequence / end-of-sequence markers.
sos = '^'
eos = '$'

english_text = input_target_df['english'].astype(str)
# Re-running this cell must not add a second pair of markers, so rows that are
# already wrapped are left as they are.
already_wrapped = english_text.str.startswith(sos) & english_text.str.endswith(eos)
input_target_df['english'] = english_text.where(already_wrapped, sos + english_text + eos)

In [None]:
input_target_df.head()

Unnamed: 0,hindi,english
0,प्रयात सुबोध चन्द्र दास,^late subodh chandra das$
1,स्वर्गीय पिनाकि कुंडू,^late pinaki kundu$
2,मृत नूर महम्मद बिश्वास,^late nur mohammad biswas$
3,नारायण चंद्र डॉं,^narayan chandra dawn$
4,आब्दुल मन्नान,^abdul mannan$


## Construct Vocab for Source and Target

In [None]:
# Plain Python lists of the source (Hindi) and target (English, already wrapped in ^...$) strings.
input_words =  input_target_df['hindi'].tolist()
target_words =  input_target_df['english'].tolist()

In [None]:
print(f"Total number of input words {len(input_words)}")
print(f"Total number of target words {len(target_words)}")

Total number of input words 371035
Total number of target words 371035


In [None]:
# Character-level tokenizer for the source strings; filters='' means no character is stripped.
input_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
input_lang_tokenizer.fit_on_texts(input_words)
# Convert every string to a list of character ids, then pad at the end into one 2-D array.
input_tensor = input_lang_tokenizer.texts_to_sequences(input_words)
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, padding='post')

In [None]:
# Character-level tokenizer for the target strings; filters='' keeps the '^' and '$' markers.
target_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
target_lang_tokenizer.fit_on_texts(target_words)
# Convert every string to a list of character ids, then pad at the end into one 2-D array.
target_tensor = target_lang_tokenizer.texts_to_sequences(target_words)
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post')

## Save Vocab for Source and Target

In [None]:
# Persist both fitted tokenizers next to the notebook.
# Each tokenizer is serialised to a JSON string first, and that string is then
# written with json.dump (so a reader has to json.load the file to get it back).
hindi_tokenizer_json = input_lang_tokenizer.to_json()
english_tokenizer_json = target_lang_tokenizer.to_json()

for tokens_path, tokenizer_json in (('hindi_tokens.json', hindi_tokenizer_json),
                                    ('english_tokens.json', english_tokenizer_json)):
  with open(tokens_path, 'w') as f:
    json.dump(tokenizer_json, f)

## Split data into Train and Validation

In [None]:
# Split into train (80%) and validation (20%).
# A fixed random_state makes the split identical after a kernel restart, so
# validation losses stay comparable between runs.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=42)

In [None]:
BATCH_SIZE = 64
BUFFER_SIZE = 120000

# Training pipeline: shuffled, then cut into full batches (a short final batch is dropped).
train_dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Validation pipeline: same batching, no shuffling.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Pull a single training batch; it is reused below to smoke-test the encoder.
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 47]), TensorShape([64, 173]))

## Summary of prepared dataset

In [None]:
# +1 because tokenizer ids start at 1; id 0 is the padding value.
vocab_inp_size = len(input_lang_tokenizer.word_index)+1
vocab_tar_size = len(target_lang_tokenizer.word_index)+1

# Padded sequence lengths (second axis of the padded id arrays).
max_length_input = input_tensor.shape[1]
max_length_output = target_tensor.shape[1]

embedding_dim = 256
units = 1024
# Number of full batches in the TRAINING split. Using the size of the whole
# dataset here would overstate the batch count, because 20% of the rows were
# moved to the validation split.
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE

In [None]:
print('Number of samples:', len(input_words))

print('Input Vocab size:', vocab_inp_size)
print('Target Vocab size:', vocab_tar_size)

print('Max sequence length for inputs:', max_length_input)
print('Max sequence length for outputs:', max_length_output)

Number of samples: 371035
Input Vocab size: 200
Target Vocab size: 137
Max sequence length for inputs: 47
Max sequence length for outputs: 173


# Encoder

In [None]:
# Build the encoder over the source vocabulary.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

## Test Encoder

In [None]:
## Test Encoder Stack
# Run one example batch through the encoder, starting from the zero state, and report the shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vector shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (64, 47, 1024)
Encoder h vecotr shape: (batch size, units) (64, 1024)
Encoder c vector shape: (batch size, units) (64, 1024)
Encoder output vector shape: (batch size, sequence length, units) (64, 47, 1024)


# Decoder

In [None]:
# Build the decoder over the target vocabulary, using the 'luong' attention option.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE,
                  max_length_input, max_length_output, 'luong')

## Test Decoder

In [None]:
# Test decoder stack
# The decoder input is a batch of token ids, so sample random integer ids from the
# target vocabulary (floats in [0, 1) would not be meaningful embedding indices).
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output), maxval=vocab_tar_size, dtype=tf.int32)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c])


sample_decoder_outputs = decoder(sample_x, initial_state, sample_output)

print("Decoder Outputs Shape: ", sample_decoder_outputs[0].shape)

Decoder Outputs Shape:  (64, 173, 137)


# Optimizer and Loss Function

In [None]:
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
  """Masked sparse cross-entropy, averaged over the non-padding positions.

  Args:
    real: true token ids, shape (BATCH_SIZE, max_length_output); id 0 is padding.
    pred: logits, shape (BATCH_SIZE, max_length_output, tar_vocab_size).

  Returns:
    Scalar loss: the sum of the per-token losses at non-padding positions divided
    by the number of such positions (0 if the batch holds only padding).
  """
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  per_token_loss = cross_entropy(y_true=real, y_pred=pred)

  # 1.0 where there is a real token, 0.0 on padding
  mask = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)

  # Normalise by the number of real tokens. A plain mean over every position would
  # also count the zeroed-out padding, so the value would shrink with the amount of
  # padding in the batch rather than reflect the per-token error.
  return tf.math.divide_no_nan(tf.reduce_sum(per_token_loss * mask), tf.reduce_sum(mask))

# Checkpoints

In [None]:
# Checkpoint object tracking the optimizer and both models; files are written
# under checkpoint_dir with the "ckpt" prefix.
checkpoint_dir = './data/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# Single train_step operation

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced optimisation step on a batch and return its loss.

  Args:
    inp: batch of source token ids.
    targ: batch of target token ids (start token first).
    enc_hidden: initial [h, c] state for the encoder.
  """
  # Teacher forcing: the decoder reads the target without its last position and
  # is scored against the target shifted left by one (i.e. without the start token).
  decoder_inputs = targ[:, :-1]
  expected = targ[:, 1:]

  with tf.GradientTape() as tape:
    enc_output, enc_h, enc_c = encoder(inp, enc_hidden)

    # The decoder starts from the encoder's final state.
    start_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c])

    # Only the logits are needed here; final state and attention weights are ignored.
    logits, _, _ = decoder(decoder_inputs, start_state, enc_output)
    loss = loss_function(expected, logits)

  # Update the encoder and decoder weights together.
  trainable = encoder.trainable_variables + decoder.trainable_variables
  optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))

  return loss

In [None]:
# Validation step: forward pass only, so no gradient tape is involved
@tf.function
def val_step(inp, targ, enc_hidden):
  """Return the loss of one validation batch without updating any weights."""
  # Same input/label shift as in training.
  decoder_inputs, expected = targ[:, :-1], targ[:, 1:]

  # Encode the batch and hand the final encoder state to the decoder.
  enc_output, enc_h, enc_c = encoder(inp, enc_hidden)
  start_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c])

  # The first element of the decoder result holds the logits.
  logits = decoder(decoder_inputs, start_state, enc_output)[0]
  return loss_function(expected, logits)

# Train

In [None]:
EPOCHS = 25
patience = 5                      # epochs to wait for a better validation loss
epochs_without_improvement = 0
best_val_loss = float('inf')

for epoch in range(EPOCHS):
  start = time.time()

  # -- Training phase --
  enc_hidden = encoder.initialize_hidden_state()
  total_train_loss = 0
  num_train_batches = 0
  # loop over the training batches
  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_train_loss += batch_loss
    num_train_batches += 1

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))

  # Average over the batches that were actually run: take() yields fewer than
  # steps_per_epoch batches whenever the dataset is shorter than that.
  loss = float(total_train_loss / max(num_train_batches, 1))


  # -- validation phase --
  enc_hidden = encoder.initialize_hidden_state()
  total_val_loss = 0
  num_val_batches = 0
  for (batch, (inp, targ)) in enumerate(val_dataset):
    batch_val_loss = val_step(inp, targ, enc_hidden)
    total_val_loss += batch_val_loss
    num_val_batches += 1

  # max(..., 1) avoids a division by zero if the validation set has no full batch
  val_loss = float(total_val_loss / max(num_val_batches, 1))
  print('Epoch {} Loss {:.4f} Val Loss {:.4f}'.format(epoch + 1,
                                                       loss,
                                                       val_loss))

  # -- checkpoint & early stopping --
  if val_loss < best_val_loss:
    best_val_loss = val_loss
    print('Saving checkpoint for epoch {} Loss {:.4f}'.format(epoch + 1, val_loss))
    checkpoint.save(file_prefix = checkpoint_prefix)
    epochs_without_improvement = 0
  else:
    epochs_without_improvement += 1

  # break the loop if there is no improvement
  if epochs_without_improvement >= patience:
    print(f"Early stopping triggered after {patience} consecutive epochs w/o improvement")
    break

  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.2329
Epoch 1 Batch 100 Loss 0.1237
Epoch 1 Batch 200 Loss 0.0989
Epoch 1 Batch 300 Loss 0.0988
Epoch 1 Batch 400 Loss 0.0887
Epoch 1 Batch 500 Loss 0.0944
Epoch 1 Batch 600 Loss 0.0800
Epoch 1 Batch 700 Loss 0.0663
Epoch 1 Batch 800 Loss 0.0466
Epoch 1 Batch 900 Loss 0.0326
Epoch 1 Batch 1000 Loss 0.0208
Epoch 1 Batch 1100 Loss 0.0239
Epoch 1 Batch 1200 Loss 0.0202
Epoch 1 Batch 1300 Loss 0.0175
Epoch 1 Batch 1400 Loss 0.0211
Epoch 1 Batch 1500 Loss 0.0181
Epoch 1 Batch 1600 Loss 0.0248
Epoch 1 Batch 1700 Loss 0.0215
Epoch 1 Batch 1800 Loss 0.0208
Epoch 1 Batch 1900 Loss 0.0168
Epoch 1 Batch 2000 Loss 0.0163
Epoch 1 Batch 2100 Loss 0.0212
Epoch 1 Batch 2200 Loss 0.0224
Epoch 1 Batch 2300 Loss 0.0178
Epoch 1 Batch 2400 Loss 0.0144
Epoch 1 Batch 2500 Loss 0.0170
Epoch 1 Batch 2600 Loss 0.0156
Epoch 1 Batch 2700 Loss 0.0208
Epoch 1 Batch 2800 Loss 0.0179
Epoch 1 Batch 2900 Loss 0.0169
Epoch 1 Batch 3000 Loss 0.0161
Epoch 1 Batch 3100 Loss 0.0189
Epoch 1 Batch 3200 L

In [None]:
encoder.embedding.variables

[<Variable path=encoder_2/embedding_8/embeddings, shape=(166, 256), dtype=float32, value=[[-0.0377021  -0.13381055 -0.11647186 ...  0.02190447 -0.18612249
    0.3392803 ]
  [-0.03409722  0.14195894 -0.0854991  ... -0.07803918 -0.00742843
    0.12786649]
  [ 0.02340595 -0.08440623  0.10921772 ...  0.11545631 -0.07786053
    0.02020929]
  ...
  [ 0.01179477  0.05889428  0.01100738 ...  0.03654375 -0.00295061
   -0.02032111]
  [ 0.09261505  0.00799699 -0.10922566 ...  0.16155736 -0.04701557
   -0.1135793 ]
  [ 0.02345644 -0.14304264  0.14749458 ... -0.05203421  0.19353712
    0.01092249]]>]

In [None]:
decoder.embedding.variables

[<Variable path=decoder_6/embedding_9/embeddings, shape=(36, 256), dtype=float32, value=[[ 0.02532734  0.04688115  0.02036213 ...  0.02987245 -0.01590071
    0.021944  ]
  [-0.24319564  0.12953956 -0.02794108 ... -0.14007922  0.00697453
   -0.16937147]
  [ 0.02966861  0.01044544 -0.01499049 ... -0.01741641 -0.00173
    0.04175289]
  ...
  [ 0.13381338 -0.06536956  0.08742485 ... -0.03918008 -0.1341787
    0.12816505]
  [ 0.10032396 -0.01028896  0.18520994 ...  0.02209266 -0.1323131
    0.08644268]
  [ 0.00735058  0.06271109  0.0967354  ... -0.05375795 -0.10891018
   -0.0059194 ]]>]