In [1]:
pip install tensorflow_addons



In [2]:
%cd /content/drive/MyDrive/Colab/Transliterate/Main

/content/drive/MyDrive/Colab/Transliterate/Main


In [3]:
import json
import os
import time

import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa

from sklearn.model_selection import train_test_split

from encoder import Encoder
from decoder import Decoder

In [4]:
print(tf.__version__)

2.7.0


# Load Dataset

In [5]:
input_target_df = pd.read_csv('./data/merged_hi_to_en.csv')

# Prepare Data for training

## Append start and end of sequence for target

In [6]:
# append start and end of sequence for target
sos = '^'
eos = '$'

input_target_df['english'] = sos + input_target_df['english'].astype(str) + eos

In [7]:
input_target_df.head()

Unnamed: 0,hindi,english
0,प्रयात सुबोध चन्द्र दास,^late subodh chandra das$
1,स्वर्गीय पिनाकि कुंडू,^late pinaki kundu$
2,मृत नूर महम्मद बिश्वास,^late nur mohammad biswas$
3,नारायण चंद्र डॉं,^narayan chandra dawn$
4,आब्दुल मन्नान,^abdul mannan$


## Construct Vocab for Source and Target

In [8]:
input_words =  input_target_df['hindi'].tolist()
target_words =  input_target_df['english'].tolist()

In [9]:
print(f"Total number of input words {len(input_words)}")
print(f"Total number of target words {len(target_words)}")

Total number of input words 116021
Total number of target words 116021


In [10]:
input_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
input_lang_tokenizer.fit_on_texts(input_words)
input_tensor = input_lang_tokenizer.texts_to_sequences(input_words)
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, padding='post')

In [11]:
target_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
target_lang_tokenizer.fit_on_texts(target_words)
target_tensor = target_lang_tokenizer.texts_to_sequences(target_words)
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post')

## Save Vocab for Source and Target

In [12]:
# save tokenizers
hindi_tokenizer_json = input_lang_tokenizer.to_json()
with open('hindi_tokens.json', 'w') as f:
  json.dump(hindi_tokenizer_json, f)

english_tokenizer_json = target_lang_tokenizer.to_json()
with open('english_tokens.json', 'w') as f:
  json.dump(english_tokenizer_json, f)

## Split data into Train and Validation

In [13]:
# split train and validation
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [14]:
BATCH_SIZE = 64
BUFFER_SIZE = 120000

train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [15]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 47]), TensorShape([64, 49]))

## Summary of prepared dataset

In [16]:
vocab_inp_size = len(input_lang_tokenizer.word_index)+1
vocab_tar_size = len(target_lang_tokenizer.word_index)+1

max_length_input = input_tensor.shape[1]
max_length_output = target_tensor.shape[1]

embedding_dim = 256
units = 1024
steps_per_epoch = input_tensor.shape[0]//BATCH_SIZE

In [17]:
print('Number of samples:', len(input_words))

print('Input Vocab size:', vocab_inp_size)
print('Target Vocab size:', vocab_tar_size)

print('Max sequence length for inputs:', max_length_input)
print('Max sequence length for outputs:', max_length_output)

Number of samples: 116021
Input Vocab size: 166
Target Vocab size: 36
Max sequence length for inputs: 47
Max sequence length for outputs: 49


# Encoder

In [18]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

## Test Encoder

In [19]:
## Test Encoder Stack
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (64, 47, 1024)
Encoder h vecotr shape: (batch size, units) (64, 1024)
Encoder c vector shape: (batch size, units) (64, 1024)


# Decoder

In [20]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, 
                  max_length_input, max_length_output, 'luong')

## Test Decoder

In [21]:
# Test decoder stack
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)

sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (64, 48, 36)


# Optimizer and Loss Function

In [22]:
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
  # real shape = (BATCH_SIZE, max_length_output)
  # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  loss = cross_entropy(y_true=real, y_pred=pred)
  mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
  mask = tf.cast(mask, dtype=loss.dtype)  
  loss = mask* loss
  loss = tf.reduce_mean(loss)
  return loss 

# Checkpoints

In [23]:
checkpoint_dir = './data/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# One train_step operations

In [24]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


    dec_input = targ[ : , :-1 ] # Ignore <end> token
    real = targ[ : , 1: ]         # ignore <start> token

    # Set the AttentionMechanism object with encoder_outputs
    decoder.attention_mechanism.setup_memory(enc_output)

    # Create AttentionWrapperState as initial_state for decoder
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
    pred = decoder(dec_input, decoder_initial_state)
    logits = pred.rnn_output
    loss = loss_function(real, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

# Train

In [25]:
EPOCHS = 25

best_loss = 1
for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  # print(enc_hidden[0].shape, enc_hidden[1].shape)
  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))

  loss = total_loss / steps_per_epoch
  # save checkpoints with best loss
  if loss < best_loss:
    best_loss = loss
    print('Saving checkpoint for epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.6585
Epoch 1 Batch 100 Loss 0.4614
Epoch 1 Batch 200 Loss 0.2636
Epoch 1 Batch 300 Loss 0.1927
Epoch 1 Batch 400 Loss 0.1635
Epoch 1 Batch 500 Loss 0.1268
Epoch 1 Batch 600 Loss 0.1506
Epoch 1 Batch 700 Loss 0.1183
Epoch 1 Batch 800 Loss 0.1279
Epoch 1 Batch 900 Loss 0.0877
Epoch 1 Batch 1000 Loss 0.1213
Epoch 1 Batch 1100 Loss 0.1009
Epoch 1 Batch 1200 Loss 0.0971
Epoch 1 Batch 1300 Loss 0.1098
Epoch 1 Batch 1400 Loss 0.0898
Saving checkpoint for epoch 1 Loss 0.1385
Epoch 1 Loss 0.1385
Time taken for 1 epoch 227.73123383522034 sec

Epoch 2 Batch 0 Loss 0.0962
Epoch 2 Batch 100 Loss 0.0923
Epoch 2 Batch 200 Loss 0.0889
Epoch 2 Batch 300 Loss 0.0975
Epoch 2 Batch 400 Loss 0.1080
Epoch 2 Batch 500 Loss 0.0906
Epoch 2 Batch 600 Loss 0.0884
Epoch 2 Batch 700 Loss 0.0850
Epoch 2 Batch 800 Loss 0.1165
Epoch 2 Batch 900 Loss 0.1037
Epoch 2 Batch 1000 Loss 0.0831
Epoch 2 Batch 1100 Loss 0.0702
Epoch 2 Batch 1200 Loss 0.0818
Epoch 2 Batch 1300 Loss 0.0765
Epoch 2 Batch 14

KeyboardInterrupt: ignored

In [26]:
encoder.embedding.variables

[<tf.Variable 'encoder/embedding/embeddings:0' shape=(166, 256) dtype=float32, numpy=
 array([[-0.07065372, -0.04891422,  0.01282918, ...,  0.04568648,
         -0.07823983, -0.07764788],
        [-0.09147184,  0.19109847,  0.12537214, ..., -0.01790996,
          0.00141883, -0.18570372],
        [ 0.03353083,  0.1054965 , -0.06611574, ...,  0.01166355,
         -0.11960571, -0.16839524],
        ...,
        [-0.19065775,  0.28302342,  0.07893948, ..., -0.1513373 ,
          0.04492971,  0.12638961],
        [ 0.09505298, -0.44047958,  0.02591725, ..., -0.02426921,
         -0.421986  ,  0.23949109],
        [-0.01180682,  0.13395208,  0.03590768, ...,  0.10349247,
         -0.02466262,  0.2182667 ]], dtype=float32)>]

In [27]:
decoder.embedding.variables

[<tf.Variable 'decoder/embedding_1/embeddings:0' shape=(36, 256) dtype=float32, numpy=
 array([[ 0.03724634, -0.02989709, -0.04813312, ..., -0.00780924,
          0.00926129,  0.04338977],
        [ 0.46307915, -0.22748752, -0.31411505, ...,  0.7708867 ,
         -0.1213175 , -0.47693196],
        [ 0.05063611,  0.08911494, -0.11557912, ...,  0.0591362 ,
         -0.04259794, -0.08504502],
        ...,
        [-0.557036  , -0.32107395,  0.31832972, ...,  0.02268083,
         -0.10818028, -0.12645367],
        [ 0.45082635,  0.03947921, -0.22183114, ..., -0.04993758,
          0.01296891, -0.06116186],
        [-0.00486869, -0.15992434,  0.17243467, ...,  0.11373673,
          0.10106619,  0.16622567]], dtype=float32)>]