In [1]:
# Number of sentence pairs per batch; create_tf_dataset also uses it to size its shuffle buffer (BATCH_SIZE*4).
BATCH_SIZE = 128

In [2]:
# Mount Google Drive in the Colab runtime; the data and model paths used in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import string
import re
import os
import random
import pandas as pd
import pickle
import numpy as np
import tensorflow as tf
tf.config.run_functions_eagerly(True)
import tensorflow.keras as keras
from tensorflow.keras import layers

from collections import Counter


In [4]:
# Drive directory that holds the pickled training data (train.pkl is read from here).
processed_data_location = "/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/processed_data"

### Preprocessed data, vectorizers, and embedding layers for English and Hindi text


In [5]:
# Read the pickled training examples from Drive.
train_pickle_path = f"{processed_data_location}/train.pkl"
with open(train_pickle_path, 'rb') as train_file:
  train = pickle.load(train_file)

print("Number of Training examples:", len(train))

Number of Training examples: 1448877


In [6]:
# Seeded in-place shuffle, then a 90/10 split into training and validation examples.
random.seed(10)
random.shuffle(train)
train_size = len(train)
split_index = int(train_size*0.9)
train, valid = train[:split_index], train[split_index:]


In [7]:
def load_vectorizer(location):
  """Rebuild a `TextVectorization` layer from a pickle file.

  The pickle is expected to hold a dict with a 'config' entry (passed to
  `TextVectorization.from_config`) and a 'weights' entry (passed to
  `set_weights`).

  Args:
    location: Path of the pickle file to read.

  Returns:
    The restored `TextVectorization` layer.
  """
  # NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
  # The context manager closes the file handle as soon as the payload is read.
  with open(location, "rb") as vectorizer_file:
    from_disk = pickle.load(vectorizer_file)

  vectorizer = layers.TextVectorization.from_config(from_disk['config'])
  # Adapt on a throw-away dataset before restoring the saved weights
  # (presumably so the layer's state exists for set_weights to fill in).
  vectorizer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
  vectorizer.set_weights(from_disk['weights'])
  return vectorizer

# Restore the pickled text vectorizers: one for the English text, one for the Hindi text.
en_fasttext_vectorizer = load_vectorizer('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/EN_Fasttext_Vectorizer.pkl')
hi_fasttext_vectorizer = load_vectorizer('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/HI_Fasttext_Vectorizer.pkl')



In [8]:
def load_embedding(location):
  """Rebuild an `Embedding` layer from a pickled config.

  Only the 'config' entry of the pickled dict is used; this function does not
  restore any weights into the returned layer.

  Args:
    location: Path of the pickle file to read.

  Returns:
    An `Embedding` layer created with `Embedding.from_config`.
  """
  # NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
  # The context manager closes the file handle as soon as the payload is read.
  with open(location, "rb") as embedding_file:
    from_disk = pickle.load(embedding_file)

  return layers.Embedding.from_config(from_disk['config'])

# Rebuild the English and Hindi embedding layers from their pickled configs.
en_embedding_layer = load_embedding("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/en_fasttext_embedding_layer.pkl")
hi_embedding_layer = load_embedding("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/hi_fasttext_embedding_layer.pkl")


## TensorFlow dataset

In [9]:
def create_tf_dataset(data):
  """Turn a list of {'en': ..., 'hi': ...} examples into a batched tf.data pipeline.

  Each element of the returned dataset is `((en_tokens, hi_tokens[:, :-1]), hi_tokens[:, 1:])`:
  the Hindi tokens without their last position are paired with the English
  tokens as model input, and the Hindi tokens shifted by one position are the
  target.

  Args:
    data: Iterable of mappings with 'en' and 'hi' entries.

  Returns:
    A shuffled, batched, vectorized and prefetched `tf.data.Dataset`.
  """
  sentence_pairs = [(example['en'], example['hi']) for example in data]

  def vectorize_batch(pair_batch):
    # Column 0 holds the English text, column 1 the Hindi text.
    return en_fasttext_vectorizer(pair_batch[:, 0]), hi_fasttext_vectorizer(pair_batch[:, 1])

  def split_decoder_input_and_target(en_tokens, hi_tokens):
    return (en_tokens, hi_tokens[:, :-1]), hi_tokens[:, 1:]

  dataset = tf.data.Dataset.from_tensor_slices(sentence_pairs)
  dataset = dataset.shuffle(BATCH_SIZE*4)
  dataset = dataset.batch(BATCH_SIZE)
  dataset = dataset.map(vectorize_batch)
  dataset = dataset.map(split_decoder_input_and_target)
  return dataset.prefetch(4)

In [10]:
# Build the tf.data pipelines for the training and validation splits.
training_dataset = create_tf_dataset(train)
validation_dataset = create_tf_dataset(valid)




In [11]:
# Drop the names of the raw Python lists now that the datasets are built (presumably to free memory).
del train,valid

## Model

In [12]:
# Size of the Hindi vectorizer's vocabulary; passed to the model as the decoder's output vocabulary size.
VOCAB_SIZE = hi_fasttext_vectorizer.vocabulary_size()
# Number of LSTM units, used for both the encoder and the decoder.
HIDDEN_UNITS = 1024

In [13]:
class Encoder(keras.layers.Layer):
  """Embeds the input tokens and runs them through a single LSTM.

  Args:
    encoder_units: Number of units of the LSTM.
    embedding_layer: Layer applied to the input tokens before the LSTM.
    **kwargs: Forwarded to `keras.layers.Layer`.
  """

  def __init__(self, encoder_units, embedding_layer, **kwargs):
    super().__init__(**kwargs)
    self.encoder_units = encoder_units
    self.embedding = embedding_layer
    # return_state=True: the LSTM call yields its output plus its two final states.
    self.lstm = keras.layers.LSTM(
        encoder_units,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, input_tokens):
    """Return `(output, [state_1, state_2])` of the LSTM for `input_tokens`."""
    lstm_output, first_state, second_state = self.lstm(self.embedding(input_tokens))
    return lstm_output, [first_state, second_state]

In [14]:
class Decoder(keras.layers.Layer):
  """Embeds the input tokens, runs an LSTM over them and projects every step to vocabulary logits.

  Args:
    output_vocab_size: Width of the final dense projection.
    decoder_units: Number of units of the LSTM.
    embedding_layer: Layer applied to the input tokens before the LSTM.
    **kwargs: Forwarded to `keras.layers.Layer`.
  """

  def __init__(self, output_vocab_size, decoder_units, embedding_layer, **kwargs):
    super().__init__(**kwargs)
    self.output_vocab_size = output_vocab_size
    self.decoder_units = decoder_units
    self.embedding = embedding_layer
    # return_sequences=True: one output per input position rather than only the last one.
    self.lstm = keras.layers.LSTM(
        decoder_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
    )
    vocab_projection = keras.layers.Dense(output_vocab_size, kernel_initializer='glorot_uniform')
    self.fc = keras.layers.TimeDistributed(vocab_projection)

  def call(self, input_tokens, input_state):
    """Return the per-position projection outputs, starting the LSTM from `input_state`."""
    embedded_tokens = self.embedding(input_tokens)
    sequence_output = self.lstm(embedded_tokens, initial_state=input_state)
    return self.fc(sequence_output)


In [15]:
class NMT(keras.Model):
  """Encoder-decoder model: the encoder's final states initialise the decoder.

  Args:
    output_vocab_size: Output vocabulary size handed to the decoder.
    encoder_units: Number of LSTM units in the encoder.
    decoder_units: Number of LSTM units in the decoder.
    encoder_embedding_layer: Embedding layer used by the encoder.
    decoder_embedding_layer: Embedding layer used by the decoder.
    **kwargs: Forwarded to `keras.Model`.
  """

  def __init__(self, output_vocab_size, encoder_units, decoder_units,
               encoder_embedding_layer, decoder_embedding_layer, **kwargs):
    super().__init__(**kwargs)
    self.encoder = Encoder(encoder_units, encoder_embedding_layer)
    self.decoder = Decoder(output_vocab_size, decoder_units, decoder_embedding_layer)

  def call(self, input_tokens):
    """Run the pair `(encoder_tokens, decoder_tokens)` through encoder and decoder."""
    source_tokens, target_tokens = input_tokens
    # Only the encoder's final states are needed; its output is discarded.
    _, final_states = self.encoder(source_tokens)
    return self.decoder(target_tokens, final_states)


In [16]:
class MaskedLoss(tf.keras.losses.Loss):
  """Sparse categorical cross-entropy (from logits) that ignores positions labelled 0.

  Per-position losses are zeroed wherever `y_true` is 0 (presumably the padding
  id -- confirm against the target vectorizer), summed over axis 1, and the
  mean of those sums is returned.
  """

  def __init__(self,**kwargs):
    super().__init__(**kwargs)
    self.name = 'masked_loss'
    # reduction='none' keeps one loss value per position so the mask can be applied before reducing.
    self.loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

  def __call__(self, y_true, y_pred, sample_weight=None):
    """Compute the masked loss.

    Args:
      y_true: Integer labels; positions equal to 0 contribute nothing.
      y_pred: Logits for each position.
      sample_weight: Accepted for compatibility with the Keras loss call
        signature; it is not used. Defaults to None so the loss can also be
        called directly as `loss(y_true, y_pred)`.

    Returns:
      Scalar tensor: mean over the per-example sums of the masked losses.
    """
    loss = self.loss(y_true, y_pred)
    # Cast the mask to the loss dtype so the product is valid whatever dtype the logits use.
    mask = tf.cast(tf.not_equal(y_true, 0), loss.dtype)
    loss *= mask

    return tf.reduce_mean(tf.reduce_sum(loss,axis = 1))

# Loss instance handed to NMT_model.compile.
masked_loss = MaskedLoss()

In [17]:
# Full model: encoder and decoder both use HIDDEN_UNITS LSTM units and the embedding layers loaded above.
NMT_model = NMT(VOCAB_SIZE,HIDDEN_UNITS,HIDDEN_UNITS,en_embedding_layer,hi_embedding_layer)

## Training

In [18]:
# Adam with a 1e-3 learning rate, compiled together with the masked loss defined above.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
NMT_model.compile(optimizer = optimizer,loss = masked_loss)

In [19]:
# Early stopping with patience 2 and a minimum improvement of 0.5; the best weights are restored when training stops.
# No `monitor` is given, so the Keras default applies.
Early_Stopping = keras.callbacks.EarlyStopping(patience = 2,min_delta = 0.5,restore_best_weights=True)
# Checkpoint that writes weights only, and only when the monitored value improves, to LSTM.h5 on Drive.
Model_Checkpoint = keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/NMT_Models/LSTM.h5',save_best_only=True,save_weights_only=True)


In [21]:
# history_NMT = NMT_model.fit(training_dataset,epochs = 10,validation_data=validation_dataset,callbacks =[Early_Stopping,Model_Checkpoint,keras.callbacks.TerminateOnNaN()])


## Saving Encoder and Decoder for inference



In [22]:
# Fetch one batch to run through the models below. next(iter(...)) fails right
# here if the dataset is empty, instead of leaving `item` undefined (or stale
# from an earlier run) for later cells to trip over.
item = next(iter(training_dataset.take(1)))


In [23]:
# Run one batch through the model (output discarded) before loading the weights --
# presumably so the model's variables exist when load_weights fills them in.
_ = NMT_model(item[0])
NMT_model.load_weights('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/NMT_Models/LSTM.h5')

In [24]:
# Stand-alone encoder and decoder layers. Each is called once on the sample
# batch; only the encoder's states are kept, to feed the decoder call.
encoder = Encoder(HIDDEN_UNITS,en_embedding_layer)
_, encoder_states = encoder(item[0][0])

decoder = Decoder(VOCAB_SIZE,HIDDEN_UNITS,hi_embedding_layer)
_ = decoder(item[0][1], encoder_states)

In [25]:
# Copy the trained weights sub-layer by sub-layer. Reading them from
# NMT_model.encoder / NMT_model.decoder avoids slicing the model's flat weight
# list at a hard-coded index, which would silently break if either layer
# gained or lost a variable.
encoder.set_weights(NMT_model.encoder.get_weights())
decoder.set_weights(NMT_model.decoder.get_weights())

In [26]:
class EncoderModel(keras.Model):
  """`keras.Model` wrapper around an encoder layer so it can be saved on its own."""

  def __init__(self,encoder):
    super().__init__()
    self.encoder = encoder
  
  def call(self,input_tokens):
    """Forward `input_tokens` to the wrapped encoder and return its result unchanged."""
    return self.encoder(input_tokens)


class DecoderModel(keras.Model):
  """`keras.Model` wrapper around a decoder layer so it can be saved on its own."""

  def __init__(self,decoder):
    super().__init__()
    self.decoder = decoder
  
  def call(self,input_tokens,input_state):
    """Forward the tokens and the initial state to the wrapped decoder and return its result unchanged."""
    return self.decoder(input_tokens,input_state)



In [27]:
# Wrap the stand-alone encoder and decoder layers in keras.Model objects.
Encoder_model = EncoderModel(encoder)
Decoder_model = DecoderModel(decoder)

In [28]:
# Call both wrapper models once on the sample batch; the encoder's states are
# named explicitly instead of being read back out of `_`.
_, encoder_model_states = Encoder_model(item[0][0])
_ = Decoder_model(item[0][1], encoder_model_states)

In [None]:
# Export both inference models in the 'tf' save format, one directory per model.
NMT_MODELS_DIR = '/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/NMT_Models'
for inference_model, export_name in ((Encoder_model, 'Encoder_LSTM_tf'), (Decoder_model, 'Decoder_LSTM_tf')):
  inference_model.save(f'{NMT_MODELS_DIR}/{export_name}', save_format='tf')