In [None]:
# Number of sentence pairs per batch; also used to size the shuffle buffer
# and to derive steps_per_epoch for training.
BATCH_SIZE = 64
# NOTE(review): not referenced by any of the visible cells — confirm whether
# this constant is still needed.
HIDDEN_UNITS = 1024

In [None]:
# Mount Google Drive (Colab only) so the data, vectorizer and model files
# under /content/drive are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import string
import re
import os
import random
import pandas as pd
import pickle
import numpy as np
import tensorflow as tf
tf.config.run_functions_eagerly(True)
import tensorflow.keras as keras
from tensorflow.keras import layers

from collections import Counter


In [None]:
# Directory on the mounted Drive that holds the pickled, preprocessed data.
processed_data_location = "/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/processed_data"

## Preprocessed data, vectorizers and embedding layers for English and Hindi text

In [None]:
# Load the preprocessed training examples (records with 'en' and 'hi' fields,
# as consumed by create_tf_dataset).
# NOTE: pickle.load can execute arbitrary code — only open files you trust.
with open(processed_data_location + "/train_32.pkl",'rb') as f:
  train = pickle.load(f)


In [None]:
# Fixed seed so the shuffle (and therefore the train/validation split) is
# reproducible on a fresh kernel.
random.seed(10)
random.shuffle(train)  # shuffles the list in place
train_size = len(train)
# 90% of the shuffled examples for training, the remaining 10% for validation.
# NOTE: `train` is overwritten here, so re-running this cell re-shuffles and
# re-splits the already reduced training set; reload the pickle first.
train, valid = train[:int(train_size*0.9)],train[int(train_size*0.9):]


In [None]:
# Recomputed after the split: number of training examples only
# (used later to derive steps_per_epoch).
train_size = len(train)

In [None]:
def load_vectorizer(location):
  """Rebuild a `TextVectorization` layer from a pickled config/weights dict.

  Args:
    location: Path to a pickle file containing a mapping with the keys
      'config' (layer config) and 'weights' (layer weights).

  Returns:
    A `layers.TextVectorization` layer with the stored weights applied.
  """
  # NOTE: pickle.load can execute arbitrary code — only load trusted files.
  # The context manager closes the file handle (it was previously leaked).
  with open(location, "rb") as f:
    from_disk = pickle.load(f)

  vectorizer = layers.TextVectorization.from_config(from_disk['config'])
  # Adapt on a dummy dataset before restoring the weights; presumably this
  # initialises the layer state that set_weights() then overwrites.
  vectorizer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
  vectorizer.set_weights(from_disk['weights'])
  return vectorizer

# Text vectorizers for the English (source) and Hindi (target) sentences,
# restored from pickles on the mounted Drive.
en_fasttext_vectorizer = load_vectorizer('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/EN_Fasttext_Vectorizer.pkl')
hi_fasttext_vectorizer = load_vectorizer('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/HI_Fasttext_Vectorizer.pkl')



In [None]:
def load_embedding(location):
  """Rebuild an `Embedding` layer from a pickled config dict.

  Args:
    location: Path to a pickle file containing a mapping with a 'config' key
      holding the layer config.

  Returns:
    A `layers.Embedding` layer created from the stored config.
  """
  # NOTE: pickle.load can execute arbitrary code — only load trusted files.
  # The context manager closes the file handle (it was previously leaked).
  with open(location, "rb") as f:
    from_disk = pickle.load(f)

  # NOTE(review): only the config is restored here, no weights are set.
  # Confirm the config itself carries the pretrained embedding matrix
  # (e.g. via its initializer); otherwise the layer starts untrained.
  embedding_layer = layers.Embedding.from_config(from_disk['config'])

  return embedding_layer

# Embedding layers for the English (source) and Hindi (target) vocabularies,
# restored from pickles on the mounted Drive.
en_embedding_layer = load_embedding("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/en_fasttext_embedding_layer.pkl")
hi_embedding_layer = load_embedding("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/hi_fasttext_embedding_layer.pkl")


## TensorFlow dataset

In [None]:
def create_tf_dataset(data):
  """Turn raw sentence-pair records into a batched teacher-forcing dataset.

  Args:
    data: Iterable of records indexable by 'en' and 'hi'.

  Returns:
    A `tf.data.Dataset` yielding `((en_ids, hi_ids[:, :-1]), hi_ids[:, 1:])`
    per batch: the decoder input is the Hindi sequence without its last
    token and the label is the same sequence shifted left by one.
  """
  sentence_pairs = [(record['en'], record['hi']) for record in data]

  def vectorize(batch):
    # Column 0 holds the English strings, column 1 the Hindi strings.
    return en_fasttext_vectorizer(batch[:, 0]), hi_fasttext_vectorizer(batch[:, 1])

  def to_teacher_forcing(en_ids, hi_ids):
    return (en_ids, hi_ids[:, :-1]), hi_ids[:, 1:]

  dataset = tf.data.Dataset.from_tensor_slices(sentence_pairs)
  dataset = dataset.shuffle(BATCH_SIZE*4)
  dataset = dataset.batch(BATCH_SIZE)
  dataset = dataset.map(vectorize)
  dataset = dataset.map(to_teacher_forcing)

  return dataset.prefetch(4)

In [None]:
# Batched, vectorized datasets for the training and validation splits.
training_dataset = create_tf_dataset(train)
validation_dataset = create_tf_dataset(valid)




In [None]:
# Drop the Python-side copies of the examples now that the tf.data datasets
# have been built. `train_size` was captured earlier and is still available.
del train,valid

## Model

In [None]:
# Number of positions in the positional-encoding table.
MAX_TOKENS = 32
# Number of stacked encoder layers and decoder layers.
N_LAYERS = 3 
# Model width. Must be even (PositionalEncoding) and divisible by N_HEADS
# (MultiHeadAttention). Presumably matches the fastText embedding dimension — TODO confirm.
D_MODEL = 300
# Hidden units of the position-wise feed-forward network.
DFF_UNITS = 512
# Size of the Hindi vocabulary; also the width of the final output projection.
VOCAB_SIZE = hi_fasttext_vectorizer.vocabulary_size()
# Dropout rate applied at the end of each feed-forward block.
DROPOUT = 0.2
# Number of attention heads (300 / 10 = 30 dimensions per head).
N_HEADS = 10


In [None]:
class PositionalEncoding(keras.layers.Layer):
    """Adds a fixed sinusoidal position table to its input.

    The table has shape (1, max_steps, max_dims): even feature indices hold
    sines and odd feature indices hold cosines of position / 10000**(2i/max_dims).

    Args:
      max_steps: Number of positions stored in the table.
      max_dims: Feature dimension of the table; must be even.
      dtype: Layer dtype; the table is cast to it.
    """

    def __init__(self,max_steps,max_dims,dtype=tf.float32,**kwargs):
      super().__init__(dtype=dtype,**kwargs)
      self.max_steps = max_steps
      self.max_dims = max_dims
      assert self.max_dims % 2 == 0

      # Broadcast positions (rows) against frequency indices (columns) to get
      # a (max_steps, max_dims // 2) grid of angles.
      positions = np.arange(self.max_steps)[:, np.newaxis]
      freq_index = np.arange(self.max_dims // 2)[np.newaxis, :]
      angles = positions / 10000**(2 * freq_index / self.max_dims)

      table = np.zeros((1,self.max_steps,self.max_dims))
      table[0, :, 0::2] = np.sin(angles)
      table[0, :, 1::2] = np.cos(angles)

      self.pos_embedding = tf.constant(table.astype(self.dtype))

    def call(self,inputs):
      """Return `inputs` plus the table cropped to the input's last two dims."""
      input_shape = tf.shape(inputs)
      n_steps, n_dims = input_shape[-2], input_shape[-1]
      return inputs + self.pos_embedding[:, :n_steps, :n_dims]

def Dot_Product_Attention(Q,K,V=None,Mask = None):
  """Scaled dot-product attention.

  Args:
    Q: Query tensor.
    K: Key tensor; its last dimension sets the scaling factor.
    V: Value tensor; defaults to `K` when omitted.
    Mask: Optional tensor broadcastable to the attention logits. It is
      multiplied by -1e9 and added to the logits, so entries equal to 1
      are effectively removed by the softmax.

  Returns:
    Tuple `(output, attention_weights)`: the weighted sum of the values and
    the softmax weights over the last axis.
  """
  values = K if V is None else V

  key_depth = tf.cast(tf.shape(K)[-1], tf.float32)
  logits = tf.matmul(Q, K, transpose_b=True) / tf.math.sqrt(key_depth)

  if Mask is not None:
    logits = logits + Mask * -1e9

  attention_weights = tf.nn.softmax(logits, axis=-1)

  return tf.matmul(attention_weights, values), attention_weights


class MultiHeadAttention(keras.layers.Layer):
  """Multi-head attention built on `Dot_Product_Attention`.

  Queries, keys and values are each projected to `d_model` features, split
  into `n_heads` heads of `d_model // n_heads` features, attended per head,
  merged again and passed through a final dense projection.

  Args:
    d_model: Total feature width; must be divisible by `n_heads`.
    n_heads: Number of attention heads.
  """

  def __init__(self, d_model, n_heads,**kwargs):
    super().__init__(**kwargs)
    assert d_model % n_heads == 0

    self.n_heads = n_heads
    self.d_model = d_model
    self.depth = d_model // n_heads

    # Input projections for queries, keys and values.
    self.wq = keras.layers.Dense(d_model, kernel_initializer='glorot_uniform')
    self.wk = keras.layers.Dense(d_model, kernel_initializer='glorot_uniform')
    self.wv = keras.layers.Dense(d_model, kernel_initializer='glorot_uniform')

    # Output projection applied after the heads are merged.
    self.dense = keras.layers.Dense(d_model, kernel_initializer='glorot_uniform')

  def split_heads(self, x):
    """Reshape (batch, seq, d_model) into (batch, n_heads, seq, depth)."""
    n_batch = tf.cast(tf.shape(x)[0], tf.int32)
    per_head = tf.reshape(x, (n_batch, -1, self.n_heads, self.depth))
    return tf.transpose(per_head, [0, 2, 1, 3])

  def call(self,Q,K,V = None,Mask = None):
    """Return `(output, attention_weights)`; `V` defaults to `K`."""
    n_batch = tf.shape(Q)[0]

    if V is None:
      V = K

    q_heads = self.split_heads(self.wq(Q))
    k_heads = self.split_heads(self.wk(K))
    v_heads = self.split_heads(self.wv(V))

    per_head_out, attention_weights = Dot_Product_Attention(q_heads, k_heads, v_heads, Mask)

    # (batch, n_heads, seq, depth) -> (batch, seq, d_model)
    merged = tf.reshape(
        tf.transpose(per_head_out, [0, 2, 1, 3]),
        (n_batch, -1, self.d_model))

    return self.dense(merged), attention_weights
    
class FeedForwardsNetwork(keras.layers.Layer):
  """Position-wise feed-forward block: Dense(relu) -> Dense -> Dropout.

  Args:
    d_model: Output width of the block.
    dff: Width of the hidden relu layer.
    dropout: Rate of the dropout applied to the block output.
  """

  def __init__(self,d_model,dff,dropout,**kwargs):
    super().__init__(**kwargs)
    self.d_model = d_model
    self.dff = dff
    self.dropout = dropout

    self.layer1 = keras.layers.Dense(
        dff, activation='relu', kernel_initializer='glorot_uniform')
    self.layer2 = keras.layers.Dense(
        d_model, kernel_initializer='glorot_uniform')
    self.layer3 = keras.layers.Dropout(dropout)

  def call(self,x):
    """Apply expand -> project -> dropout to `x`."""
    return self.layer3(self.layer2(self.layer1(x)))

class EncoderLayer(keras.layers.Layer):
  """One encoder block: self-attention and feed-forward, each followed by a
  residual connection and layer normalization.

  Args:
    d_model: Feature width of the block.
    n_heads: Number of attention heads.
    dff: Hidden width of the feed-forward network.
    dropout: Dropout rate used inside the feed-forward network.
  """

  def __init__(self, d_model,n_heads,dff,dropout,**kwargs):
    super().__init__(**kwargs)

    self.mha = MultiHeadAttention(d_model = d_model, n_heads = n_heads)
    self.ffn = FeedForwardsNetwork(d_model = d_model, dff = dff,dropout = dropout)

    # LayerNormalization normalizes each position over its features.
    # `keras.layers.Normalization` (used previously) is a preprocessing layer
    # that standardizes with dataset statistics obtained via adapt() and is
    # not a substitute for layer normalization.
    self.LayerNorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.LayerNorm2 = keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self,x, Mask):
    """Run the block on `x`, masking attention with `Mask`.

    Returns:
      Tensor produced by the second residual + normalization step.
    """
    attention_output, _ = self.mha(x,x,x, Mask)
    out1 = self.LayerNorm1(x + attention_output)

    ffn_output = self.ffn(out1)
    out2 = self.LayerNorm2(out1 + ffn_output)

    return out2

class DecoderLayer(keras.layers.Layer):
  """One decoder block: masked self-attention, attention over the encoder
  output and a feed-forward network, each followed by a residual connection
  and layer normalization.

  Args:
    d_model: Feature width of the block.
    n_heads: Number of attention heads.
    dff: Hidden width of the feed-forward network.
    dropout: Dropout rate used inside the feed-forward network.
  """

  def __init__(self, d_model,n_heads,dff,dropout,**kwargs):
    super().__init__(**kwargs)
    self.d_model = d_model
    self.n_heads = n_heads
    self.dff = dff
    self.dropout = dropout

    self.mha1 = MultiHeadAttention(d_model = self.d_model, n_heads = self.n_heads)
    self.mha2 = MultiHeadAttention(d_model = self.d_model, n_heads = self.n_heads)
    self.ffn = FeedForwardsNetwork(d_model = self.d_model, dff = self.dff,dropout = self.dropout)

    # LayerNormalization normalizes each position over its features.
    # `keras.layers.Normalization` (used previously) is a preprocessing layer
    # that standardizes with dataset statistics obtained via adapt() and is
    # not a substitute for layer normalization.
    self.LayerNorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.LayerNorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.LayerNorm3 = keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self,x,encoder_output,look_ahead_mask,padding_mask):
    """Run the block.

    Args:
      x: Decoder-side input tensor.
      encoder_output: Encoder output used as keys/values of the second
        attention.
      look_ahead_mask: Mask for the decoder self-attention.
      padding_mask: Mask for the attention over the encoder output.

    Returns:
      Tuple `(output, self_attention_weights, cross_attention_weights)`.
    """
    attention_out_1, attention_weights_1 = self.mha1(x,x,x,look_ahead_mask)
    out1 = self.LayerNorm1(x + attention_out_1)

    attention_out_2, attention_weights_2 = self.mha2(out1,encoder_output,encoder_output,padding_mask)
    out2 = self.LayerNorm2(out1 + attention_out_2)

    feedforward_out = self.ffn(out2)
    out3 = self.LayerNorm3(out2 + feedforward_out)

    # Return the attention *weights*; the caller stores these per layer as
    # attention maps (the attention outputs were returned here previously).
    return out3,attention_weights_1,attention_weights_2

    


In [None]:
class Encoder(keras.layers.Layer):
  """Embedding + positional encoding followed by a stack of `EncoderLayer`s.

  Args:
    d_model: Feature width of the model.
    n_layers: Number of encoder layers to stack.
    n_heads: Number of attention heads per layer.
    dff: Hidden width of each feed-forward network.
    dropout: Dropout rate passed to each layer.
    vocab_size: Accepted for interface symmetry; not used by this class.
    en_embedding_layer: Embedding layer applied to the input token ids.
  """

  def __init__(self,d_model,n_layers,n_heads,dff,dropout,vocab_size,en_embedding_layer,**kwargs):
    super().__init__(**kwargs)

    self.d_model = d_model
    self.n_layers = n_layers

    self.emb = en_embedding_layer
    self.pos_emb = PositionalEncoding(MAX_TOKENS,d_model)

    layer_stack = []
    for _ in range(self.n_layers):
      layer_stack.append(
          EncoderLayer(d_model = d_model,n_heads = n_heads,dff = dff,dropout = dropout))
    self.encoder_layers = layer_stack

  def call(self,x,mask):
    """Embed `x`, add positions and run every encoder layer with `mask`."""
    hidden = self.pos_emb(self.emb(x))

    for layer in self.encoder_layers:
      hidden = layer(hidden, mask)

    return hidden

class Decoder(keras.layers.Layer):
  """Embedding + positional encoding followed by a stack of `DecoderLayer`s.

  Args:
    d_model: Feature width of the model.
    n_layers: Number of decoder layers to stack.
    n_heads: Number of attention heads per layer.
    dff: Hidden width of each feed-forward network.
    dropout: Dropout rate passed to each layer.
    vocab_size: Accepted for interface symmetry; not used by this class.
    hi_embedding_layer: Embedding layer applied to the target token ids.
  """

  def __init__(self,d_model,n_layers,n_heads,dff,dropout,vocab_size,hi_embedding_layer,**kwargs):
    super().__init__(**kwargs)

    self.d_model = d_model
    self.n_layers = n_layers

    self.emb = hi_embedding_layer
    self.pos_emb = PositionalEncoding(MAX_TOKENS,d_model)

    layer_stack = []
    for _ in range(self.n_layers):
      layer_stack.append(DecoderLayer(d_model,n_heads,dff,dropout))
    self.decoder_layers = layer_stack

  def call(self,x,encoder_output,look_ahead_mask,padding_mask):
    """Run the decoder stack.

    Returns:
      Tuple `(output, attention_weights)` where `attention_weights` maps
      'decoder_layer{N}_block1' / 'decoder_layer{N}_block2' (N starting at 1)
      to the second and third values returned by layer N.
    """
    hidden = self.pos_emb(self.emb(x))
    attention_weights = {}

    for layer_no, layer in enumerate(self.decoder_layers, start=1):
      hidden, block1, block2 = layer(hidden, encoder_output, look_ahead_mask, padding_mask)
      attention_weights[f'decoder_layer{layer_no}_block1'] = block1
      attention_weights[f'decoder_layer{layer_no}_block2'] = block2

    return hidden, attention_weights

@tf.function
def create_padding_mask(seq):
  """Build a float mask that is 1.0 wherever `seq` equals 0 (padding).

  Returns:
    Tensor of shape (batch_size, 1, 1, seq_len) so it broadcasts over the
    head and query axes of the attention logits.
  """
  is_padding = tf.math.equal(seq, 0)
  mask = tf.cast(is_padding, tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]

@tf.function
def create_look_ahead_mask(size):
  """Build a (size, size) mask with 1.0 strictly above the diagonal.

  Position i may therefore only attend to positions <= i.
  """
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle




In [None]:

class Transformer(keras.models.Model):
  """Encoder-decoder Transformer mapping source token ids to target logits.

  Args:
    n_layers: Number of encoder layers and of decoder layers.
    d_model: Feature width of the model.
    n_heads: Number of attention heads.
    dff: Hidden width of the feed-forward networks.
    vocab_size: Width of the final projection (target vocabulary size).
    dropout: Dropout rate passed to the encoder and decoder.
    en_embedding_layer: Embedding layer for the source (encoder) tokens.
    hi_embedding_layer: Embedding layer for the target (decoder) tokens.
  """

  def __init__(self,n_layers,d_model,n_heads,dff,vocab_size,dropout,en_embedding_layer,hi_embedding_layer,**kwargs):
    super().__init__(**kwargs)
    
    self.n_layers = n_layers
    self.d_model = d_model
    self.n_heads = n_heads
    self.dff = dff
    self.vocab_size = vocab_size
    self.dropout = dropout

    # The encoder consumes the first model input (the source sentence) and
    # the decoder the second (the target sentence), so each must receive the
    # embedding of its own language. These two were swapped before.
    self.encoder = Encoder(d_model = d_model,n_layers = n_layers,n_heads = n_heads,dff = dff,dropout = dropout,vocab_size = vocab_size,en_embedding_layer = en_embedding_layer)
    self.decoder = Decoder(d_model = d_model,n_layers = n_layers,n_heads = n_heads,dff = dff,dropout = dropout,vocab_size = vocab_size,hi_embedding_layer = hi_embedding_layer)
    
    self.final_layer = tf.keras.layers.Dense(vocab_size,kernel_initializer='glorot_uniform')

  def create_masks(self,inp,tar):
    """Build the attention masks for one batch.

    Returns:
      Tuple `(padding_mask, look_ahead_mask)`: the source padding mask, and
      the element-wise maximum of the causal mask and the target padding
      mask (a position is masked if either mask marks it).
    """
    padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    padding_mask_target = create_padding_mask(tar)

    look_ahead_mask = tf.maximum(look_ahead_mask,padding_mask_target)

    return padding_mask,look_ahead_mask
  
  def call(self,input):
    """Compute target-vocabulary logits.

    Args:
      input: Pair `(inp, tar)` of source and target token-id tensors.

    Returns:
      Output of the final dense layer applied to the decoder output.
    """
    inp,tar = input
    padding_mask,look_ahead_mask = self.create_masks(inp,tar)

    encoder_output = self.encoder(inp,padding_mask)

    # The per-layer attention maps are computed but not returned by the model.
    decoder_output,attention_weights = self.decoder(tar,encoder_output,look_ahead_mask,padding_mask)

    final_output = self.final_layer(decoder_output)

    return final_output
    

In [None]:
# Lower-case aliases of the hyper-parameter constants, passed to the model below.
n_layers = N_LAYERS
d_model = D_MODEL
dff = DFF_UNITS
vocab_size = VOCAB_SIZE
dropout = DROPOUT
n_heads = N_HEADS

# Build the translation model with the pretrained English/Hindi embedding layers.
transformer = Transformer(n_layers=n_layers, d_model=d_model, n_heads=n_heads, dff=dff,vocab_size=vocab_size,dropout = dropout,en_embedding_layer = en_embedding_layer,hi_embedding_layer = hi_embedding_layer)


In [None]:
class MaskedLoss(tf.keras.losses.Loss):
  """Sparse categorical cross-entropy (from logits) that ignores padding.

  Positions where `y_true` is 0 contribute nothing. The remaining per-token
  losses are summed along axis 1 and the sums are averaged.
  """

  def __init__(self,**kwargs):
    super().__init__(**kwargs)
    self.name = 'masked_loss'
    # reduction='none' keeps the per-token losses so the mask can be applied.
    self.loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

  def __call__(self, y_true, y_pred, sample_weight=None):
    """Return the masked loss as a scalar.

    Args:
      y_true: Integer target ids; 0 marks padding.
      y_pred: Logits over the vocabulary.
      sample_weight: Accepted for compatibility with Keras, which passes it
        by keyword; it is ignored. It now defaults to None so the loss can
        also be called directly as `masked_loss(y_true, y_pred)`.
    """
    per_token_loss = self.loss(y_true, y_pred)
    mask = tf.cast(y_true != 0, tf.float32)
    per_token_loss *= mask

    return tf.reduce_mean(tf.reduce_sum(per_token_loss,axis = 1))

masked_loss = MaskedLoss()



## Training

In [None]:
# Stop after 3 epochs without an improvement of at least 1 in the monitored
# metric (the callback's default), then restore the best weights seen.
Early_Stopping = keras.callbacks.EarlyStopping(patience = 3,min_delta = 1,restore_best_weights=True)
# Keep only the best weights (not the full model) on the mounted Drive.
Model_Checkpoint = keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/NMT_Models/Transformer.h5',save_best_only=True,save_weights_only=True)

# Adam with a fixed learning rate of 5e-5, trained on the padding-masked loss.
optimizer = keras.optimizers.Adam(5e-5)
transformer.compile(optimizer = optimizer,loss = masked_loss)


In [1]:
# The training dataset repeats indefinitely, so steps_per_epoch defines an
# "epoch": train_size // (BATCH_SIZE*5) batches, i.e. roughly one fifth of
# the training examples. Validation runs on the full validation dataset.
# Training also aborts if the loss becomes NaN.
history_NMT = transformer.fit(training_dataset.repeat(),steps_per_epoch=train_size//(BATCH_SIZE*5),epochs = 15,validation_data=validation_dataset,callbacks =[Early_Stopping,Model_Checkpoint,keras.callbacks.TerminateOnNaN()])
