In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import string
import re
import os
import random
import pandas as pd
import pickle
import numpy as np
import tensorflow as tf
tf.config.run_functions_eagerly(True)
import tensorflow.keras as keras
from tensorflow.keras import layers

from collections import Counter


In [None]:
processed_data_location = "/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/processed_data"

### Vectorizers for english and hindi text

In [None]:
def load_vectorizer(location):
  import pickle
  from_disk = pickle.load(open(location, "rb"))
  vectorizer = layers.TextVectorization.from_config(from_disk['config'])
  vectorizer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
  vectorizer.set_weights(from_disk['weights'])
  return vectorizer

en_fasttext_vectorizer = load_vectorizer('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/EN_Fasttext_Vectorizer.pkl')
hi_fasttext_vectorizer = load_vectorizer('/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/HI_Fasttext_Vectorizer.pkl')



## Inference 

Loading saved encoder and decoder

In [None]:
encoder = keras.models.load_model("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/NMT_Models/Encoder_LSTM_Attention_tf",compile = False)
decoder = keras.models.load_model("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/NMT_Models/Decoder_LSTM_Attention_tf",compile = False)


Function to decode hindi vectorized tokens to hindi text

In [None]:
hi_voc = hi_fasttext_vectorizer.get_vocabulary()
hi_decoder = {i:voc for i,voc in enumerate(hi_voc)}
def hi_tokens_to_texts(L):
  decoded_text = []
  if(L[-1] != 3):
    L.append(3)
  for l in L:
    decoded_text.append(hi_decoder.get(l))
    if(l == 3):
      break

  return decoded_text

Beam Search

In [None]:
def get_beam_search(K,ignore_repeat = False):
  def beam_search(input_english_text):
    en_tokens = en_fasttext_vectorizer(["<START> " + input_english_text + " <END>"])
    decoder_starting = [2] + [0 for i in range(30)]
    candidates = [(decoder_starting,0.0)]
    encoded_input,states,input_mask = encoder(en_tokens)
    
    complete_sentences = []
    new_candidates = candidates

    input_len = len(input_english_text.split())

    for i in range(min(29,input_len+5)):
      candidates = sorted(new_candidates,key=lambda x : x[1])[:K]
      new_candidates = []
      # print(_)
      # print(candidates)
      for candidate,score in candidates:
        output = decoder(np.array([candidate]),encoded_input,states,input_mask)[0][i].numpy()
        if(ignore_repeat):
          for index in candidate:
            output[index] = 0
        top_K_idx = np.argsort(output)[-K:]
        for token in top_K_idx:
          prob = tf.nn.softmax(output)
          temp_candidate = candidate.copy()
          # print(i,len(temp_candidate))
          if(i+1 == len(temp_candidate)):
            break
          temp_candidate[i+1] = token
          temp_score = score - np.log(prob[token])

          new_candidate = (temp_candidate,temp_score)
          # print(token,new_candidate)
          if(token == 3):
            complete_sentences.append(new_candidate)
          else:
            new_candidates.append(new_candidate)
        
      if(len(complete_sentences) == 2*K):
        break;
      
      if(len(complete_sentences) >= K):
        tokens_list = sorted(complete_sentences,key=lambda x : x[1])[:K]
      else:
        tokens_list = complete_sentences +sorted(complete_sentences,key=lambda x : x[1])[:max(K-len(complete_sentences),0)]

      text_list = [(hi_tokens_to_texts(l[0]),l[1]) for l in tokens_list]
    return text_list
  
  return beam_search

    


### Bleu Score

In [None]:
from nltk.translate.bleu_score import sentence_bleu

beam_search = get_beam_search(3,True)


def get_bleu_score(pairs):
  scores = []
  i = 0
  erorr_list = []
  for pair in pairs:
    reference = [l[0][1:-1] for l in beam_search(pair['en'])]
    candidate = pair['hi'].split()[1:-1]
    try:
      scores.append(sentence_bleu(reference, candidate,weights=[(1.0,),(1.0/2,1.0/2)]))
    except:
      print(i)
      erorr_list.append((i,pair))
      
    i = i + 1
    if(i%50 == 0):
      print(i,np.mean(scores,axis = 0),np.median(scores,axis = 0))
  return np.mean(scores,axis = 0),np.median(scores,axis = 0)

In [None]:
with open(processed_data_location + "/valid_32.pkl",'rb') as f:
  valid = pickle.load(f)

with open(processed_data_location + "/test_32.pkl",'rb') as f:
  test = pickle.load(f)

In [7]:
mean_test_blue_scores,median_test_blue_scores = get_bleu_score(test + valid)
print("mean test bleu-1 and bleu-2 scores: ",mean_test_blue_scores)
print("median test bleu-1 and bleu-2 scores: ",median_test_blue_scores)


mean test bleu-1 and bleu-2 scores:  [0.29873173 0.10060721]
median test bleu-1 and bleu-2 scores:  [0.29634521 0.1004275 ]
