In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Installing huggingface library 'dataset'. English to Hindi Translation data can be downloaded through this library.


In [2]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[K     |████████████████████████████████| 452 kB 5.1 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 56.0 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 66.8 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 53.2 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 61.3 MB/s 
Installing colle

In [3]:
import numpy as np
import string
import re
import os
import random
import pandas as pd
import pickle
import numpy as np
import tensorflow as tf
tf.config.run_functions_eagerly(True)

import tensorflow.keras as keras
from tensorflow.keras import layers

from datasets import load_dataset, load_from_disk
from collections import Counter


In [16]:
raw_data_location = "/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/Raw_data"
processed_data_location = "/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/processed_data"

### Fetch Data

Load and save dataset released by IIT Bombay for english-hindi translation.

In [None]:
dataset = load_dataset("cfilt/iitb-english-hindi")
dataset.save_to_disk(raw_data_location)

In [5]:
dataset = load_from_disk(raw_data_location)



In [6]:
dataset['train']['translation'][:5]

[{'en': 'Give your application an accessibility workout',
  'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'},
 {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'},
 {'en': 'The default plugin layout for the bottom panel',
  'hi': 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'},
 {'en': 'The default plugin layout for the top panel',
  'hi': 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका'},
 {'en': 'A list of plugins that are disabled by default',
  'hi': 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है'}]

### Preprocessing data

In [8]:
def process_english_text(text):
  text = str(text).lower()
  text = text.replace('\n','')
  text = ''.join(ch for ch in text if ch not in string.punctuation)
  text = re.sub(r'\d+','NUM',text)
  text = re.sub(r'\s+',' ',text)
  text = text.strip()
  return text

def process_hindi_text(text):
  text = str(text).lower()
  text = text.replace('\n','')
  text = ''.join(ch for ch in text if ch not in string.punctuation)
  text = re.sub(r'\d+','NUM',text)
  text = re.sub(r'\s+',' ',text)
  text = text.strip()
  text = re.sub(r'[a-zA-Z]','',text)
  return text

def process_text(en_hi_text_dict):
  en_text = '<START> ' + process_english_text(en_hi_text_dict['en']) + ' <END>'
  hi_text = '<START> ' + process_hindi_text(en_hi_text_dict['hi']) + ' <END>'

  return {'en': en_text,'hi': hi_text}

def process_texts(data):
  processed_data = []
  for en_hi_text_dict in data:
    processed_data.append(process_text(en_hi_text_dict))

  return processed_data

In [9]:
train = process_texts(dataset['train']['translation'])
valid = process_texts(dataset['validation']['translation'])
test = process_texts(dataset['test']['translation'])

In [10]:
del dataset

We need to first decide what length of texts are we going to work on.

In [11]:
def get_lengths(en_hi_texts):
  en_length = []
  hi_length = []

  for en_hi_text in en_hi_texts:
    en = en_hi_text['en']
    hi = en_hi_text['hi']

    en_length.append(len(en.split()))
    hi_length.append(len(hi.split()))
  
  return en_length,hi_length
    
    

In [12]:
en_length,hi_length = get_lengths(train)

In [None]:
round(sum(np.array(en_length) <= 32) / len(en_length)*100,2)

90.47

In [None]:
round(sum(np.array(hi_length) <= 32) / len(hi_length)*100,2)

88.59

In [None]:
round(sum((np.array(en_length) <= 32)*(np.array(hi_length) <= 32)) / len(en_length)*100,2)

87.33

87.3% of the english-hindi pairs have length less than 32 for both english and hindi sentences. Hence, length of 32 is apt for our translation model.

In [17]:
def filter_on_length(en_hi_texts):
  filtered_en_hi_texts = []
  for en_hi_text in en_hi_texts:
    if(len(en_hi_text['en'].split()) <= 32):
      if(len(en_hi_text['hi'].split()) <= 32):
        filtered_en_hi_texts.append(en_hi_text)
  
  return filtered_en_hi_texts

In [18]:
train = filter_on_length(train)
valid = filter_on_length(valid)
test = filter_on_length(test)


Saving final english-hindi pairs with length less than or equal to 32 for both english and hindi sentences.

In [19]:
with open(processed_data_location + "/train.pkl",'wb') as f:
  pickle.dump(train,f)
with open(processed_data_location + "/valid.pkl",'wb') as f:
  pickle.dump(valid,f)
with open(processed_data_location + "/test.pkl",'wb') as f:
  pickle.dump(test,f)


In [20]:
len(train),len(valid),len(test)

(1448877, 465, 1942)

In [21]:
del train,valid,test

### Vectorizer and Embedding

In [22]:
VOCAB_SIZE = 2e5
OOV_TOKEN = 1
START_TOKEN = 2
END_TOKEN = 3
MAX_LENGTH = 32

Using Fasttext vectors for english and hindi words.

In [13]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "/content/drive/MyDrive/MachineLearning/fasttext_en_cc"

!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz -P "/content/drive/MyDrive/MachineLearning/fasttext_hi"

In [23]:
!unzip -q "/content/drive/MyDrive/MachineLearning/fasttext_en_cc/crawl-300d-2M.vec.zip"
!gunzip -q "/content/drive/MyDrive/MachineLearning/fasttext_hi/cc.hi.300.vec.gz"

In [24]:
import io
from zipfile import ZipFile
import numpy as np

def load_vectors(fname,VOCAB_SIZE):
  fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
  n, d = map(int, fin.readline().split())
  data = {}
  i = 0
  for line in fin:
      tokens = line.rstrip().split(' ')
      word,vector = tokens[0].lower(), [float(t) for t in tokens[1:]]
      if word not in data:
        data[word] = [vector]
        i = i + 1
      else:
        data[word].append(vector)
      if (i == VOCAB_SIZE):
        break
  for word in data:
    data[word] = np.mean(data[word],axis = 0)
  return data

In [25]:
en_vectors = load_vectors('/content/crawl-300d-2M.vec',VOCAB_SIZE)
hi_vectors = load_vectors('/content/drive/MyDrive/MachineLearning/fasttext_hi/cc.hi.300.vec',VOCAB_SIZE)

In [26]:
en_vocab = list(en_vectors.keys())
hi_vocab = list(hi_vectors.keys())

en_vocab = ["","[UNK]","<START>","<END>"] + [word for word in en_vocab if word not in ["","[UNK]","<START>","<END>"]][:-4]
hi_vocab = ["","[UNK]","<START>","<END>"] + [word for word in hi_vocab if word not in ["","[UNK]","<START>","<END>"]][:-4]

preparing vectorizer layer

In [27]:
en_fasttext_vectorizer = layers.TextVectorization(standardize = None,
    output_mode='int',output_sequence_length = MAX_LENGTH)

hi_fasttext_vectorizer = layers.TextVectorization(standardize = None,
    output_mode='int',output_sequence_length = MAX_LENGTH)


In [28]:
en_fasttext_vectorizer.set_vocabulary(np.array(en_vocab))
hi_fasttext_vectorizer.set_vocabulary(np.array(hi_vocab))


In [29]:
import pickle
pickle.dump({'config': en_fasttext_vectorizer.get_config(),
             'weights': en_fasttext_vectorizer.get_weights()}
            , open("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/EN_Fasttext_Vectorizer.pkl", "wb"))

import pickle
pickle.dump({'config': hi_fasttext_vectorizer.get_config(),
             'weights': hi_fasttext_vectorizer.get_weights()}
            , open("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/HI_Fasttext_Vectorizer.pkl", "wb"))


preparing embedding layer for english and hindi text

In [30]:
def get_embedding_layer(vectorizer,embeddings):
  voc = vectorizer.get_vocabulary()
  word_index = dict(zip(voc, range(len(voc))))

  OOV_vector = embeddings.get(voc[-1])
  START_vector = embeddings.get(voc[-2])
  END_vector = embeddings.get(voc[-3])

  num_tokens = len(voc) + 2 
  embedding_dim = 300
  hits = 0
  misses = 0
  missed_words = []
  embedding_matrix = np.zeros((num_tokens, embedding_dim))
  for word, i in word_index.items():
      embedding_vector = embeddings.get(word)
      if embedding_vector is not None:
          embedding_matrix[i] = embedding_vector
          hits += 1
      else:
          missed_words.append(word)
          misses += 1
  print("Converted %d words (%d misses)" % (hits, misses))

  embedding_matrix[OOV_TOKEN] = OOV_vector
  embedding_matrix[START_TOKEN] = START_vector
  embedding_matrix[END_TOKEN] = END_vector


  embedding_layer = layers.Embedding(
      num_tokens,
      embedding_dim,
      embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
      trainable=False,
      name = 'embedding_layer_300'
  )

  return embedding_layer



In [31]:
en_embedding_layer = get_embedding_layer(en_fasttext_vectorizer,en_vectors)

Converted 199996 words (4 misses)


In [40]:
en_embedding_layer = get_embedding_layer(en_fasttext_vectorizer,en_vectors)

import pickle
pickle.dump({'config': en_embedding_layer.get_config(),
             'weights': en_embedding_layer.get_weights()}
            , open("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/en_fasttext_embedding_layer.pkl", "wb"))
   

Converted 199996 words (4 misses)


In [41]:
hi_embedding_layer = get_embedding_layer(hi_fasttext_vectorizer,hi_vectors)

import pickle
pickle.dump({'config': hi_embedding_layer.get_config(),
             'weights': hi_embedding_layer.get_weights()}
            , open("/content/drive/MyDrive/MachineLearning/NMT_English_to_Hindi/hi_fasttext_embedding_layer.pkl", "wb"))



Converted 199996 words (4 misses)
