In [2]:
import os
import string
import numpy as np
import pandas as pd
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
import logging
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
logging.getLogger('tensorflow').setLevel(logging.FATAL)
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import io
import time
import warnings
import sys

# Prepare the data
1. Convert the texts into a DataFrame with English and Hindi columns
2. Storing it Google-Drive 

In [3]:
# function to read raw text file
def read_text(filename):
        # open the file
        file = open(filename, mode='rt', encoding='utf-8')
        
        # read all text
        text = file.read()
        file.close()
        return text

In [4]:
# split a text into sentences
def to_lines(text):
      sents = text.strip().split('\n')
      sents = [i.split('\t') for i in sents]
      return sents

In [5]:
# mount the G-Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# read the files
en = read_text('/content/drive/MyDrive/kroop_data/train01.en') # path for english texts
en = to_lines(en) 
hd = read_text('/content/drive/MyDrive/kroop_data/train01.hi') # path for hindi texts
hd = to_lines(hd)

# covert the text files into DataFrame
hd_df = pd.DataFrame(hd, columns = ['hd'])
en_df = pd.DataFrame(en, columns = ['en'])
df = pd.concat([en_df, hd_df], axis = 1)

# save the DataFrame as csv 
df.to_csv('data.csv')
PATH = '/content/data.csv'

# Preprocess English and Hindi sentences

In [7]:
# converts unicode to ASCII
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

# preprocess english sentence
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.rstrip().strip()
    return w

# preprocess hindi sentence
def hindi_preprocess_sentence(w):
    w = w.rstrip().strip()
    return w

def max_length(tensor):
    return max(len(t) for t in tensor)

In [8]:
# clean the data
def create_dataset(path=PATH):
    lines=pd.read_csv(path,encoding='utf-8')
    lines=lines.dropna()
    en = []
    hd = []
    for i, j in zip(lines['en'], lines['hd']):
        en_1 = [preprocess_sentence(w) for w in i.split(' ')]
        en_1.append('<end>')
        en_1.insert(0, '<start>')
        hd_1 = [hindi_preprocess_sentence(w) for w in j.split(' ')]
        hd_1.append('<end>')
        hd_1.insert(0, '<start>')
        en.append(en_1)
        hd.append(hd_1)
    return hd, en

In [9]:
# get the maximum length tensor
def max_length(tensor):
    return max(len(t) for t in tensor)

# Tokenization of Data

In [10]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)
  tensor = lang_tokenizer.texts_to_sequences(lang)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')
  return tensor, lang_tokenizer

In [11]:
# load the dataset in the required format
def load_dataset(path=PATH):
    targ_lang, inp_lang = create_dataset(path)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [12]:
# use the above functions
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(PATH)
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

# Create Train and Test Data

In [13]:
# split the train and validation set
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

1912 1912 478 478


In [14]:
# check the tokenized words
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))
    
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
2 ----> <start>
25 ----> we
1080 ----> asserted
19 ----> our
5310 ----> supremacy
7 ----> in
1 ----> the
59 ----> world
29 ----> by
5311 ----> introducing
5312 ----> tejas
5313 ----> aircraft .
3 ----> <end>

Target Language; index to word mapping
1 ----> <start>
31 ----> हम
5307 ----> ‘तेजस’
584 ----> हवाई
5308 ----> जहाज
3 ----> के
74 ----> द्वारा
40 ----> आज
103 ----> दुनिया
3 ----> के
1163 ----> अंदर
73 ----> अपनी
2338 ----> अहमियत
1164 ----> पहुंचा
47 ----> रहे
21 ----> हैं।
2 ----> <end>


# Create Dataset

In [15]:
# define parameters
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 16
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 128
units = 256
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Encoder Decoder with Attention Model
Here we use a sequence to sequence model which has two parts – an encoder and a decoder. Both the parts have two different neural network models combined into one giant network. The input is put through an encoder model which gives us the encoder output. Here, each input words is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence. 

Encoder

In [16]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

Attention Mechanism

In [17]:
class Attention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(Attention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    hidden_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)
    return context_vector, attention_weights

Decoder

In [18]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = Attention(self.dec_units)

  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)
    return x, state, attention_weights

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

Optimizer

In [19]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

# get the mean loss
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)
  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  return tf.reduce_mean(loss_)

In [20]:
# define the checkpoint
checkpoint_dir = '/content/drive/MyDrive/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# Training

1. Pass input through encoder to get encoder output.
2. Then encoder output, encoder hidden state and the decoder input is passed to decoder.
3. Decoder returns predictions and decoder hidden state.
4. Decoder hidden state is then passed back to model.
5. Predictions are used to calculate loss.
6. Use teacher forcing (technique where the target word is passed as the next input) for the next input to the decoder.
7. Calculate gradients and apply it to optimizer for backpropogation.

In [21]:
# steps to be followed while training
def train_step(inp, targ, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
    # Teacher forcing
    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))      
  return batch_loss

In [22]:
# training
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.0557
Epoch 1 Batch 100 Loss 1.3809
Epoch 1 Loss 1.6452
Time taken for 1 epoch 338.72101354599 sec

Epoch 2 Batch 0 Loss 1.3608
Epoch 2 Batch 100 Loss 1.5045
Epoch 2 Loss 1.4912
Time taken for 1 epoch 262.8338351249695 sec

Epoch 3 Batch 0 Loss 1.6090
Epoch 3 Batch 100 Loss 1.4806
Epoch 3 Loss 1.4352
Time taken for 1 epoch 257.74615454673767 sec

Epoch 4 Batch 0 Loss 1.2369
Epoch 4 Batch 100 Loss 1.3371
Epoch 4 Loss 1.3864
Time taken for 1 epoch 266.54276061058044 sec

Epoch 5 Batch 0 Loss 1.1854
Epoch 5 Batch 100 Loss 1.5004
Epoch 5 Loss 1.3418
Time taken for 1 epoch 260.319144487381 sec

Epoch 6 Batch 0 Loss 1.3573
Epoch 6 Batch 100 Loss 1.1337
Epoch 6 Loss 1.2953
Time taken for 1 epoch 260.05254340171814 sec

Epoch 7 Batch 0 Loss 1.4602
Epoch 7 Batch 100 Loss 1.1731
Epoch 7 Loss 1.2474
Time taken for 1 epoch 260.34270095825195 sec

Epoch 8 Batch 0 Loss 1.0988
Epoch 8 Batch 100 Loss 0.9545
Epoch 8 Loss 1.1966
Time taken for 1 epoch 261.8207848072052 sec

Epoch 9

# Testing

In [24]:
# predict the translated sentence
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence

In [25]:
# to print the translated sentence
def translate(sentence):
    result, sentence = evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [26]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9eeb39cb10>

Examples for prediction

In [39]:
translate(u'prime minister has a scheme for women')

Input: prime minister has a scheme for women
Predicted translation: प्रधानमंत्री ने कहा कि भारत और एक बात है। <end> 


In [45]:
translate(u'When did India get independent')

Input: when did india get independent
Predicted translation: मैं भारत के लिए एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार 


In [48]:
translate(u'Prime Minister addressed a public gathering')

Input: prime minister addressed a public gathering
Predicted translation: प्रधानमंत्री ने कहा कि भारत के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने के लिए एक बात करने 


In [42]:
translate(u'India is a democratic country')

Input: india is a democratic country
Predicted translation: मैं भारत में भी एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार 


In [47]:
translate(u'India is a democratic country')

Input: india is a democratic country
Predicted translation: मैं भारत में भी एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार एक बार 
