In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import unidecode
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense,GRU,Bidirectional,Embedding
from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt

# <font color='red'>TRAINING</font>

In [40]:
DATA_PATH='data.csv'
CHECKPOINT_PATH='training_checkpoints'

MAX_WORDS=14
SAMPLES=50000
VOCAB=15000
BATCH_SIZE=128
EPOCH=10

## Load Training data

In [27]:
train=pd.read_csv(DATA_PATH)
train=train.drop("source",axis=1)
train=train.dropna() #drop NA rows
train.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [28]:
print("Total Samples: ",len(train))

Total Samples:  127605


## Count length of english and hindi sentences

In [29]:
train['english_sentence_length'] = train['english_sentence'].apply(lambda x: len(x.split()))
train['hindi_sentence_length'] = train['hindi_sentence'].apply(lambda x: len(x.split()))
train.head()

Unnamed: 0,english_sentence,hindi_sentence,english_sentence_length,hindi_sentence_length
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर...",12,14
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...,9,11
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।,10,9
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते,12,11
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।,9,8


## 👇1 ----> 3212 means it has 3212 english sentences of length=1

In [34]:
train['english_sentence_length'].value_counts().sort_index()[:MAX_WORDS]

1     3212
2     3730
3     3862
4     4981
5     6090
6     6912
7     7214
8     7184
9     6714
10    6132
11    5608
12    5047
13    4338
14    3863
Name: english_sentence_length, dtype: int64

In [35]:
x=train['english_sentence_length'].value_counts().sort_index()[:MAX_WORDS].sum()

## 👇Keep rows if length of english sentence >= 4

In [16]:
train=train.sort_values(by='english_sentence_length')
train=train.iloc[3212+3730+3862:x]
train.head()

Unnamed: 0,english_sentence,hindi_sentence,english_sentence_length,hindi_sentence_length
43345,love and pleasure.,और आनंद चाहिए |,3,4
104364,Arm chair position,आराम कुर्सी पोजीशन,3,3
43261,"In October 2010,","अक्टूबर २०१० की बात है,",3,5
84878,Jaishanker prasad((In individuality),जयशंकर प्रसाद (अभिव्यक्ति में),3,4
20752,"of deconstructing, redefining,","शुरू करनी होगी,",3,3


## 👇1 ----> 3595 means it has 3595 hindi sentences of length=1

In [36]:
train['hindi_sentence_length'].value_counts().sort_index()[:MAX_WORDS]

1     3595
2     3367
3     3123
4     3718
5     4595
6     5658
7     6184
8     6323
9     6374
10    6134
11    5509
12    5214
13    4673
14    4112
Name: hindi_sentence_length, dtype: int64

In [37]:
x=train['hindi_sentence_length'].value_counts().sort_index()[:MAX_WORDS].sum()

## 👇Keep rows if length of hindi sentence >= 4

In [38]:
train=train.sort_values(by='hindi_sentence_length')
train=train.iloc[3595+3367+3123:x]
train.head()

Unnamed: 0,english_sentence,hindi_sentence,english_sentence_length,hindi_sentence_length
115576,Dainik Jagaran Number 1.,दैनिक जागरण नम्बर १,4,4
13750,Sick or disabled,बीमार या फिर विकलांग,3,4
59202,It is written as follows:,वह अभिलेख निम्नलिखित है:,5,4
34770,The contribution of mahadevi verma,महादेवी वर्मा का योगदान,5,4
5164,integrating wireless networking,जीपीएस और जीएसएम को,3,4


In [39]:
print("Total samples: ",len(train))

Total samples:  58494


## Randomly sample and keep SAMPLES=50000 rows

In [41]:
train=train.sample(SAMPLES)
train.head()

Unnamed: 0,english_sentence,hindi_sentence,english_sentence_length,hindi_sentence_length
87326,mumbai team is going to Rangy trophy on behalf...,मुंबई क्रिकेट टीम रणजी ट्रॉफी में शहर का प्रति...,12,11
117649,"from the punishments of my cancer,","मेरी कैंसर के दंड से,",6,5
37737,I learned this firsthand with my next adventure.,यह मैंने खुद अपने अगले अनुभव से सीखा.,8,8
90347,It's a great exercise,यह एक महान प्रयोग है,4,5
119006,complexity in visual language,जटिलता होती है दृश्य भाषा की,4,6


## Add <start\>  and <end\> token

In [42]:
train['english_sentence'] = train['english_sentence'].apply(lambda x: '<start> '+str(' '.join(x.split()[:MAX_WORDS]))+' <end>')
train['hindi_sentence'] = train['hindi_sentence'].apply(lambda x: '<start> '+str(' '.join(x.split()[:MAX_WORDS]))+' <end>')
train.head()

Unnamed: 0,english_sentence,hindi_sentence,english_sentence_length,hindi_sentence_length
87326,<start> mumbai team is going to Rangy trophy o...,<start> मुंबई क्रिकेट टीम रणजी ट्रॉफी में शहर ...,12,11
117649,"<start> from the punishments of my cancer, <end>","<start> मेरी कैंसर के दंड से, <end>",6,5
37737,<start> I learned this firsthand with my next ...,<start> यह मैंने खुद अपने अगले अनुभव से सीखा. ...,8,8
90347,<start> It's a great exercise <end>,<start> यह एक महान प्रयोग है <end>,4,5
119006,<start> complexity in visual language <end>,<start> जटिलता होती है दृश्य भाषा की <end>,4,6


In [43]:
MAX_WORDS=MAX_WORDS+2 #add 2 for <start>,<end> token

## Tokenize and pad english sentence

In [44]:
tokenizerE = Tokenizer(num_words=VOCAB, 
                       oov_token='<OOV>', 
                       lower=True, 
                       filters='#$!"%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

tokenizerE.fit_on_texts(train['english_sentence'])

eng_inp = tokenizerE.texts_to_sequences(train['english_sentence'])

eng_inp = pad_sequences(eng_inp,
                        maxlen=MAX_WORDS,
                        truncating='post',
                        padding='post',
                        dtype='float32')

## Tokenize and pad hindi sentence

In [45]:
tokenizerH = Tokenizer(num_words=VOCAB,
                       oov_token='<OOV>',
                       lower=True,
                       filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

tokenizerH.fit_on_texts(train['hindi_sentence'])

hin_inp=tokenizerH.texts_to_sequences(train['hindi_sentence'])

hin_inp=pad_sequences(hin_inp,
                      maxlen=MAX_WORDS,
                      truncating='post',
                      padding='post',
                      dtype='float32')

## Creating reverse hindi dictionary

In [46]:
rev_hin_dict = dict(map(reversed, tokenizerH.word_index.items()))

## Creating datatset

In [47]:
x_train, x_test, y_train, y_test = train_test_split(eng_inp, hin_inp, test_size=0.1, random_state=42)

In [48]:
dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(BATCH_SIZE,drop_remainder=True)
dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(BATCH_SIZE,drop_remainder=True)

## Encoder

In [49]:
class Encoder(tf.keras.Model):    
    def __init__(self,vocab,batch_size):
        super(Encoder,self).__init__()  
        self.vocab=vocab
        self.batch_size=batch_size
        self.embed=Embedding(self.vocab,256)
        self.gru=Bidirectional(GRU(256,return_state=True,return_sequences=True,
                                   recurrent_initializer='glorot_uniform',dropout=0.5))
        self.gru1=Bidirectional(GRU(256,return_state=True,return_sequences=True,
                                   recurrent_initializer='glorot_uniform',dropout=0.5))
    
    def call(self,encoder_inp,hidden):
        encoder_inp=self.embed(encoder_inp)       
        _,state_htmp,state_ctmp=self.gru(encoder_inp,initial_state=hidden)
        encoder_out,state_h,state_c=self.gru1(encoder_inp,initial_state=[state_htmp,state_ctmp])
        return encoder_out,tf.concat([state_h,state_c],axis=1)
    
    def initialise_hidden_unit(self):
        return [tf.zeros((self.batch_size,256)) for i in range(2)]

In [50]:
encoder=Encoder(VOCAB,BATCH_SIZE)

## Decoder

In [51]:
class Decoder(tf.keras.Model):   
    def __init__(self,vocab):
        super(Decoder,self).__init__()
        self.vocab=vocab
        self.embed=Embedding(self.vocab,256)
        self.dense=Dense(512)
        self.dense1=Dense(512)
        self.dense2=Dense(1)
        self.gru=GRU(512,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform',
                     dropout=0.5)
        self.dense3=Dense(self.vocab)
        
    def call(self,decoder_inp,encoder_out,carry):       
        decoder_inp=self.embed(decoder_inp)
        carry=tf.expand_dims(carry,1)
#----------------------------------------------------------------
        #attention
        score=self.dense2(tf.math.tanh(self.dense1(encoder_out)+self.dense(carry)))
        attention_weights=tf.nn.softmax(score,axis=1)
        context_vector=tf.math.reduce_sum(attention_weights*encoder_out,axis=1,keepdims=True)
        merged_vector=tf.concat([context_vector,decoder_inp],axis=-1)
#-----------------------------------------------------------------        
        decoder_out,decoder_state=self.gru(merged_vector)
        
        decoder_out=tf.reshape(decoder_out,(-1,decoder_out.shape[2]))
        decoder_out=self.dense3(decoder_out)
        return decoder_out,decoder_state,attention_weights

In [52]:
decoder=Decoder(VOCAB)

## Optimizer and loss function

In [53]:
optimizer=tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

## Create checkpoint

In [54]:
import os
checkpoint_dir = CHECKPOINT_PATH
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

## Training function

In [55]:
@tf.function
def train(inp,out,hidden,MAX_WORDS):
    loss=0
    with tf.GradientTape() as tape:
        eo,hidden=encoder(inp,hidden)
        h=hidden       
        hi=tf.expand_dims([tokenizerH.word_index['<start>']] * BATCH_SIZE, 1)
        hi=tf.cast(hi,'float32')        
        for i in range (1,MAX_WORDS):           
            do,ds,_=decoder(hi,eo,h) 
            loss+=loss_function(out[:, i], do)
            hi=tf.expand_dims(out[:, i], 1) 
            hi=tf.cast(hi,'float32')    
    variables = encoder.trainable_variables+decoder.trainable_variables
    gradients=tape.gradient(loss, variables) 
    optimizer.apply_gradients(zip(gradients, variables))
    batch_loss = loss / MAX_WORDS
    return batch_loss

## Testing function

In [56]:
@tf.function
def test(inp_,out_,hidden_,MAX_WORDS):
    loss_=0
    eo_,hidden_=encoder(inp_,hidden_)
    h_=hidden_       
    hi_=tf.expand_dims([tokenizerH.word_index['<start>']] * BATCH_SIZE, 1)
    hi_=tf.cast(hi_,'float32')        
    for i in range (1,MAX_WORDS):           
        do_,ds_,_=decoder(hi_,eo_,h_) 
        loss_+=loss_function(out_[:, i], do_)
        hi_=tf.expand_dims(out_[:, i], 1) 
        hi_=tf.cast(hi_,'float32')    
    batch_loss_ = loss_ / MAX_WORDS
    return batch_loss_

## Call the training and testing function

In [58]:
import time
for epoch in range(10):   
    print("Starting epoch {}".format(epoch+1))
    start=time.time()  
    
    hidden = encoder.initialise_hidden_unit()
    total_loss = 0  
    
    hidden_ = encoder.initialise_hidden_unit()
    total_loss_ = 0 
#---------------------------------------------------------------------------------------------- 
    for x_batch,y_batch in dataset_train: 
        batch_loss = train(x_batch, y_batch, hidden,MAX_WORDS) #calling the training loop
        total_loss += batch_loss
#----------------------------------------------------------------------------------------------
    for x_batch,y_batch in dataset_test: 
        batch_loss_ = test(x_batch, y_batch, hidden_,MAX_WORDS) #calling the testing loop
        total_loss_ += batch_loss_
#----------------------------------------------------------------------------------------------
        
    end=time.time()
    if (epoch + 1) % 10 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
    print('Train Loss = {:.4f}\tVal Loss = {:.4f}\tTime taken = {:.2f} secs'.format(total_loss/(SAMPLES*0.9//BATCH_SIZE),
                                                                                    total_loss_/(SAMPLES*0.1//BATCH_SIZE),
                                                                                    (end-start))) 

Starting epoch 1
Train Loss = 4.0092	Val Loss = 3.7056	Time taken = 105.73 secs
Starting epoch 2
Train Loss = 3.5157	Val Loss = 3.4562	Time taken = 92.80 secs
Starting epoch 3
Train Loss = 3.2791	Val Loss = 3.2927	Time taken = 92.61 secs
Starting epoch 4
Train Loss = 3.0564	Val Loss = 3.1344	Time taken = 91.73 secs
Starting epoch 5
Train Loss = 2.8119	Val Loss = 2.9908	Time taken = 92.05 secs
Starting epoch 6
Train Loss = 2.5528	Val Loss = 2.8794	Time taken = 92.03 secs
Starting epoch 7
Train Loss = 2.3121	Val Loss = 2.8070	Time taken = 92.22 secs
Starting epoch 8
Train Loss = 2.0998	Val Loss = 2.7641	Time taken = 91.99 secs
Starting epoch 9
Train Loss = 1.9153	Val Loss = 2.7490	Time taken = 91.97 secs
Starting epoch 10
Train Loss = 1.7522	Val Loss = 2.7428	Time taken = 91.91 secs


# <font color='red'>TESTING</f>

## Restore the checkpoint

In [None]:
#checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

## Evaluate function

In [59]:
def evaluate(test,plot=False):
    
    test=' '.join(test.split()[:MAX_WORDS-2])
    test="<start> "+test+" <end>"
    test_ori=test
    
    test=tokenizerE.texts_to_sequences([test])
    test=pad_sequences(test,
                       maxlen=MAX_WORDS,
                       truncating='post',
                       padding='post',
                       dtype='float32')

    hidd=[tf.zeros((1,256)) for i in range(2)]
    testenou,testenhi=encoder(test,hidd)
    testhi=tf.expand_dims([tokenizerH.word_index['<start>']] * 1, 1)
    testhi=tf.cast(testhi,'float32')
    pred=''

    apn_att_wt=np.zeros((MAX_WORDS-1,MAX_WORDS))
    for i in range (1,MAX_WORDS):           
        testdo,testds,att_wt=decoder(testhi,testenou,testenhi)    
        apn_att_wt[i-1]=tf.reshape(att_wt[0],(MAX_WORDS))
        m=tf.math.argmax(testdo[0])

        pred+=rev_hin_dict[m.numpy()]+' '
        testhi=tf.expand_dims([m.numpy()] * 1, 1)
        testhi=tf.cast(testhi,'float32')
        testenhi=testds

    pre=''
    for word in pred.split():
        if word=='<end>':
            break
        else:
            pre+=word+' '

    if plot==True:       
        attention=np.reshape(apn_att_wt,(MAX_WORDS,MAX_WORDS-1))
        fig = plt.figure(figsize=(5,5))
        ax = fig.add_subplot(1,1,1)
        ax.matshow(attention, cmap='viridis')
        plt.show()         

    print(f"INPUT  =  {test_ori}\nHINDI  =  {pre}")

## Test

In [67]:
evaluate("In jail, Bhagat singh and his friends were on hunger strike for 64 days.", plot=False)

INPUT  =  <start> In jail, Bhagat singh and his friends were on hunger strike for 64 days. <end>
HINDI  =  जेल में भगत सिंह और बाकि साथियो ने ६४ दिनो तक भूख हद्ताल कि। 
