In [1]:
# Shrirang Alias Samarth Patil
# 19BAI10079

# Libraries
import numpy as np
import pandas as pd
import os,unicodedata, re, io, time, gc, warnings
warnings.filterwarnings("ignore")

from tabulate import tabulate

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

In [2]:
# Dataset of Marathi to English
data = '../input/marathi-english-sentence-pairs/mar.txt'
df = pd.read_csv(data, sep='\t',header=None, names=['Eng','Mar','Trn'])

#Dropping the Trn Column
df.drop('Trn',axis=1, inplace=True)

#Shape
print("Total Records: ", df.shape[0])

Total Records:  39961


In [3]:
#PreProcessing Function
def preprocess(w):
    w = w.lower().strip()
    w = re.sub(r"([?.!,¿।])", r" \1 ", w)
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w

#Applying PreProcess Function to a single sentence
x = np.random.randint(1,df.shape[0])
print("English: ", preprocess(df.Eng[x]))
print("Marathi: ", preprocess(df.Mar[x]))

#applying the preprocess function
df['Eng'] = df['Eng'].apply(lambda x: preprocess(x))
df['Mar'] = df['Mar'].apply(lambda x: preprocess(x))

#Inspect
df.head()

English:  <start> he has 12 sons . <end>
Marathi:  <start> त्याला १२ मुलं आहेत . <end>


Unnamed: 0,Eng,Mar
0,<start> go . <end>,<start> जा . <end>
1,<start> run ! <end>,<start> पळ ! <end>
2,<start> run ! <end>,<start> धाव ! <end>
3,<start> run ! <end>,<start> पळा ! <end>
4,<start> run ! <end>,<start> धावा ! <end>


In [4]:
#Tokenize Function
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')

    return tensor, lang_tokenizer


'''Tokenize Column Data'''
# Input = English || # Output = Marathi
input_tensor, inp_lang = tokenize(df['Eng'])
target_tensor, targ_lang = tokenize(df['Mar'])

In [5]:
# To train faster, we can limit the size of the dataset to 100 sentences 
num_examples = 100

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

# Creating training and validation sets using an 90-10 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1)

# Show length
tb_data = [["Eng Train Tensor", len(input_tensor_train)], ["Mar Train Tensor",len(target_tensor_train)], 
           ["Eng Val Tensor", len(input_tensor_val)], ["Mar Val Tensor",len(target_tensor_val)]] 

print(tabulate(tb_data, headers=['','Lengths']))

                    Lengths
----------------  ---------
Eng Train Tensor      35964
Mar Train Tensor      35964
Eng Val Tensor         3997
Mar Val Tensor         3997


In [6]:
#Create TensorFlow Dataset

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 128
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

#Example Dataset
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

2021-12-18 13:51:42.366183: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-18 13:51:42.454881: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-18 13:51:42.455659: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-18 13:51:42.458125: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

(TensorShape([128, 44]), TensorShape([128, 40]))

In [7]:
#Encoder Class

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [8]:
#Encoding 
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Sample Input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

2021-12-18 13:51:45.833958: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Encoder output shape: (batch size, sequence length, units) (128, 44, 1024)
Encoder Hidden state shape: (batch size, units) (128, 1024)


In [9]:
# Bahdanau Attention Class

class BahdanauAttention(tf.keras.layers.Layer):
    
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [10]:
# Attention
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (128, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (128, 44, 1)


In [11]:
# Decoder Class

class Decoder(tf.keras.Model):
    
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [12]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (128, 13890)


In [13]:
# Optimizer and Loss function
optimizer = tf.keras.optimizers.Adamax()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [14]:
#Check Points
checkpoint_dir = './output/'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,encoder=encoder,decoder=decoder)

In [15]:
# Train Model Function

@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [16]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

2021-12-18 13:52:21.994576: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1 Batch 0 Loss 1.5855
Epoch 1 Batch 100 Loss 0.9089
Epoch 1 Batch 200 Loss 0.8471
Epoch 1 Loss 0.9318
Time taken for 1 epoch 170.23243737220764 sec

Epoch 2 Batch 0 Loss 0.8668
Epoch 2 Batch 100 Loss 0.8686
Epoch 2 Batch 200 Loss 0.7680
Epoch 2 Loss 0.8117
Time taken for 1 epoch 128.59540605545044 sec

Epoch 3 Batch 0 Loss 0.7535
Epoch 3 Batch 100 Loss 0.7708
Epoch 3 Batch 200 Loss 0.7652
Epoch 3 Loss 0.7673
Time taken for 1 epoch 127.95656251907349 sec

Epoch 4 Batch 0 Loss 0.7168
Epoch 4 Batch 100 Loss 0.6768
Epoch 4 Batch 200 Loss 0.6776
Epoch 4 Loss 0.7217
Time taken for 1 epoch 128.73822689056396 sec

Epoch 5 Batch 0 Loss 0.6727
Epoch 5 Batch 100 Loss 0.6510
Epoch 5 Batch 200 Loss 0.6523
Epoch 5 Loss 0.6850
Time taken for 1 epoch 127.95248126983643 sec

Epoch 6 Batch 0 Loss 0.6585
Epoch 6 Batch 100 Loss 0.6656
Epoch 6 Batch 200 Loss 0.6497
Epoch 6 Loss 0.6564
Time taken for 1 epoch 128.68668484687805 sec

Epoch 7 Batch 0 Loss 0.6778
Epoch 7 Batch 100 Loss 0.6231
Epoch 7 Batc

In [17]:
gc.collect()

21

In [18]:
# Evaluate Function

def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [19]:
#Function to Translate
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [20]:
#Translate
translate("this is book")

Input: <start> this is book <end>
Predicted translation: हे पुस्तक आहे . <end> 
