In [None]:
# Colab-only: mount Google Drive at /content/drive; the dataset path used
# further down ('drive/MyDrive/...') points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1. Load dataset and Tokenize

In [None]:
# Pinned install of TensorFlow Addons; `tfa.seq2seq` is used by the decoder below.
# NOTE(review): the import cell output warns that this tfa version does not
# officially support the installed TensorFlow 2.6.0.
!pip install tensorflow-addons==0.11.2

Collecting tensorflow-addons==0.11.2
  Downloading tensorflow_addons-0.11.2-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 26.0 MB/s eta 0:00:01[K     |▋                               | 20 kB 29.6 MB/s eta 0:00:01[K     |█                               | 30 kB 23.5 MB/s eta 0:00:01[K     |█▏                              | 40 kB 19.0 MB/s eta 0:00:01[K     |█▌                              | 51 kB 14.9 MB/s eta 0:00:01[K     |█▉                              | 61 kB 13.3 MB/s eta 0:00:01[K     |██                              | 71 kB 12.7 MB/s eta 0:00:01[K     |██▍                             | 81 kB 13.7 MB/s eta 0:00:01[K     |██▊                             | 92 kB 14.5 MB/s eta 0:00:01[K     |███                             | 102 kB 11.4 MB/s eta 0:00:01[K     |███▎                            | 112 kB 11.4 MB/s eta 0:00:01[K     |███▋                            | 122 kB 11.4 MB/s eta 0:00:01[K     |███▉    

In [None]:
from pickle import load
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow_addons as tfa
import time
import numpy as np
from nltk.translate.bleu_score import corpus_bleu

 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [None]:
# load a clean dataset
def load_clean_sentences(filename):
    """Load a pickled object from ``filename`` and return it.

    The file is opened in binary mode inside a ``with`` block so the handle is
    closed as soon as unpickling finishes (or fails).

    Args:
        filename: path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.

    NOTE: unpickling can execute arbitrary code. Only load files that you
    produced yourself or otherwise trust.
    """
    with open(filename, 'rb') as handle:
        return load(handle)

# Directory (relative to the working directory) that holds the pickled data;
# it lives inside the Google Drive mount.
loc = 'drive/MyDrive/nmt_test/'

# load datasets
# Going by the file names: `dataset` is the full set of sentence pairs,
# `train` / `test` are its two splits.
dataset = load_clean_sentences(loc + 'english-nepali-both.pkl')
train = load_clean_sentences(loc + 'english-nepali-train.pkl')
test = load_clean_sentences(loc + 'english-nepali-test.pkl')

In [None]:
def create_dataset(dataset):
    """Turn sentence pairs into two parallel lists of marker-wrapped tokens.

    Every item of ``dataset`` is indexed as ``item[0]`` and ``item[1]``. Each
    of the two strings is split on single space characters and surrounded by
    the ``<start>`` and ``<end>`` markers.

    Returns:
        ``(en, ne)`` where ``en`` holds the token lists built from ``item[0]``
        and ``ne`` those built from ``item[1]``, in input order.
    """
    def wrap(sentence):
        # Split on a literal ' ' (not on arbitrary whitespace), then add markers.
        return ['<start>', *sentence.split(' '), '<end>']

    en, ne = [], []
    for pair in dataset:
        en.append(wrap(pair[0]))
        ne.append(wrap(pair[1]))
    return en, ne

In [None]:
def tokenize(lang):
    """Fit a Keras ``Tokenizer`` on ``lang`` and return the fitted tokenizer.

    ``lang`` is the list of sentences of one language. The tokenizer is built
    with an empty ``filters`` string. Only the tokenizer is returned; turning
    sentences into padded id arrays is left to the caller.
    """
    tokenizer_cls = tf.keras.preprocessing.text.Tokenizer
    fitted = tokenizer_cls(filters='')
    fitted.fit_on_texts(lang)
    return fitted

In [None]:
def tf_max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    ``tensor`` is any iterable of sized items. As with the built-in ``max``,
    an empty iterable raises ``ValueError``.
    """
    return max(map(len, tensor))

In [None]:
# tokenize
# create_dataset returns (tokens of pair element 0, tokens of pair element 1);
# element 0 is treated as the target language and element 1 as the input language.
# Both tokenizers are fitted on the full `dataset`, not only on the training split.
target_lang, input_lang = create_dataset(dataset)
inp_lang_tokenizer = tokenize(input_lang)
targ_lang_tokenizer = tokenize(target_lang)

In [None]:
# load datasets
def load_dataset(dataset):
    """Encode sentence pairs as post-padded integer-id arrays.

    The pairs are tokenised with ``create_dataset`` and mapped to ids with the
    module-level, already fitted ``inp_lang_tokenizer`` / ``targ_lang_tokenizer``.

    Args:
        dataset: sentence pairs in the form accepted by ``create_dataset``
            (element 0 is the target sentence, element 1 the input sentence).

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)``.
        Each tensor is padded with zeros at the end of every row up to the
        longest sequence in ``dataset``.
    """
    # Encode the pairs that were passed in, not a hard-coded global split, so
    # the function works for any split it is called with.
    targ_lang, inp_lang = create_dataset(dataset)

    input_ids = inp_lang_tokenizer.texts_to_sequences(inp_lang)
    target_ids = targ_lang_tokenizer.texts_to_sequences(targ_lang)

    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_ids, padding='post')
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [None]:
# Encode the training split. The last two return values are the module-level
# tokenizers, bound here to the shorter names `inp_lang` / `targ_lang` used below.
input_tensor_train, target_tensor_train, inp_lang, targ_lang = load_dataset(train)
# The arrays are padded, so the longest row length is the padded sequence length of each side.
max_length_targ, max_length_inp = tf_max_length(target_tensor_train), tf_max_length(input_tensor_train)

### 2. Train and Test Split

In [None]:
# No validation split is carved out here (the train_test_split call below is
# disabled); the pre-made train pickle is used as-is.
# input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
# Report how many encoded training examples there are on each side.
print("Length")
print("input_tensor_train = {}".format(len(input_tensor_train)))
print("target_tensor_train = {}".format(len(target_tensor_train)))

Length
input_tensor_train = 3000
target_tensor_train = 3000


### 3. Index to Word Mapping

In [None]:
def convert(lang, tensor):
    """Print one ``id ----> word`` line for every non-zero id in ``tensor``.

    ``lang`` must expose an ``index_word`` mapping from id to word. Ids equal
    to 0 are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print ("%d ----> %s" % (token_id, lang.index_word[token_id]))
    
# Sanity check: show the id -> word mapping of the first training pair on both sides.
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
3226 ----> सहरमा
98 ----> निजी
5698 ----> वाहन
42 ----> प्रयोग
7 ----> गर्न
517 ----> प्रतिबन्ध
1360 ----> लगाइएको
12 ----> थियो।
2 ----> <end>

Target Language; index to word mapping
2 ----> <start>
120 ----> private
2894 ----> vehicle
98 ----> use
16 ----> was
2895 ----> banned
6 ----> in
1 ----> the
771 ----> city
3 ----> <end>


### 4. Some important parameters

In [None]:
# Shuffle buffer as large as the training set (used when building train_dataset).
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64

# Some important parameters
# Vocabulary sizes are the number of known words plus one; presumably the
# extra slot is for id 0, which the padded arrays use as filler.
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

embedding_dim = 128
units = 256
# Number of complete batches in one pass over the training data.
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

print("Vocabulary Sizes")
print("vocab_inp_size = {}".format(vocab_inp_size))
print("vocab_tar_size = {}".format(vocab_tar_size))

Vocabulary Sizes
vocab_inp_size = 10797
vocab_tar_size = 6882


In [None]:
# Build the (input, target) pipeline: shuffle with a buffer of BUFFER_SIZE
# elements, then group into batches of BATCH_SIZE.
train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True discards the final short batch, so every batch has exactly
# BATCH_SIZE rows (the encoder/decoder below are built with a fixed batch size).
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

### 5. Encoder

In [None]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by one LSTM that returns every step's output and its final states."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the embedding and LSTM layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: size of each embedding vector.
            enc_units: number of LSTM units.
            batch_sz: batch size used by ``initialize_hidden_state``.
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # The LSTM returns the whole output sequence plus its last h and c states.
        lstm_options = dict(return_sequences=True,
                            return_state=True,
                            recurrent_initializer='glorot_uniform')
        self.lstm_layer = tf.keras.layers.LSTM(self.enc_units, **lstm_options)

    def call(self, x, hidden):
        """Embed ``x`` and run the LSTM starting from ``hidden``.

        Returns:
            ``(output, h, c)`` exactly as produced by the LSTM layer.
        """
        embedded = self.embedding(x)
        sequence_output, state_h, state_c = self.lstm_layer(embedded, initial_state=hidden)
        return sequence_output, state_h, state_c

    def initialize_hidden_state(self):
        """Return ``[h0, c0]``: two zero tensors of shape ``(batch_sz, enc_units)``."""
        state_shape = (self.batch_sz, self.enc_units)
        return [tf.zeros(state_shape), tf.zeros(state_shape)]

### 6. Decoder

In [None]:
class Decoder(tf.keras.Model):
    """LSTM-cell decoder with Luong attention, assembled from tfa.seq2seq parts.

    The class reads the module-level globals ``max_length_inp`` (in ``__init__``)
    and ``max_length_targ`` (in ``call``), so both must be defined before the
    decoder is constructed / called.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the layers and wire them together.

        The attributes are created in dependency order: the attention mechanism
        is needed by the wrapped cell, and the wrapped cell, the sampler and the
        output layer are needed by the BasicDecoder.
        """
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        # self.attention_type = attention_type

        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # Final Dense layer producing one logit per vocabulary entry
        # (no activation is set here; the loss is computed with from_logits=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)

        # Sampler used during training
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        # Create attention mechanism with memory = None; the memory is supplied
        # later through attention_mechanism.setup_memory(...). The sequence-length
        # argument is a list of batch_sz copies of the global max_length_inp.
        self.attention_mechanism = self.build_attention_mechanism(self.dec_units, None, self.batch_sz*[max_length_inp])

        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell(batch_sz)

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

    def build_rnn_cell(self, batch_sz):
        """Return the LSTM cell wrapped in an AttentionWrapper.

        ``batch_sz`` is accepted but not used by this method.
        """
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                      self.attention_mechanism, attention_layer_size=self.dec_units)
        return rnn_cell
    
    def build_attention_mechanism(self, dec_units, memory, memory_sequence_length):
        """Return a LuongAttention mechanism built from the given arguments."""
        # ------------- #
        # The attention type is fixed to Luong; there is no parameter to select another one.
        # dec_units: final dimension of attention outputs 
        # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
        # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)
        return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

    def build_initial_state(self, batch_sz, encoder_state, Dtype):
        """Return the wrapped cell's initial state with its cell state replaced by ``encoder_state``."""
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state
    
    def call(self, inputs, initial_state):
        """Embed ``inputs`` and run the BasicDecoder from ``initial_state``.

        Returns only the first of the three values produced by the decoder.
        """
        x = self.embedding(inputs)
        # Every sequence in the batch is given the same length: the global
        # max_length_targ minus one (the caller passes targets with one column removed).
        outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_targ-1])
        return outputs


### 7. Define the optimizer and the loss function

In [None]:
# Adam optimizer with the library's default settings (no arguments are passed).
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
    """Sparse categorical cross-entropy on logits with id-0 positions zeroed out.

    Args:
        real: integer target ids, shape (BATCH_SIZE, max_length_output).
        pred: logits, shape (BATCH_SIZE, max_length_output, tar_vocab_size).

    Returns:
        A scalar: the mean over *all* positions of the per-position loss, where
        positions whose target id is 0 contribute 0 (they still count in the
        denominator of the mean).
    """
    per_position_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true=real, y_pred=pred)

    # 1.0 where the target is a real token, 0.0 where it is id 0.
    is_zero_id = tf.math.equal(real, 0)
    keep = tf.cast(tf.logical_not(is_zero_id), dtype=per_position_loss.dtype)

    return tf.reduce_mean(keep * per_position_loss)

In [None]:
# Instantiate both models with the same embedding size, unit count and batch size.
# Decoder.__init__ reads the global max_length_inp, so this cell must run after it is defined.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [None]:
import os
# Checkpoints are written to a directory relative to the working directory,
# with files prefixed "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer together with both models so all three are saved and restored as a unit.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

### 8. Train

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one optimisation step on a batch and return the batch loss.

    Uses the module-level ``encoder``, ``decoder``, ``optimizer``,
    ``loss_function`` and ``BATCH_SIZE``.

    Args:
        inp: batch of input id sequences.
        targ: batch of target id sequences.
        enc_hidden: initial ``[h, c]`` state for the encoder.
    """
    with tf.GradientTape() as tape:
        encoder_outputs, state_h, state_c = encoder(inp, enc_hidden)

        # The decoder is fed the target without its last column and is scored
        # against the target without its first column (the <start> marker),
        # i.e. at each step it has to predict the following token.
        decoder_inputs = targ[:, :-1]
        expected = targ[:, 1:]

        # Give the attention mechanism the encoder outputs to attend over.
        decoder.attention_mechanism.setup_memory(encoder_outputs)

        # Start the decoder from the encoder's final [h, c] state.
        initial_state = decoder.build_initial_state(BATCH_SIZE, [state_h, state_c], tf.float32)
        logits = decoder(decoder_inputs, initial_state).rnn_output
        loss = loss_function(expected, logits)

    trainable = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    return loss

In [None]:
# Number of passes over the training data.
EPOCHS = 500

for epoch in range(EPOCHS):
    start = time.time()
    
    # Fresh all-zero encoder state at the start of every epoch; the same state
    # object is passed to every batch of the epoch.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # print(enc_hidden[0].shape, enc_hidden[1].shape)

    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        # Progress line every 100 batches (batch 0 always qualifies).
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))

    # saving (checkpoint) the model every 2 epochs
    # NOTE(review): the epoch summary and timing prints below are inside this
    # branch, so they also appear only every second epoch -- confirm this is intended.
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
        
        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 5.0079
Epoch 2 Batch 0 Loss 3.3390
Epoch 2 Loss 3.3860
Time taken for 1 epoch 11.396674156188965 sec

Epoch 3 Batch 0 Loss 3.1297
Epoch 4 Batch 0 Loss 3.1831
Epoch 4 Loss 3.1977
Time taken for 1 epoch 11.379096269607544 sec

Epoch 5 Batch 0 Loss 3.2662
Epoch 6 Batch 0 Loss 2.9294
Epoch 6 Loss 2.9767
Time taken for 1 epoch 11.310560703277588 sec

Epoch 7 Batch 0 Loss 2.8152
Epoch 8 Batch 0 Loss 2.7645
Epoch 8 Loss 2.7614
Time taken for 1 epoch 11.346597671508789 sec

Epoch 9 Batch 0 Loss 2.7628
Epoch 10 Batch 0 Loss 2.4254
Epoch 10 Loss 2.5485
Time taken for 1 epoch 11.393122434616089 sec

Epoch 11 Batch 0 Loss 2.4091
Epoch 12 Batch 0 Loss 2.1541
Epoch 12 Loss 2.3734
Time taken for 1 epoch 11.317896842956543 sec

Epoch 13 Batch 0 Loss 1.9386
Epoch 14 Batch 0 Loss 2.0259
Epoch 14 Loss 2.2097
Time taken for 1 epoch 11.412219047546387 sec

Epoch 15 Batch 0 Loss 1.9235
Epoch 16 Batch 0 Loss 2.1218
Epoch 16 Loss 2.0551
Time taken for 1 epoch 11.370058298110962 sec

Epoch

### 9. Evaluate Sentence

In [None]:
import unicodedata
import re
import string

def unicode_to_ascii(s):
    """Return ``s`` in NFD form with every non-spacing mark (category ``Mn``) removed.

    Despite the name, other non-ASCII characters are kept. Note that this
    strips accents from Latin text but also removes Devanagari signs that are
    classified as ``Mn`` (for example the vowel sign U+0941).
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """Normalise a sentence: lower-case, trim, drop ``Mn`` marks, collapse spaces/quotes.

    Steps, in order: lower-case and strip ``w``; pass it through
    ``unicode_to_ascii``; replace every run of space and double-quote
    characters with a single space; strip surrounding whitespace again.
    """
    normalized = unicode_to_ascii(w.lower().strip())
    # The character class matches both ' ' and '"', so quotes are replaced too.
    normalized = re.sub(r'[" "]+', " ", normalized)
    return normalized.strip()

In [None]:
def evaluate_sentence(sentence):
    """Greedy-decode a translation for one space-separated input sentence.

    The sentence is encoded the same way as the training inputs: split on
    single spaces, wrapped in ``<start>`` / ``<end>`` markers (added only when
    not already present) and mapped to ids by the fitted input tokenizer.
    Words the tokenizer does not know are dropped by ``texts_to_sequences``
    instead of raising ``KeyError``.

    Uses the module-level ``encoder``, ``decoder``, ``inp_lang``, ``targ_lang``,
    ``units`` and ``max_length_inp``.

    Args:
        sentence: raw input-language sentence; words separated by single spaces.

    Returns:
        A numpy integer array with one row holding the predicted target ids.
    """
    # --- encode the input exactly like create_dataset + load_dataset do ---
    tokens = sentence.split(' ')
    if tokens[:1] != ['<start>']:
        tokens.insert(0, '<start>')
    if tokens[-1:] != ['<end>']:
        tokens.append('<end>')
    inputs = inp_lang.texts_to_sequences([tokens])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                          maxlen=max_length_inp,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]

    # --- run the encoder from an all-zero [h, c] state ---
    enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size, units))]
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
    end_token = targ_lang.word_index['<end>']

    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Inference decoder: reuses the trained attention-wrapped cell and output
    # layer, but samples greedily instead of reading the ground-truth targets.
    # NOTE(review): no iteration cap is configured here, so decoding relies on
    # the model emitting <end>; consider bounding the number of steps.
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
    # Point the attention mechanism at this sentence's encoder outputs.
    decoder.attention_mechanism.setup_memory(enc_out)

    # Start decoding from the encoder's final state.
    decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)

    # BasicDecoder wraps only the rnn cell, so its step inputs must already be
    # embeddings. GreedyEmbeddingSampler performs that lookup itself; it only
    # needs the embedding weights, which are passed as the decoder's input.
    decoder_embedding_matrix = decoder.embedding.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)
    return outputs.sample_id.numpy()

def translate(sentence):
    """Translate ``sentence`` and print the predicted ids, the input and the decoded text.

    Nothing is returned; the three lines are written to standard output.
    """
    predicted_ids = evaluate_sentence(sentence)
    print(predicted_ids)

    # Map the id rows back to space-joined words with the target tokenizer.
    predicted_texts = targ_lang.sequences_to_texts(predicted_ids)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(predicted_texts))

In [None]:
# restoring the latest checkpoint in checkpoint_dir
# (loads the saved optimizer, encoder and decoder state tracked by `checkpoint`)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0734a76410>

In [None]:
# Smoke test: translate a single Nepali sentence with the restored model.
translate(u'वर्षमा कुल पैठारीमा औद्योगिक सामानहरुको वाहुल्यता रह्यो')

[[  1  65  65 105  26 453   3]]
Input: वर्षमा कुल पैठारीमा औद्योगिक सामानहरुको वाहुल्यता रह्यो
Predicted translation: ['the economic economic population has started <end>']


### 10. Test

In [None]:
# evaluate the skill of the model
def evaluate_model(raw_dataset):
    """Translate every pair in ``raw_dataset`` and print corpus BLEU-1 to BLEU-4.

    Each item is indexed as ``item[0]`` (reference translation) and ``item[1]``
    (source sentence). The first ten translations are printed for inspection.

    The ``<start>`` / ``<end>`` markers are removed from each hypothesis before
    scoring: they are decoding artefacts, not part of the translation, and the
    references do not contain them.

    Args:
        raw_dataset: iterable of (reference, source) sentence pairs.

    Returns:
        None; all results are printed.
    """
    markers = ('<start>', '<end>')
    actual, predicted = [], []

    for i, source in enumerate(raw_dataset):
        raw_target, raw_src = source[0], source[1]
        result = evaluate_sentence(raw_src)
        translation = targ_lang.sequences_to_texts(result)[0]
        hypothesis = [token for token in translation.split() if token not in markers]

        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, ' '.join(hypothesis)))

        # corpus_bleu expects a list of references per hypothesis.
        actual.append([raw_target.split()])
        predicted.append(hypothesis)

    # calculate BLEU score: cumulative n-gram scores with uniform weights over
    # the first n orders (so BLEU-3 uses 1/3 each rather than 0.3).
    bleu_weights = (
        (1.0, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (1.0 / 3, 1.0 / 3, 1.0 / 3, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    for order, weights in enumerate(bleu_weights, start=1):
        print('BLEU-%d: %f' % (order, corpus_bleu(actual, predicted, weights=weights)))

In [None]:
# Show element 1 (the source sentence) of the first training pair.
train[0][1]

'सहरमा निजी वाहन प्रयोग गर्न प्रतिबन्ध लगाइएको थियो।'

In [None]:
# test on some training sequences
# (evaluate_model iterates over every pair it is given and prints the first ten)
print('train')
evaluate_model(train)
# test on some test sequences
print('test')
evaluate_model(test)

train
src=[सहरमा निजी वाहन प्रयोग गर्न प्रतिबन्ध लगाइएको थियो।], target=[private vehicle use was banned in the city], predicted=[private vehicle use was banned in the export <end>]
src=[who ले मानिसहरूलाई नधोएको हातले आँखा नाक वा मुख छुनुबाट टाढा रहन सल्लाह दिन्छ।], target=[the who advises people to avoid touching the eyes nose or mouth with unwashed hands], predicted=[the who advises people who nose with the eyes nose and mouth with unwashed hands <end>]
src=[covid को crp स्तर र गम्भीरता र रोग पूर्वानुमानको सहसम्बन्ध पनि प्रस्तावित गरिएको छ।], target=[the correlation of crp level to the severity and prognosis of covid has also been proposed], predicted=[the correlation of crp level to the severity and prognosis of covid has been proposed <end>]
src=[यो बृद्धि दर विगत तीन दशकको बृद्धि दर भन्दा बढी छ।], target=[this growth rate is higher than that of the last three decades], predicted=[this growth rate is higher than that if the growth rate of growth in per increase is too <end>]
src=[प