# CS6910 Assignment 3 - Sujay and Avyay

In [327]:
!pip install wandb
!pip install xtarfile
!pip install pyenchant



In [328]:
START_TOKEN="\t"
END_TOKEN="\n"

In [329]:
import os
import random
import time
import wandb
import re, string
import numpy as np
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from wordcloud import WordCloud, STOPWORDS
from collections import Counter

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import os
from os.path import exists
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random 
import numpy as np
import keras
import tensorflow as tf
import xtarfile as tarfile
import csv
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer

# Loading the Data

In [330]:
def downloadDataSet():
   cwd = os.getcwd()
  
   file_exists = exists('./dakshina_dataset_v1.0.tar')
   if(file_exists==False):
     print('downloading....')
     os.system('curl -SL https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar > dakshina_dataset_v1.0.tar')
     print('download Complete')
   extract_exists = exists('./dakshina_dataset_v1.0/')   
   if(extract_exists==False): 
     print('Extracting..') 
     with tarfile.open('dakshina_dataset_v1.0.tar', 'r') as archive:
         archive.extractall()
     print('Complete')
   print('You are all set')
downloadDataSet()

You are all set


# Data Preprocessing

In [331]:
def get_files(language):

  language='te'
  train_dir='./dakshina_dataset_v1.0/'+language+'/lexicons/'+language+'.translit.sampled.train.tsv'
  val_dir='./dakshina_dataset_v1.0/'+language+'/lexicons/'+language+'.translit.sampled.dev.tsv'
  test_dir='./dakshina_dataset_v1.0/'+language+'/lexicons/'+language+'.translit.sampled.test.tsv'
  
  return train_dir, val_dir, test_dir




In [332]:
def tokenize(lang,tokenizer=None):
    """ Uses tf.keras tokenizer to tokenize the data/words into characters
    """
    if(tokenizer==None):
        tokenizer = Tokenizer(char_level=True)
        tokenizer.fit_on_texts(lang)
        lang_tensor = tokenizer.texts_to_sequences(lang)
        lang_tensor = tf.keras.preprocessing.sequence.pad_sequences(lang_tensor,
                                                            padding='post')
    else:
  
        lang_tensor = tokenizer.texts_to_sequences(lang)
        lang_tensor = tf.keras.preprocessing.sequence.pad_sequences(lang_tensor,
                                                        padding='post')

    return lang_tensor, tokenizer

In [333]:
def preprocess_data(fpath,ip_tokenizer=None, tgt_tokenizer=None):
    """ Reads, tokenizes and adds SOS/EOS tokens to data based on above functions
    """
    #Read data from files
    df = pd.read_csv(fpath, sep="\t", header=None)

    #Add start and end token
    df[0] = df[0].apply( lambda x:START_TOKEN+x+END_TOKEN) 
    ip_tensor, ip_tokenizer = tokenize(df[1].astype(str).tolist(), tokenizer=ip_tokenizer)
    
    tgt_tensor, tgt_tokenizer = tokenize(df[0].astype(str).tolist(), tokenizer=tgt_tokenizer) 
    
    dataset = tf.data.Dataset.from_tensor_slices((ip_tensor, tgt_tensor))
    dataset = dataset.shuffle(len(dataset))
    
    return dataset, ip_tokenizer, tgt_tokenizer



In [334]:
language="te"
train_dir, val_dir, test_dir = get_files(language)

dataset, input_tokenizer, targ_tokenizer = preprocess_data(train_dir)
val_dataset, _, _ = preprocess_data(val_dir)


In [335]:
#train data 
dataset, input_tokenizer, targ_tokenizer = preprocess_data(train_dir)

# Model Building

In [336]:
## Utility functions ##
def get_layer(name, units, dropout, return_state=False, return_sequences=False):

    if name=="rnn":
        return layers.SimpleRNN(units=units, dropout=dropout, 
                                return_state=return_state,
                                return_sequences=return_sequences)

    if name=="gru":
        return layers.GRU(units=units, dropout=dropout, 
                          return_state=return_state,
                          return_sequences=return_sequences)

    if name=="lstm":
        return layers.LSTM(units=units, dropout=dropout, 
                           return_state=return_state,
                           return_sequences=return_sequences)


In [337]:

class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, enc_state, enc_out):
    
    enc_state = tf.concat(enc_state, 1)
    enc_state = tf.expand_dims(enc_state, 1)

    score = self.V(tf.nn.tanh(self.W1(enc_state) + self.W2(enc_out)))

    attention_weights = tf.nn.softmax(score, axis=1)

    context_vector = attention_weights * enc_out
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights



In [338]:

class Encoder(tf.keras.Model):
    def __init__(self, cell_type, n_layers, units, encoder_vocab_size, embedding_dim, dropout):
        super(Encoder, self).__init__()
        self.layer_type = cell_type
        self.n_layers = n_layers
        self.units = units
        self.dropout = dropout
        self.embedding = tf.keras.layers.Embedding(encoder_vocab_size, embedding_dim)
        self.create_rnn_layers()

    def call(self, x, hidden):
        x = self.embedding(x)
        x = self.rnn_layers[0](x, initial_state=hidden)

        for layer in self.rnn_layers[1:]:
            x = layer(x)

        output, state = x[0], x[1:]

        return output, state
    
    def create_rnn_layers(self):
        self.rnn_layers = []

        for i in range(self.n_layers):
            self.rnn_layers.append(get_layer(self.layer_type, self.units, self.dropout,
                                                return_sequences=True,
                                                return_state=True))


    def initialize_hidden_state(self, batch_size):

        if self.layer_type != "lstm":
            return [tf.zeros((batch_size, self.units))]
        else:
            return [tf.zeros((batch_size, self.units))]*2



In [339]:
class Decoder(tf.keras.Model):
    def __init__(self, layer_type, n_layers, units, decoder_vocab_size, embedding_dim, dropout, attention=False):
        super(Decoder, self).__init__()

        self.layer_type = layer_type
        self.n_layers = n_layers
        self.units = units
        self.dropout = dropout
        self.attention = attention
        self.embedding_layer = layers.Embedding(input_dim=decoder_vocab_size, 
                                                output_dim=embedding_dim)
        
        self.dense = layers.Dense(decoder_vocab_size, activation="softmax")
        self.flatten = layers.Flatten()
        if self.attention:
            self.attention_layer = BahdanauAttention(self.units)
        self.create_rnn_layers()

    def call(self, x, hidden, enc_out=None):
        
        x = self.embedding_layer(x)

        if self.attention:
            context_vector, attention_weights = self.attention_layer(hidden, enc_out)
            x = tf.concat([tf.expand_dims(context_vector, 1), x], -1)
        else:
            attention_weights = None

        x = self.rnn_layers[0](x, initial_state=hidden)

        for layer in self.rnn_layers[1:]:
            x = layer(x)

        output, state = x[0], x[1:]

        output = self.dense(self.flatten(output))
        
        return output, state, attention_weights

    def create_rnn_layers(self):
        self.rnn_layers = []    

        for i in range(self.n_layers - 1):
            self.rnn_layers.append(get_layer(self.layer_type, self.units, self.dropout,
                                                return_sequences=True,
                                                return_state=True))
        
        self.rnn_layers.append(get_layer(self.layer_type, self.units, self.dropout,
                                            return_sequences=False,
                                            return_state=True))

In [340]:
class BeamSearch():
    def __init__(self, model, k):
        self.k = k 
        self.model = model
        self.acc = tf.keras.metrics.Accuracy()

    def sample_beam_search(self, probs):

        m, n = probs.shape
        output_sequences = [[[], 0.0]]

        for row in probs:
            beams = []

            for tup in output_sequences:
                seq, score = tup
                for j in range(n):
                    new_beam = [seq + [j], score - tf.math.log(row[j])]
                    beams.append(new_beam)

            output_sequences = sorted(beams, key=lambda x: x[1])[:self.k]

        tensors, scores = list(zip(*output_sequences))
        tensors = list(map(lambda x: tf.expand_dims(tf.constant(x),0), tensors))

        return tf.concat(tensors, 0), scores

    def beam_accuracy(self, input, target):
        accs = []

        for i in range(self.k):
            self.acc.reset_states()
            self.acc.update_state(target, input[i, :])  
            accs.append(self.acc.result())

        return max(accs)
    
    def step(self, input, target, enc_state):

        batch_acc = 0
        sequences = []

        enc_out, enc_state = self.model.encoder(input, enc_state)

        dec_state = enc_state
        dec_input = tf.expand_dims([self.model.targ_tokenizer.word_index["\t"]]*self.model.batch_size ,1)

        for t in range(1, target.shape[1]):

            preds, dec_state, _ = self.model.decoder(dec_input, dec_state, enc_out)

            sequences.append(preds)
            preds = tf.argmax(preds, 1)
            dec_input = tf.expand_dims(preds, 1)

        sequences = tf.concat(list(map(lambda x: tf.expand_dims(x, 1), sequences)), axis=1)

        for i in range(target.shape[0]):

            possibilities, scores = self.sample_beam_search(sequences[i, :, :])
            batch_acc += self.beam_accuracy(possibilities, target[i, 1:])

        batch_acc = batch_acc / target.shape[0]

        return 0, batch_acc

    def evaluate(self, test_dataset, batch_size=None, upto=5, use_wandb=False):
        
        if batch_size is not None:
            self.model.batch_size = batch_size
            test_dataset = test_dataset.batch(batch_size)
        else:
            self.model.batch_size = 1

        test_acc = 0
        enc_state = self.model.encoder.initialize_hidden_state(self.model.batch_size)

        for batch, (input, target) in enumerate(test_dataset.take(upto)):
           
           _, acc = self.step(input, target, enc_state)
           test_acc += acc

        if use_wandb:
            wandb.log({"test acc (beam search)": test_acc / upto})

        print(f"Test Accuracy on {upto*batch_size} samples: {test_acc / upto:.4f}\n")

    def translate(self, word):

        word = "\t" + word + "\n"
        sequences = []
        result = []

        inputs = self.model.input_tokenizer.texts_to_sequences([word])
        inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                               maxlen=self.model.max_input_len,
                                                               padding="post")


        enc_state = self.model.encoder.initialize_hidden_state(1)
        enc_out, enc_state = self.model.encoder(inputs, enc_state)

        dec_state = enc_state
        dec_input = tf.expand_dims([self.model.targ_tokenizer.word_index["\t"]]*1, 1)

        for t in range(1, self.model.max_target_len):

            preds, dec_state, _ = self.model.decoder(dec_input, dec_state, enc_out)

            sequences.append(preds)
            preds = tf.argmax(preds, 1)
            dec_input = tf.expand_dims(preds, 1)

        sequences = tf.concat(list(map(lambda x: tf.expand_dims(x, 1), sequences)), axis=1)

        possibilities, scores = self.sample_beam_search(tf.squeeze(sequences, 0))
        output_words = self.model.targ_tokenizer.sequences_to_texts(possibilities.numpy())
        
        def post_process(word):
            word = word.split(" ")[:-1]
            return "".join([x for x in word])

        output_words = list(map(post_process, output_words))

        return output_words, scores

In [352]:
class Parameters():
  def  __init__(self,  language='hi',encoder_layers=1,decoder_layers=1,embedding_dim=128,\
                layer_type='lstm', units=128, dropout=0.5, attention=False,batch_size=128,\
                apply_beam_search=False,teacher_forcing_ratio=1.0, save_outputs=None,epochs=5,wandb=False,beamWidth=5):
        self.language = language
        self.embedding_dim = embedding_dim
        self.encoder_layers=encoder_layers
        self.decoder_layers=decoder_layers
        self.layer_type = layer_type
        self.units = units
        self.dropout = dropout
        self.attention = attention
        self.stats = []
        self.wandb=wandb
        self.epochs=epochs
        self.batch_size = 128
        self.apply_beam_search = apply_beam_search
        self.batch_size = batch_size
        self.teacher_forcing_ratio=teacher_forcing_ratio
        self.save_outputs=save_outputs

In [342]:
from tqdm import tqdm
class Seq2Seq():
    def __init__(self, parameters):
        self.embedding_dim = parameters.embedding_dim
        self.encoder_layers = parameters.encoder_layers
        self.decoder_layers = parameters.decoder_layers
        self.layer_type = parameters.layer_type
        self.units = parameters.units
        self.dropout = parameters.dropout
        self.attention = parameters.attention
        self.stats = []
        self.batch_size = parameters.batch_size
        self.apply_beam_search = parameters.apply_beam_search
    
    def build(self, loss, optimizer, metric):
        self.loss = loss
        self.optimizer = optimizer
        self.metric = metric

    def set_vocabulary(self, input_tokenizer, targ_tokenizer):
        self.input_tokenizer = input_tokenizer
        self.targ_tokenizer = targ_tokenizer
        self.create_model()
    
    def create_model(self):

        encoder_vocab_size = len(self.input_tokenizer.word_index) + 1
        decoder_vocab_size = len(self.targ_tokenizer.word_index) + 1

        self.encoder = Encoder(self.layer_type, self.encoder_layers, self.units, encoder_vocab_size,
                               self.embedding_dim, self.dropout)

        self.decoder = Decoder(self.layer_type, self.decoder_layers, self.units, decoder_vocab_size,
                               self.embedding_dim,  self.dropout, self.attention)

    @tf.function
    def train(self, input, target, enc_state):

        loss = 0 

        with tf.GradientTape() as tape: 

            enc_out, enc_state = self.encoder(input, enc_state)

            dec_state = enc_state
            dec_input = tf.expand_dims([self.targ_tokenizer.word_index["\t"]]*self.batch_size ,1)

            ## We use Teacher forcing to train the network
            ## Each target at timestep t is passed as input for timestep t + 1

            if random.random() < self.teacher_forcing_ratio:

                for t in range(1, target.shape[1]):

                    preds, dec_state, _ = self.decoder(dec_input, dec_state, enc_out)
                    loss += self.loss(target[:,t], preds)
                    self.metric.update_state(target[:,t], preds)
                    
                    dec_input = tf.expand_dims(target[:,t], 1)
            
            else:

                for t in range(1, target.shape[1]):

                    preds, dec_state, _ = self.decoder(dec_input, dec_state, enc_out)
                    loss += self.loss(target[:,t], preds)
                    self.metric.update_state(target[:,t], preds)

                    preds = tf.argmax(preds, 1)
                    dec_input = tf.expand_dims(preds, 1)


            batch_loss = loss / target.shape[1]

            variables = self.encoder.variables + self.decoder.variables
            gradients = tape.gradient(loss, variables)

            self.optimizer.apply_gradients(zip(gradients, variables))

        return batch_loss, self.metric.result()

    @tf.function
    def validation(self, input, target, enc_state):

        loss = 0
        
        enc_out, enc_state = self.encoder(input, enc_state)

        dec_state = enc_state
        dec_input = tf.expand_dims([self.targ_tokenizer.word_index["\t"]]*self.batch_size ,1)

        for t in range(1, target.shape[1]):

            preds, dec_state, _ = self.decoder(dec_input, dec_state, enc_out)
            loss += self.loss(target[:,t], preds)
            self.metric.update_state(target[:,t], preds)

            preds = tf.argmax(preds, 1)
            dec_input = tf.expand_dims(preds, 1)

        batch_loss = loss / target.shape[1]
        
        return batch_loss, self.metric.result()


    def fit(self, dataset, val_dataset, batch_size=128, epochs=5, wandb=False, teacher_forcing_ratio=1.0):

        self.batch_size = batch_size
        self.teacher_forcing_ratio = teacher_forcing_ratio

        steps_per_epoch = len(dataset) // self.batch_size
        steps_per_epoch_val = len(val_dataset) // self.batch_size
        
        dataset = dataset.batch(self.batch_size, drop_remainder=True)
        val_dataset = val_dataset.batch(self.batch_size, drop_remainder=True)

        # useful when we need to translate the sentence
        sample_inp, sample_targ = next(iter(dataset))
        self.max_target_len = sample_targ.shape[1]
        self.max_input_len = sample_inp.shape[1]

        

        
        for epoch in  tqdm(range(1, epochs+1), total = epochs,desc="Epochs "):
           

            ## Training loop ##
            total_loss = 0
            total_acc = 0
            self.metric.reset_states()

            starting_time = time.time()
            enc_state = self.encoder.initialize_hidden_state(self.batch_size)

           
            for batch, (input, target) in enumerate(dataset.take(steps_per_epoch)):
                batch_loss, acc = self.train(input, target, enc_state)
                total_loss += batch_loss
                total_acc += acc

            avg_acc = total_acc / steps_per_epoch
            avg_loss = total_loss / steps_per_epoch

            # Validation loop ##
            total_val_loss = 0
            total_val_acc = 0
            self.metric.reset_states()

            enc_state = self.encoder.initialize_hidden_state(self.batch_size)

            
            for batch, (input, target) in enumerate(val_dataset.take(steps_per_epoch_val)):
                batch_loss, acc = self.validation(input, target, enc_state)
                total_val_loss += batch_loss
                total_val_acc += acc

            avg_val_acc = total_val_acc / steps_per_epoch_val
            avg_val_loss = total_val_loss / steps_per_epoch_val

            print( "\nTrain Loss: {0:.4f} Train Accuracy: {1:.4f} Validation Loss: {2:.4f} Validation Accuracy: {3:.4f}".format(avg_loss, avg_acc*100, avg_val_loss, avg_val_acc*100))
            
            time_taken = time.time() - starting_time
            self.stats.append({"epoch": epoch,
                            "train loss": avg_loss,
                            "val loss": avg_val_loss,
                            "train acc": avg_acc*100,
                            "val acc": avg_val_acc*100,
                            "training time": time_taken})
            
            if wandb:
                wandb.log(self.stats[-1])
            
            print(f"\nTime taken for the epoch {time_taken:.4f}")
           
        
        print("\nModel trained successfully !!")
        
    def evaluate(self, test_dataset, batch_size=None):

        if batch_size is not None:
            self.batch_size = batch_size

        steps_per_epoch_test = len(test_dataset) // batch_size
        test_dataset = test_dataset.batch(batch_size, drop_remainder=True)
        
        total_test_loss = 0
        total_test_acc = 0
        self.metric.reset_states()

        enc_state = self.encoder.initialize_hidden_state(self.batch_size)

        print("\nRunning test dataset through the model...\n")
        for batch, (input, target) in enumerate(test_dataset.take(steps_per_epoch_test)):
            batch_loss, acc = self.validation(input, target, enc_state)
            total_test_loss += batch_loss
            total_test_acc += acc

        avg_test_acc = total_test_acc / steps_per_epoch_test
        avg_test_loss = total_test_loss / steps_per_epoch_test
    
        print(f"Test Loss: {avg_test_loss:.4f} Test Accuracy: {avg_test_acc:.4f}")

        return avg_test_loss, avg_test_acc


    def translate(self, word, get_heatmap=False):

        word = "\t" + word + "\n"

        inputs = self.input_tokenizer.texts_to_sequences([word])
        inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                               maxlen=self.max_input_len,
                                                               padding="post")

        result = ""
        att_wts = []

        enc_state = self.encoder.initialize_hidden_state(1)
        enc_out, enc_state = self.encoder(inputs, enc_state)

        dec_state = enc_state
        dec_input = tf.expand_dims([self.targ_tokenizer.word_index["\t"]]*1, 1)

        for t in range(1, self.max_target_len):

            preds, dec_state, attention_weights = self.decoder(dec_input, dec_state, enc_out)
            
            if get_heatmap:
                att_wts.append(attention_weights)
            
            preds = tf.argmax(preds, 1)
            next_char = self.targ_tokenizer.index_word[preds.numpy().item()]
            result += next_char

            dec_input = tf.expand_dims(preds, 1)

            if next_char == "\n":
                return result[:-1], att_wts[:-1]

        return result[:-1], att_wts[:-1]

    

In [343]:


def run_model_on_test_dataset(param):

    ## Character level accuracy ##
    test_dataset, _, _ = preprocess_data(test_dir, model.input_tokenizer, model.targ_tokenizer)
    test_loss, test_acc = model.evaluate(test_dataset, batch_size=100)
    print('Character level accuracy: '+str(test_acc.numpy()))

    ##  Word level accuracy ##
    test_tsv = pd.read_csv(test_dir, sep="\t", header=None)
    inputs = test_tsv[1].astype(str).tolist()
    targets = test_tsv[0].astype(str).tolist()
   
    outputs = []

    for word in inputs:
        outputs.append(model.translate(word)[0])

    print(f"Word level accuracy: {np.sum(np.asarray(outputs) == np.array(targets)) / len(outputs)}")

    if param.save_outputs is not None:
        df = pd.DataFrame()
        df["inputs"] = inputs
        df["targets"] = targets
        df["outputs"] = outputs
        df.to_csv(param.save_outputs)


    return model

#randomly_evaluate(model, n=15)

In [353]:
param=Parameters(language="te",\
                        embedding_dim=64,\
                        encoder_layers=3,\
                        decoder_layers=2,\
                        layer_type="lstm",\
                        units=512,\
                        dropout=0.5,
                        epochs=25\
                   )

In [None]:



model = Seq2Seq(param)
model.set_vocabulary(input_tokenizer, targ_tokenizer)
model.build(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer = tf.keras.optimizers.Adam(learning_rate=0.001),
                metric = tf.keras.metrics.SparseCategoricalAccuracy())

model.fit(dataset, val_dataset, epochs=param.epochs, wandb=param.wandb, teacher_forcing_ratio=param.teacher_forcing_ratio)                  

In [346]:
model = run_model_on_test_dataset(param)


Running test dataset through the model...

Test Loss: 1.1889 Test Accuracy: 0.8956
Character level accuracy: 0.8955958
Word level accuracy: 0.4793805463720202


In [472]:

def evaluate_samples(model, n=0):

    df = pd.read_csv(test_dir, sep="\t", header=None)

    printLine=True
    source=[]
    actual=[]
    predict=[]
    if(n==0):
      print(f"Evaluating the model on all words\n")
      n=len(df)
      printLine=False
    else:
      df = df.sample(n=n).reset_index(drop=True)
      #print(f"Randomly evaluating the model on {n} words\n")

    for i in range(n):
        word = str(df[1][i])
        source.append(word)
        actual.append(str(df[0][i]))
        predict.append(model.translate(word)[0])
        """if(printLine):
          print(f"Input word: {word}")
          print(f"Actual translation: {str(df[0][i])}")
          print(f"Model translation: {predict[-1]}\n")"""
    return source,actual,predict
    

In [400]:
source,actual,predict=evaluate_samples(model)

Evaluating the model on all words



In [485]:
from IPython.display import HTML as html_print
from IPython.display import display
def compare_result(predict,actual,word):
    string1=list(actual)
    string2=list(predict)
    maxstr=None
    
    minstr=None
    if(len(string1)>len(string2)):
      maxstr=string1
      minstr=string2
    else:
      maxstr=string2
      minstr=string1
   
    shift=0
    html_str = """
    <br>
      <table style="border:2px solid black; border-collapse:collapse">
      <tr>
      <td scope="row" style="border:1px solid black;background-color:#3498DB;color:white;padding:10px;text-align:left"> <strong>{}</strong> </td>
      """.format(actual)
    maxlen=len(maxstr)
    minlen=len(minstr)
    char_html=""
    j=0
    for i in range(maxlen):
    
        if(j<minlen):
          if(maxstr[i]==minstr[j]):
              char_html+="""<td scope="row" style="background-color:#E3FFE7 ;border:1px solid black;padding:15px;text-align:left"> <strong>{}</strong> </td>""".format(maxstr[i] )
              j+=1
          else:
             if(i+1<maxlen and  maxstr[i+1]==minstr[j]):
                char_html+=""" <td scope="row" style="background-color:#FFF6F7 ; border:1px solid black;padding:15px;text-align:left"> <strong>{}</strong> </td>""".format(maxstr[i] )
                continue
             else:
              j+=1
              char_html+=""" <td scope="row" style="background-color:#FFF6F7 ; border:1px solid black;padding:15px;text-align:left"> <strong>{}</strong> </td>""".format(maxstr[i] )
        else:
          char_html+="""<td scope="row" style="background-color: #FFF6F7 ; border:1px solid black;padding:15px;text-align:left"> <strong>{} </strong></td>""".format(maxstr[i] )   

    char_html+="""</tr>"""
    char_html+="""<tr>
      <td scope="row" style="border:1px solid black;background-color:#3498DB;padding:15px;color:white;text-align:left"> <strong>{} </strong></td>""".format(predict)

    j=0
    for i in range(maxlen):

        if(j<minlen):
        
          if(maxstr[i]==minstr[j]):
            char_html+="""<td scope="row" style="background-color:#E3FFE7 ;border:1px solid black;padding:15px;text-align:left"><strong> {} </strong></td>""".format(minstr[j] )
            j+=1
          else:
             if(i+1<maxlen and  maxstr[i+1]==minstr[j]):
                char_html+="""<td scope="row" style="background-color:#FFF6F7 ; border:1px solid black;padding:15px;text-align:left"> <strong>{}</strong> </td>""".format(' ' )
                continue
             else:
                
                char_html+="""<td scope="row" style="background-color:#FFF6F7 ; border:1px solid black;padding:15px;text-align:left"> <strong>{}</strong> </td>""".format(minstr[j] )
                j+=1
        else:
          char_html+="""<td scope="row" style="background-color:#FFF6F7  ; border:1px solid black;padding:15px;text-align:left"><strong> {}</strong> </td>""".format('' )



    char_html+="""</tr>"""

    char_html+="""<tr>
      <td scope="row" style="border:1px solid black;background-color:#3498DB;padding:15px;color:white;text-align:left"> <strong>{} </strong></td>""".format(word)


    j=0
    for i in range(maxlen):
    
        if(j<minlen):
          if(maxstr[i]==minstr[j]):
              char_html+="""<td scope="row" style="border:1px solid black;padding:15px;text-align:left">&#x2705; </td>"""
              j+=1
          else:
              if(i+1<maxlen and  maxstr[i+1]==minstr[j]):
                char_html+="""<td scope="row" style="border:1px solid black;padding:15px;text-align:left"> &#x274C; </td>""" 
                continue
              else:
                j+=1
                char_html+=""" <td scope="row" style=" border:1px solid black;padding:15px;text-align:left"> &#x274C; </td>"""
        else:
          char_html+="""<td scope="row" style="border:1px solid black;padding:15px;text-align:left"> &#x274C; </td>""" 


    char_html+="""</tr>"""
    html_str+=char_html
    display(html_print(html_str)) 

In [486]:
source_rnd,actual_rnd,predict_rnd=evaluate_samples(model,10)

for i,j,k in zip(source_rnd,actual_rnd,predict_rnd):
    compare_result(j,k,i)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
ఆస్ట్రీలోలోని,ఆ,స,్,ట,్,ర,ే,ల,ి,య,ా,ల,ో,న,ి
ఆస్ట్రేలియాలోని,ఆ,స,్,ట,్,ర,ీ,ల,ో,ల,ో,న,ి,,
aastreliyaalooni,✅,✅,✅,✅,✅,✅,❌,✅,❌,❌,❌,❌,❌,❌,❌


0,1,2,3,4,5,6,7,8
పంచవభ్తం,ప,ం,చ,త,ం,త,్,ర
పంచతంత్ర,ప,ం,చ,వ,భ,,్,త
panchathanthra,✅,✅,✅,❌,❌,❌,✅,❌


0,1,2,3,4,5,6,7,8,9
పునరావశం,ప,ు,న,ర,ా,వ,ా,స,ం
పునరావాసం,ప,ు,న,ర,ా,వ,శ,,ం
punaraavaasam,✅,✅,✅,✅,✅,✅,❌,❌,✅


0,1,2,3,4,5,6,7,8,9,10,11
ఎతిత్రుక్తం,ఎ,త,ి,త,్,ర,ు,క,్,త,ం
ఇతివృత్తం,ఇ,త,ి,వ,ృ,త,్,త,ం,,
etivruttam,❌,✅,✅,❌,❌,❌,❌,❌,❌,❌,❌


0,1,2,3,4
పొలం,ప,ొ,ల,ం
పొలం,ప,ొ,ల,ం
polam,✅,✅,✅,✅


0,1,2,3,4,5
ఈపాని,ఈ,ప,ా,న,ి
ఏపని,ఏ,ప,,న,ి
epani,❌,✅,❌,✅,✅


0,1,2,3,4,5,6,7,8,9
సేకరించిన,స,ే,క,ర,ి,ం,చ,ి,న
సేకరించిన,స,ే,క,ర,ి,ం,చ,ి,న
saekarinchina,✅,✅,✅,✅,✅,✅,✅,✅,✅


0,1,2,3,4,5,6
వన్ని,వ,ా,ణ,్,ణ,ి
వాణ్ణి,వ,న,,్,న,ి
vanni,✅,❌,❌,✅,❌,✅


0,1,2,3,4,5,6,7
ఈమాత్రం,ఏ,మ,ా,త,్,ర,ం
ఏమాత్రం,ఈ,మ,ా,త,్,ర,ం
aemaathram,❌,✅,✅,✅,✅,✅,✅


0,1,2,3,4,5
కరాచి,క,ర,ా,చ,ీ
కరాచీ,క,ర,ా,చ,ి
qarachi,✅,✅,✅,✅,❌
