In [1]:
import torch
import numpy as np
from torchmetrics import Accuracy
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
class Script:
    def __init__(self, script_name):
        self.script_name = script_name
        self.char2idx = {}
        self.inx2char = {}
        self.vocab_size = 0

    def create_vocab(self, char_list):
        for i, char in enumerate(char_list):
            self.char2idx[char] = i
            self.inx2char[i] = char
        self.vocab_size = len(char_list)
    
    def add_char(self, char):
        if char not in self.char2idx:
            self.char2idx[char] = self.vocab_size
            self.inx2char[self.vocab_size] = char
            self.vocab_size += 1
        else:
            print("Character already exists in the script")



In [3]:
import os
dataset_name = "aksharantar_sampled"
languages_dataset = os.listdir(dataset_name)
print(languages_dataset)

['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'kas', 'kok', 'mai', 'mal', 'mar', 'mni', 'ori', 'pan', 'san', 'sid', 'tam', 'tel', 'urd']


In [4]:
language = 'mal'
START='<'
END='>'
def load_dataset_csv(path):
    X, y = [], []
    with open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            line = line.strip().split(',')
            X.append(f'{START}{line[0]}{END}')
            y.append(f'{START}{line[1]}{END}')
    
    return X, y

list_files = os.listdir(f'{dataset_name}/{language}')
path = f'{dataset_name}/{language}'



X_test, y_test = load_dataset_csv(f'{path}/{list_files[0]}')
X_train, y_train = load_dataset_csv(f'{path}/{list_files[1]}')
X_val, y_val = load_dataset_csv(f'{path}/{list_files[2]}')

print('Dataset size:', {'y_test': len(y_test), 'y_train': len(y_train), 'y_val': len(y_val)})

Dataset size: {'y_test': 4096, 'y_train': 51200, 'y_val': 4096}


In [5]:
MAX_LENGTH = max([len(x) for x in X_train] + [len(y) for y in y_train])

unique_chars = set()
[unique_chars.update(list(x)) for x in y_train]
unique_chars = list(unique_chars)
unique_chars.sort()

local_script = Script(language)
local_script.create_vocab(unique_chars)
print(local_script.inx2char)



unique_chars = set()
[unique_chars.update(list(x)) for x in X_train]
unique_chars = list(unique_chars)
unique_chars.sort()

latin_script = Script('latin')
latin_script.create_vocab(unique_chars)
print(latin_script.inx2char)


{0: '<', 1: '>', 2: 'ം', 3: 'ഃ', 4: 'അ', 5: 'ആ', 6: 'ഇ', 7: 'ഈ', 8: 'ഉ', 9: 'ഊ', 10: 'ഋ', 11: 'എ', 12: 'ഏ', 13: 'ഐ', 14: 'ഒ', 15: 'ഓ', 16: 'ഔ', 17: 'ക', 18: 'ഖ', 19: 'ഗ', 20: 'ഘ', 21: 'ങ', 22: 'ച', 23: 'ഛ', 24: 'ജ', 25: 'ഝ', 26: 'ഞ', 27: 'ട', 28: 'ഠ', 29: 'ഡ', 30: 'ഢ', 31: 'ണ', 32: 'ത', 33: 'ഥ', 34: 'ദ', 35: 'ധ', 36: 'ന', 37: 'പ', 38: 'ഫ', 39: 'ബ', 40: 'ഭ', 41: 'മ', 42: 'യ', 43: 'ര', 44: 'റ', 45: 'ല', 46: 'ള', 47: 'ഴ', 48: 'വ', 49: 'ശ', 50: 'ഷ', 51: 'സ', 52: 'ഹ', 53: 'ാ', 54: 'ി', 55: 'ീ', 56: 'ു', 57: 'ൂ', 58: 'ൃ', 59: 'െ', 60: 'േ', 61: 'ൈ', 62: 'ൊ', 63: 'ോ', 64: 'ൌ', 65: '്', 66: 'ൺ', 67: 'ൻ', 68: 'ർ', 69: 'ൽ', 70: 'ൾ'}
{0: '<', 1: '>', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'q', 19: 'r', 20: 's', 21: 't', 22: 'u', 23: 'v', 24: 'w', 25: 'x', 26: 'y', 27: 'z'}


In [6]:
transliter_pairs_test = list(zip(X_test, y_test))
transliter_pairs_train = list(zip(X_train, y_train))
transliter_pairs_val = list(zip(X_val, y_val))

In [26]:
def get_dataloader(transliter_pairs, latin_script, local_script, batch_size=32):
    n = len(transliter_pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=int)
    output_ids = np.zeros((n, MAX_LENGTH), dtype=int)


    for idx, (latin, local) in enumerate(transliter_pairs):
        try:
            inp_ids = [latin_script.char2idx[c] for c in latin]
            out_ids = [local_script.char2idx[c] for c in local]
            input_ids[idx, :len(inp_ids)] = inp_ids
            output_ids[idx, :len(out_ids)] = out_ids
        except Exception as e:
            print(repr(e))
            

    
    

    dataset = torch.utils.data.TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(output_ids).to(device))
    sampler = torch.utils.data.RandomSampler(dataset)
    dataloader = torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataloader

In [27]:
dataloader_train = get_dataloader(transliter_pairs_train, latin_script, local_script, batch_size=32)
dataloader_test = get_dataloader(transliter_pairs_test, latin_script, local_script, batch_size=32)
dataloader_val = get_dataloader(transliter_pairs_val, latin_script, local_script, batch_size=32)

KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
ValueError('could not broadcast input array from shape (32,) into shape (31,)')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')
KeyError('ൗ')


In [28]:
class Encoder(torch.nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = torch.nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [29]:
class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = torch.nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(0)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = torch.nn.functional.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = torch.nn.functional.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [30]:
hidden_size = 128
batch_size=32
encoder = Encoder(input_size=latin_script.vocab_size, hidden_size=hidden_size, dropout_p=0).to(device)
decoder = DecoderRNN(hidden_size=hidden_size, output_size=local_script.vocab_size).to(device)



In [31]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, accuracy_criterion):

    total_loss = 0
    total_accuracy = torch.tensor([], dtype=torch.float32, device=device)
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()
        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()
        accuracy = accuracy_criterion(decoded_ids, target_tensor)
        total_accuracy = torch.cat((total_accuracy, accuracy))
        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader), total_accuracy

In [32]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

  plt.switch_backend('agg')


In [33]:
def compute_val_loss_accuracy(val_dataloader, criterion, encoder, decoder):
    total_loss = 0
    for data in val_dataloader:
        input_tensor, target_tensor = data

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        # calcluating accuracy
        


        total_loss += loss.item()

    return total_loss / len(val_dataloader)


In [34]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100, val_dataloader=None):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    print_accuracy_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = torch.nn.NLLLoss()
    accuracy_criterion = Accuracy(task='multiclass', num_classes=local_script.vocab_size, multidim_average='samplewise')


    for epoch in range(1, n_epochs + 1):
        loss, accuracy = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, accuracy_criterion)
        print_loss_total += loss
        plot_loss_total += loss
        print_accuracy_total += sum(accuracy ==1)/len(accuracy)

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_accuracy_avg = print_accuracy_total / print_every
            print_accuracy_total = 0
            print_loss_total = 0
            print('%s (%d %d%%) Loss: %.4f Acc: %.2f %%' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg, print_accuracy_avg*100))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
    return plot_losses

In [41]:
loss = train(dataloader_val, encoder, decoder, 50, print_every=1, plot_every=1)

# for i in range(1000):
#     losss = train_epoch(dataloader, encoder, decoder, torch.optim.Adam(encoder.parameters()), torch.optim.Adam(decoder.parameters()), torch.nn.NLLLoss())
#     print(losss)

0m 5s (- 4m 49s) (1 2%) Loss: 0.6391 Acc: 2.25 %
0m 11s (- 4m 26s) (2 4%) Loss: 0.5105 Acc: 3.76 %
0m 16s (- 4m 18s) (3 6%) Loss: 0.4535 Acc: 5.03 %
0m 22s (- 4m 18s) (4 8%) Loss: 0.4108 Acc: 6.45 %
0m 29s (- 4m 29s) (5 10%) Loss: 0.3838 Acc: 8.30 %
0m 36s (- 4m 31s) (6 12%) Loss: 0.3593 Acc: 9.94 %
0m 43s (- 4m 27s) (7 14%) Loss: 0.3399 Acc: 10.96 %
0m 49s (- 4m 17s) (8 16%) Loss: 0.3242 Acc: 13.18 %
0m 54s (- 4m 9s) (9 18%) Loss: 0.3112 Acc: 14.11 %
1m 0s (- 4m 0s) (10 20%) Loss: 0.2985 Acc: 14.77 %
1m 5s (- 3m 51s) (11 22%) Loss: 0.2896 Acc: 15.70 %
1m 10s (- 3m 42s) (12 24%) Loss: 0.2767 Acc: 17.90 %
1m 15s (- 3m 35s) (13 26%) Loss: 0.2677 Acc: 18.07 %
1m 20s (- 3m 27s) (14 28%) Loss: 0.2609 Acc: 18.73 %
1m 26s (- 3m 20s) (15 30%) Loss: 0.2557 Acc: 19.41 %
1m 31s (- 3m 14s) (16 32%) Loss: 0.2495 Acc: 21.22 %
1m 36s (- 3m 7s) (17 34%) Loss: 0.2388 Acc: 22.49 %
1m 42s (- 3m 1s) (18 36%) Loss: 0.2324 Acc: 22.61 %
1m 47s (- 2m 55s) (19 38%) Loss: 0.2297 Acc: 23.14 %
1m 52s (- 2m 49s) (

In [44]:
test_data = iter(dataloader_val)

In [45]:

def convert_tensor_to_string(tensor, script):
    words = []
    for idx in tensor:
        word = []
        for i in idx:
            word.append(script.inx2char[i.item()])
            if i.item() == script.char2idx[END]:
                break
        words.append(''.join(word))
    return words

input_tensor, target_tensor = next(test_data)
encoder_outputs, encoder_hidden = encoder(input_tensor)
decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

_, topi = decoder_outputs.topk(1)
decoded_ids = topi.squeeze()

input_words, output_words = convert_tensor_to_string(input_tensor, latin_script), convert_tensor_to_string(decoded_ids, local_script)
expected_words = convert_tensor_to_string(target_tensor, local_script)

print('Input:', input_words)
print('Expected:', expected_words)
print('Predicted:', output_words)

matched_words = set(expected_words) & set(output_words)
print('Accuracy:  ', len(matched_words)/ len(expected_words))
print('Matched: ', matched_words)

Input: ['<bite>', '<economicsil>', '<chuttiyulla>', '<thuvvoor>', '<kanamenna>', '<aashupathriyakk>', '<platto>', '<vayattil>', '<libraryyil>', '<parichunakkiya>', '<grandhathile>', '<ne>', '<ulppedunnathaanithu>', '<baraakk>', '<oven>', '<eraakhile>', '<hanumaanum>', '<chinayil>', '<laantilum>', '<elaktro>', '<vyathyasthathakalaanu>', '<praathinidhya>', '<variyaayi>', '<sahanasamaravum>', '<vistheernam>', '<kazhiyunnavarille>', '<paass>', '<leghayude>', '<mathsaramaayirunnu>', '<sanghadippikkaan>', '<kongressile>', '<aalathur>']
Expected: ['<ബൈറ്റ്>', '<ഇക്കണോമിക്സിൽ>', '<ചുറ്റിയുള്ള>', '<തുവ്വൂർ>', '<കാണാമെന്ന>', '<ആശുപത്രിയക്ക്>', '<പ്ലേറ്റോ>', '<വയറ്റിൽ>', '<ലൈബ്രറിയിൽ>', '<പറിച്ചുണക്കിയ>', '<ഗ്രന്ഥത്തിലെ>', '<നേ>', '<ഉൾപ്പെടുന്നതാണിത്>', '<ബരാക്ക്>', '<ഓവൻ>', '<ഇറാഖിലെ>', '<ഹനുമാനും>', '<ചൈനയിൽ>', '<ലാന്റിലും>', '<ഇലക്ട്രോ>', '<വ്യത്യസ്ഥതകളാണ്>', '<പ്രാതിനിധ്യ>', '<വരിയായി>', '<സഹനസമരവും>', '<വിസ്തീർണം>', '<കഴിയുന്നവരില്ലേ>', '<പാസ്സ്>', '<ലേഖയുടെ>', '<മത്സരമായിരുന്നു>', '<സംഘടിപ്

### Now with attention

In [46]:
class BahdanauAttention(torch.nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = torch.nn.Linear(hidden_size, hidden_size)
        self.Ua = torch.nn.Linear(hidden_size, hidden_size)
        self.Va = torch.nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = torch.nn.functional.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = torch.nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.dropout = torch.nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(0)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = torch.nn.functional.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [47]:
encoder = Encoder(input_size=latin_script.vocab_size, hidden_size=hidden_size, dropout_p=0).to(device)
attn_decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=local_script.vocab_size).to(device)

In [48]:
train(dataloader_val, encoder, attn_decoder, 10, print_every=1, plot_every=1, val_dataloader=dataloader_val)

0m 11s (- 1m 46s) (1 10%) Loss: 1.2378 Acc: 0.71 %
0m 23s (- 1m 33s) (2 20%) Loss: 0.8515 Acc: 0.71 %
0m 34s (- 1m 21s) (3 30%) Loss: 0.6889 Acc: 0.76 %
0m 46s (- 1m 10s) (4 40%) Loss: 0.4615 Acc: 3.34 %
0m 58s (- 0m 58s) (5 50%) Loss: 0.2854 Acc: 10.42 %
1m 9s (- 0m 46s) (6 60%) Loss: 0.2007 Acc: 19.65 %
1m 20s (- 0m 34s) (7 70%) Loss: 0.1592 Acc: 26.56 %
1m 31s (- 0m 22s) (8 80%) Loss: 0.1360 Acc: 32.18 %
1m 43s (- 0m 11s) (9 90%) Loss: 0.1163 Acc: 36.60 %
1m 54s (- 0m 0s) (10 100%) Loss: 0.1015 Acc: 41.46 %


[1.2377742053940892,
 0.8515051216818392,
 0.688946230802685,
 0.461546499049291,
 0.2854411626467481,
 0.2007488302187994,
 0.15915535757085308,
 0.1360222269431688,
 0.116289580357261,
 0.10148271830985323]

In [49]:

def convert_tensor_to_string(tensor, script):
    words = []
    for idx in tensor:
        word = []
        for i in idx:
            word.append(script.inx2char[i.item()])
            if i.item() == script.char2idx[END]:
                break
        words.append(''.join(word))
    return words

input_tensor, target_tensor = next(test_data)
encoder_outputs, encoder_hidden = encoder(input_tensor)
decoder_outputs, _, _ = attn_decoder(encoder_outputs, encoder_hidden)

_, topi = decoder_outputs.topk(1)
decoded_ids = topi.squeeze()

input_words, output_words = convert_tensor_to_string(input_tensor, latin_script), convert_tensor_to_string(decoded_ids, local_script)
expected_words = convert_tensor_to_string(target_tensor, local_script)

print('Input:', input_words)
print('Expected:', expected_words)
print('Predicted:', output_words)

matched_words = set(expected_words) & set(output_words)
print('Accuracy:  ', len(matched_words)/ len(expected_words))
print('Matched: ', matched_words)

Input: ['<gavanmentinu>', '<chikithsichumaattaam>', '<sadanam>', '<varikalaanu>', '<kombukalum>', '<kanjangad>', '<maidrid>', '<jorj>', '<manthrisabhaayogathine>', '<swadeshiyum>', '<anusarichum>', '<aswasthathayoyaanu>', '<faashisttaaya>', '<karnaadakathilum>', '<auak>', '<bora>', '<aashankaanimisham>', '<ruupappedunnathu>', '<chinayilum>', '<panjcha>', '<grafic>', '<vimanangale>', '<samarthamaaya>', '<dikkil>', '<adukkurippodeyaanu>', '<midhyayo>', '<thudarendennu>', '<aadyathil>', '<pappanum>', '<aduthethaanaakoo>', '<raashtreeyabhethamanye>', '<indirayude>']
Expected: ['<ഗവൺമെന്റിന്>', '<ചികിത്സിച്ചുമാറ്റാം>', '<സദനം>', '<വരികളാണ്>', '<കൊമ്പുകളും>', '<കാഞ്ഞങ്ങാട്>', '<മാഡ്രിഡ്>', '<ജോര്ജ്>', '<മന്ത്രിസഭായോഗത്തിനേ>', '<സ്വദേശിയും>', '<അനുസരിച്ചും>', '<അസ്വസ്ഥതയോയാണ്>', '<ഫാഷിസ്റ്റായ>', '<കർണാടകത്തിലും>', '<ഓക്ക്>', '<ബോറ>', '<ആശങ്കാനിമിഷം>', '<രൂപപ്പെടുന്നത്>', '<ചൈനയിലും>', '<പഞ്ച>', '<ഗ്രാഫിക്>', '<വിമാനങ്ങളെ>', '<സമർഥമായ>', '<ദിക്കിൽ>', '<അടുക്കുറിപ്പോടെയാണ്>', '<മിഥ്യയോ>', '<തുട