In [1]:
import torch
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
class Script:
    def __init__(self, script_name):
        self.script_name = script_name
        self.char2idx = {}
        self.inx2char = {}
        self.vocab_size = 0

    def create_vocab(self, char_list):
        for i, char in enumerate(char_list):
            self.char2idx[char] = i
            self.inx2char[i] = char
        self.vocab_size = len(char_list)
    
    def add_char(self, char):
        if char not in self.char2idx:
            self.char2idx[char] = self.vocab_size
            self.inx2char[self.vocab_size] = char
            self.vocab_size += 1
        else:
            print("Character already exists in the script")



In [3]:
import os
dataset_name = "aksharantar_sampled"
languages_dataset = os.listdir(dataset_name)
print(languages_dataset)

['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'kas', 'kok', 'mai', 'mal', 'mar', 'mni', 'ori', 'pan', 'san', 'sid', 'tam', 'tel', 'urd']


In [4]:
MAX_LENGTH=10
language = 'kan'
START='<'
END='>'
def load_dataset_csv(path):
    X, y = [], []
    with open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            line = line.strip().split(',')
            X.append(f'{START}{line[0]}{END}')
            y.append(f'{START}{line[1]}{END}')
    
    return X, y

list_files = os.listdir(f'{dataset_name}/{language}')
path = f'{dataset_name}/{language}'
X_test, y_test = load_dataset_csv(f'{path}/{list_files[0]}')


In [5]:
MAX_LENGTH = max([len(x) for x in X_test] + [len(y) for y in y_test])

unique_chars = set()
[unique_chars.update(list(x)) for x in y_test]
unique_chars = list(unique_chars)
unique_chars.sort()

local_script = Script(language)
local_script.create_vocab(unique_chars)
local_script.inx2char

unique_chars = set()
[unique_chars.update(list(x)) for x in X_test]
unique_chars = list(unique_chars)
unique_chars.sort()

latin_script = Script('latin')
latin_script.create_vocab(unique_chars)
# latin_script.inx2char


In [6]:
transliter_pairs = list(zip(X_test, y_test))

In [7]:
def get_dataloader(transliter_pairs, latin_script, local_script, batch_size=32):
    n = len(transliter_pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=int)
    output_ids = np.zeros((n, MAX_LENGTH), dtype=int)


    for idx, (latin, local) in enumerate(transliter_pairs):
        inp_ids = [latin_script.char2idx[c] for c in latin]
        out_ids = [local_script.char2idx[c] for c in local]
        input_ids[idx, :len(inp_ids)] = inp_ids
        output_ids[idx, :len(out_ids)] = out_ids

    
    

    dataset = torch.utils.data.TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(output_ids).to(device))
    sampler = torch.utils.data.RandomSampler(dataset)
    dataloader = torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataloader

In [8]:
dataloader = get_dataloader(transliter_pairs, latin_script, local_script, batch_size=32)

In [9]:
class Encoder(torch.nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = torch.nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [10]:
class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = torch.nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(0)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = torch.nn.functional.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = torch.nn.functional.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [11]:
hidden_size = 128
batch_size=32
encoder = Encoder(latin_script.vocab_size, hidden_size, dropout_p=0).to(device)
decoder = DecoderRNN(hidden_size, local_script.vocab_size).to(device)



In [13]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [22]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [23]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = torch.nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [25]:
train(dataloader, encoder, decoder, 100, print_every=1, plot_every=1)

# for i in range(1000):
#     losss = train_epoch(dataloader, encoder, decoder, torch.optim.Adam(encoder.parameters()), torch.optim.Adam(decoder.parameters()), torch.nn.NLLLoss())
#     print(losss)

0m 3s (- 5m 24s) (1 1%) 0.0248
0m 6s (- 5m 10s) (2 2%) 0.0120
0m 9s (- 5m 3s) (3 3%) 0.0099
0m 12s (- 4m 59s) (4 4%) 0.0121
0m 15s (- 4m 55s) (5 5%) 0.0584
0m 18s (- 4m 52s) (6 6%) 0.0675
0m 21s (- 4m 49s) (7 7%) 0.0245
0m 24s (- 4m 46s) (8 8%) 0.0117
0m 28s (- 4m 43s) (9 9%) 0.0079
0m 31s (- 4m 39s) (10 10%) 0.0065
0m 34s (- 4m 36s) (11 11%) 0.0057
0m 37s (- 4m 33s) (12 12%) 0.0054
0m 40s (- 4m 29s) (13 13%) 0.0051
0m 43s (- 4m 26s) (14 14%) 0.0048
0m 46s (- 4m 23s) (15 15%) 0.0047
0m 49s (- 4m 20s) (16 16%) 0.0046
0m 52s (- 4m 17s) (17 17%) 0.0153
0m 55s (- 4m 13s) (18 18%) 0.1508
0m 58s (- 4m 10s) (19 19%) 0.0717
1m 1s (- 4m 7s) (20 20%) 0.0321
1m 4s (- 4m 4s) (21 21%) 0.0166
1m 8s (- 4m 1s) (22 22%) 0.0095
1m 11s (- 3m 58s) (23 23%) 0.0067
1m 14s (- 3m 55s) (24 24%) 0.0057
1m 17s (- 3m 52s) (25 25%) 0.0051
1m 20s (- 3m 48s) (26 26%) 0.0049
1m 23s (- 3m 45s) (27 27%) 0.0046
1m 26s (- 3m 42s) (28 28%) 0.0043
1m 29s (- 3m 39s) (29 28%) 0.0041
1m 32s (- 3m 36s) (30 30%) 0.0040
1m 35s (

In [19]:
encoder_outputs, encoder_hidden = encoder(input_tensor)
decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden, target_tensor)



_, topi = decoder_outputs.topk(1)
decoded_ids = topi.squeeze()

decoded_input = []
for inp in input_tensor:
    for idx in inp:
        if idx.item() == END:
            break
        decoded_input.append(latin_script.inx2char[idx.item()])

print('Input:',''.join(decoded_input))

decoded_words = []
for idx in decoded_ids:
    for i in idx:
        if i.item() == END:
            break
        decoded_words.append(local_script.inx2char[i.item()])
    # if idx.item() == END:
    #     decoded_words.append('<EOS>')
    #     break
    # decoded_words.append(local_script.inx2char[idx.item()])
print('Predicted:',''.join(decoded_words))

actual_word =  []
for lbl in target_tensor:
    for i in lbl:
        if i.item() == END:
            break
        actual_word.append(local_script.inx2char[i.item()])

print('Label:', ''.join(actual_word))

Input: <meelmaige><<<<<<<<<<<<<<<<<<<mudisiddare><<<<<<<<<<<<<<<<<ganithashatradalli><<<<<<<<<<prastaavaneyannu><<<<<<<<<<<<beleh><<<<<<<<<<<<<<<<<<<<<<<beledaaga><<<<<<<<<<<<<<<<<<<sitrik><<<<<<<<<<<<<<<<<<<<<<badalaayisikondiddarinda><<<<nindisy><<<<<<<<<<<<<<<<<<<<<modhalindha><<<<<<<<<<<<<<<<<lot><<<<<<<<<<<<<<<<<<<<<<<<<vaasaviddaru><<<<<<<<<<<<<<<<rub><<<<<<<<<<<<<<<<<<<<<<<<<heggalikeyendae><<<<<<<<<<<<<vishvavidyanilayadalli><<<<<<vishvavyapiyagi><<<<<<<<<<<<<kalalu><<<<<<<<<<<<<<<<<<<<<<worldwide><<<<<<<<<<<<<<<<<<<enilla><<<<<<<<<<<<<<<<<<<<<<ariyada><<<<<<<<<<<<<<<<<<<<<jategoodi><<<<<<<<<<<<<<<<<<<vikary><<<<<<<<<<<<<<<<<<<<<<horaduttaliddevu><<<<<<<<<<<<gidamoolike><<<<<<<<<<<<<<<<<saamaanyagolisuvudakke><<<<<<pattiyalliruvudaralli><<<<<<<kurudara><<<<<<<<<<<<<<<<<<<<svacchagolisalendu><<<<<<<<<<isuzu><<<<<<<<<<<<<<<<<<<<<<<ed><<<<<<<<<<<<<<<<<<<<<<<<<<janapratinidhigalu><<<<<<<<<<embante><<<<<<<<<<<<<<<<<<<<
Predicted: <ಮೇಲ್ಮೈಗೆ><<<<<<<<<<<<<<<<<<<<ಮೂಡಿಸಿದ್ದಾರೆ><<<<<<<<<<

In [130]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')