# Preamble

Most code is taken from/adapted from: https://docs.pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#training-and-evaluating

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Load Data
Data is from MCO2 (parallel corpora)

In [2]:
parallel_corpora = pd.read_csv('parallel_corpora.csv')

In [3]:
parallel_corpora.head(5)

Unnamed: 0.1,Unnamed: 0,language1,language2,book,chapter,verse,language1_text,language2_text
0,0,bantoanon,bicol,GEN,1,1,"Sa kauna-unahan, gingtuga it Dios kag langit a...","Sa kapinonan, kan lalangon nin Dios an kalangi..."
1,1,bantoanon,bicol,GEN,1,2,"It kato, kag kalibutan ay waya it korte ag way...",an kinaban mayo nin ano man na porma saka laog...
2,2,bantoanon,bicol,GEN,1,3,"Ag nagsiling kag Dios, ""Magkainggwa't hadag!"" ...","Nagboot an Dios, ""Magkaigwa nin liwanag,"" asin..."
3,3,bantoanon,bicol,GEN,1,4,"Nakita it Dios nak maado kag hadag, ag Ida ing...",Nahiling nin Dios na marahay an liwanag. Isinu...
4,4,bantoanon,bicol,GEN,1,5,"Gingtawag it Dios kag hadag nak ""adlaw"" ag kag...","dangan inapod niyang ""Aldaw"" an liwanag, asin ..."


In [4]:
sentence_to_words = (lambda x: str(x)
                                 .replace(',', ' ')
                                 .replace('.', ' ')
                                 .replace('\'', ' ')
                                 .replace('-', ' ')
                                 .replace('"', ' ')
                                 .replace('!', ' ')
                                 .replace(';', ' ')
                                 .replace('-', ' ')
                                 .replace('&', ' ')
                                 .replace('$', ' ')
                                 .replace('?', ' ')
                                 .replace('(', ' ')
                                 .replace(')', ' ')
                                 .replace('[', ' ')
                                 .replace(']', ' ')
                                 .replace('*', ' ')
                                 .replace('«', ' ')
                                 .replace('»', ' ')
                                 .replace('¡', ' ')
                                 .replace('¿', ' ')
                                 .replace('/', ' ')
                                 .replace('~', ' ')
                                 .replace('=', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('  ', ' ')
                                 .replace('\r\n', '\n')
                                 .replace('\n', ' ')
                                 .strip()
                                 .split(' '))

parallel_corpora['language1_text_words'] = parallel_corpora['language1_text'].apply(sentence_to_words)
parallel_corpora['language2_text_words'] = parallel_corpora['language2_text'].apply(sentence_to_words)

# Words Class
Based on PyTorch article linked above, this stores all of the words in 

In [5]:
start_token = 0
end_token = 1

class Language:
    def __init__(self):
        self.word_index_map = {'__SOS__': 0, '__EOS__': 1}
        self.index_word_map = {0: '__SOS__', 1: '__EOS__'}
        self.word_counts = {}
    
    def add_word(self, word: str):
        if ' ' in word or '\n' in word:
            raise Exception('Word contains whitespace!')
        
        if word not in self.word_index_map:
            index = len(self.word_index_map)
            self.word_index_map[word] = index
            self.index_word_map[index] = word
            self.word_counts[word] = 1
        else:
            self.word_counts[word] += 1

    def add_sentence(self, sentence: str):
        #for word in sentence.replace('\r\n', '\n').replace('\n', ' ').split(' '):
        for word in sentence_to_words(sentence):
            self.add_word(word)
    
    def indices_from_sentence(self, sentence: str):
        output = []
        #for word in sentence.replace('\r\n', '\n').replace('\n', ' ').split(' '):
        for word in sentence_to_words(sentence):
            output.append(self.word_index_map[word])
        print(output)
        return output
    
    def indices_from_sentence_array(self, sentence: list[str]):
        output = []
        for word in sentence:
            output.append(self.word_index_map[word])
        return output
    
    def tensors_from_sentence(self, sentence: str):
        indices = self.indices_from_sentence(sentence)
        indices.append(end_token)
        return torch.tensor(indices, dtype=torch.long, device=device).view(1, -1)
    
    def tensors_from_sentence_array(self, sentence: list[str]):
        indices = self.indices_from_sentence_array(sentence)
        indices.append(end_token)
        return torch.tensor(indices, dtype=torch.long, device=device).view(1, -1)

def create_language_tensor_pair(left_language: Language, right_language: Language, left_sentence: str, right_sentence: str):
    return (left_language.tensors_from_sentence(left_sentence), right_language.tensors_from_sentence(right_sentence))

def prepare_languages():
    languages = {}
    for language in parallel_corpora['language1'].unique():
        languages[language] = Language()
        left_texts = parallel_corpora.loc[parallel_corpora['language1'] == language]['language1_text']
        right_texts = parallel_corpora.loc[parallel_corpora['language1'] == language]['language2_text']
        sentences = np.unique(np.concat([left_texts.values, right_texts.values]))
        for sentence in sentences:
            languages[language].add_sentence(sentence)
    return languages

In [6]:
languages = prepare_languages()

In [7]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [8]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, max_length):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.max_length = max_length

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(start_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(self.max_length):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [9]:
parallel_corpora['language1_text_word_count'] = parallel_corpora['language1_text_words'].apply(len)
parallel_corpora['language2_text_word_count'] = parallel_corpora['language2_text_words'].apply(len)

parallel_corpora.sort_values(by=['language1_text_word_count', 'language2_text_word_count'], ascending=False)

Unnamed: 0.1,Unnamed: 0,language1,language2,book,chapter,verse,language1_text,language2_text,language1_text_words,language2_text_words,language1_text_word_count,language2_text_word_count
188434,188434,bicol,cebuano,NUM,7,12-83,Ini an orden kan aroaldaw nindang pagdara kan ...,Ang nagdala sa iyang halad sa unang adlaw mao ...,"[Ini, an, orden, kan, aroaldaw, nindang, pagda...","[Ang, nagdala, sa, iyang, halad, sa, unang, ad...",22392,1951
281130,281130,bicol,ilokano,NUM,7,12-83,Ini an orden kan aroaldaw nindang pagdara kan ...,Ket ti nangidatag iti datonna iti immuna nga a...,"[Ini, an, orden, kan, aroaldaw, nindang, pagda...","[Ket, ti, nangidatag, iti, datonna, iti, immun...",22392,1899
309418,309418,bicol,ilonggo,NUM,7,12-83,Ini an orden kan aroaldaw nindang pagdara kan ...,Ang nagdolot sang iya halad sa nahauna nga adl...,"[Ini, an, orden, kan, aroaldaw, nindang, pagda...","[Ang, nagdolot, sang, iya, halad, sa, nahauna,...",22392,1824
370329,370329,bicol,pampanga,NUM,7,12-83,Ini an orden kan aroaldaw nindang pagdara kan ...,"King mumunang aldo i Nason, ing anak nang Amin...","[Ini, an, orden, kan, aroaldaw, nindang, pagda...","[King, mumunang, aldo, i, Nason, ing, anak, na...",22392,1737
252944,252944,bicol,filipino,NUM,7,12-83,Ini an orden kan aroaldaw nindang pagdara kan ...,At ang naghandog ng kanyang alay nang unang ar...,"[Ini, an, orden, kan, aroaldaw, nindang, pagda...","[At, ang, naghandog, ng, kanyang, alay, nang, ...",22392,1719
...,...,...,...,...,...,...,...,...,...,...,...,...
2415670,2415670,pangasinan,waray,1CH,6,5,"Buki, Uzi,","Boci, Uzi,","[Buki, Uzi]","[Boci, Uzi]",2,2
2415671,2415671,pangasinan,waray,1CH,6,6,"Zeraias, Meraiot,","Saraias, Merayot,","[Zeraias, Meraiot]","[Saraias, Merayot]",2,2
2415672,2415672,pangasinan,waray,1CH,6,7,"Amarias, Ahitob,","Amarias, Aguitob,","[Amarias, Ahitob]","[Amarias, Aguitob]",2,2
2415673,2415673,pangasinan,waray,1CH,6,8,"Sadoc, Ahimaaz,","Sadoc, Aquimas,","[Sadoc, Ahimaaz]","[Sadoc, Aquimas]",2,2


In [41]:
#max_length = max(
#    pairs['language1_text'].apply(lambda x: x.count(' ') + 1).max(),
#    pairs['language2_text'].apply(lambda x: x.count(' ') + 1).max()
#)
max_length = 200

In [42]:
pairs = parallel_corpora.loc[
    (parallel_corpora['language1'] == 'english')
      & (parallel_corpora['language2'] == 'filipino')
      & (parallel_corpora['language1_text_word_count'] <= max_length - 1)
      & (parallel_corpora['language2_text_word_count'] <= max_length)
    ][['language1_text', 'language2_text']]

In [38]:
pairs.head(5)

Unnamed: 0,language1_text,language2_text
965446,In the beginning God created the heaven and th...,"Nang pasimula, nilikha ng Diyos ang langit at ..."
965447,"And the earth was without form, and void; and ...","Ang lupa ay walang anyo at walang laman, at bi..."
965448,"And God said, Let there be light: and there wa...","At sinabi ng Diyos, ""Magkaroon ng liwanag,"" at..."
965449,"And God saw the light, that it was good: and G...","Nakita ng Diyos na ang liwanag ay mabuti, at i..."
965450,"And God called the light Day, and the darkness...","Tinawag ng Diyos ang liwanag na Araw, at ang k..."


In [43]:
len(pairs)

29184

In [14]:
n = len(pairs)
input_ids = np.zeros((n, max_length), dtype=np.int32)
target_ids = np.zeros((n, max_length), dtype=np.int32)

In [15]:
for idx, tup in enumerate(pairs.itertuples()):
    inp_ids = languages['english'].indices_from_sentence(tup.language1_text)
    tgt_ids = languages['filipino'].indices_from_sentence(tup.language2_text)
    inp_ids.append(end_token)
    tgt_ids.append(end_token)
    input_ids[idx, :len(inp_ids)] = inp_ids
    target_ids[idx, :len(tgt_ids)] = tgt_ids

[4194, 29678, 32706, 29879, 59916, 29678, 39917, 29712, 29678, 30108]
[12110, 5673, 17057, 113, 157, 4, 270, 126, 4, 417]
[29702, 29879, 29745, 39958, 30022, 28436, 59982, 29712, 30022, 29705, 33099]
[135, 729, 113, 157, 19725, 113, 2499, 126, 16848, 113, 2499]
[29702, 29879, 22056, 29678, 33099, 29719, 3691, 29705, 59988, 29712, 29879, 36488, 29678, 33099, 29723, 29678, 33097]
[3806, 113, 157, 43, 4, 2499, 439, 2952, 126, 58660, 113, 157, 4, 2499, 13, 6162]
[29702, 29879, 29885, 29678, 59923, 59924, 29702, 29678, 59688, 29712, 29678, 59046, 30021, 29678, 33439, 3285]
[26759, 113, 157, 4, 1859, 43, 3748, 44650, 113, 2303, 126, 16848, 113, 2814, 580, 7650, 438]
[29702, 29678, 59688, 29712, 29678, 59046, 30021, 29678, 34178, 3285]
[44650, 113, 2303, 126, 16848, 113, 2814, 580, 7652, 438]
[29702, 29879, 32575, 29676, 90, 29678, 59923, 29680, 29678, 39917, 76, 33770, 33099, 30107, 29678, 30108]
[2450, 114, 592, 4, 897, 113, 157, 13, 1859, 113, 270, 437, 61821, 13, 337, 113, 417]
[29702, 29

In [16]:
train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

In [17]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [18]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [20]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [19]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [21]:
def evaluate(encoder, decoder, sentence, input_lang: Language, output_lang: Language):
    with torch.no_grad():
        input_tensor = input_lang.tensors_from_sentence(sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == end_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index_word_map[idx.item()])
    return decoded_words, decoder_attn

In [22]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs.values)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], languages['english'], languages['filipino'])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [23]:
random.choice(pairs.values)

array(['shouldest not thou also have had compassion on thy fellowservant, even as I had pity on thee?',
       "Hindi ba dapat kang nahabag sa iyong kapwa alipin, kung paanong nahabag ako sa iyo?'"],
      dtype=object)

In [25]:
hidden_size = 128
batch_size = 32

#print(len(languages['english'].index_word_map), len(languages['filipino'].index_word_map))

encoder = EncoderRNN(len(languages['english'].index_word_map) - 2, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, len(languages['filipino'].index_word_map) - 2, max_length).to(device)
     
print(len(pairs))
train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

8718
2m 47s (- 41m 49s) (5 6%) 4.1196
5m 38s (- 39m 28s) (10 12%) 2.9331
8m 27s (- 36m 40s) (15 18%) 2.3573
11m 16s (- 33m 50s) (20 25%) 1.9372
14m 5s (- 30m 59s) (25 31%) 1.6199
16m 53s (- 28m 9s) (30 37%) 1.3800
19m 42s (- 25m 20s) (35 43%) 1.1962
22m 31s (- 22m 31s) (40 50%) 1.0502
25m 19s (- 19m 41s) (45 56%) 0.9330
28m 8s (- 16m 52s) (50 62%) 0.8365
30m 56s (- 14m 3s) (55 68%) 0.7538
33m 45s (- 11m 15s) (60 75%) 0.6853
36m 33s (- 8m 26s) (65 81%) 0.6245
39m 22s (- 5m 37s) (70 87%) 0.5724
42m 10s (- 2m 48s) (75 93%) 0.5270
44m 59s (- 0m 0s) (80 100%) 0.4854


In [26]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> I have seen the travail, which God hath given to the sons of men to be exercised in it. 
= Aking nakita ang gawain na ibinigay ng Diyos sa mga anak ng mga tao upang pagkaabalahan.
[29739, 30017, 30097, 29678, 34454, 29926, 29879, 29749, 29821, 76, 29678, 39905, 29680, 30091, 76, 28436, 60200, 90, 3691]
< Alam kong ng mga magulang sa lahat ng mga diyos ang mga iyon ayon sa akin ang mga lalaki ng Panginoon

> Let not thine heart envy sinners: But be thou in the fear of the LORD all the day long.
= Huwag mainggit ang iyong puso sa mga makasalanan, kundi magpatuloy ka sa takot sa Panginoon sa buong araw.
[39958, 29932, 41505, 32603, 34047, 117990, 29924, 28436, 33407, 90, 29678, 34042, 29680, 29678, 29716, 30007, 29678, 3285, 12895]
< Huwag mainggit ng Panginoon ang kanyang bayan at hindi ka matatakot sa harapan ng Panginoon <EOS>

> and Hananiah, and Elam, and Antothijah, 
= Hananias, Belam, Antotias;
[29712, 39730, 29712, 1072, 29712, 243556]
< ang kulay libong libong Zacarias <EOS>

>

In [31]:
output_words, _ = evaluate(encoder, decoder, "The LORD said", languages['english'], languages['filipino'])
output_sentence = ' '.join(output_words)
output_sentence

[30897, 29716, 29745]


'Ganito ang lumalapit sa Panginoon ng pitumpung libong awit <EOS>'