# Preamble

Most of the code is taken or adapted from the PyTorch seq2seq translation tutorial: https://docs.pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#training-and-evaluating

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import pandas as pd

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

# Load Data
Data is from MCO2 (parallel corpora)

In [2]:
parallel_corpora = pd.read_csv('parallel_corpora.csv')

# Keep only the English/Filipino rows, whichever column each language is in.
english_then_filipino = (
    (parallel_corpora['language1'] == 'english')
    & (parallel_corpora['language2'] == 'filipino')
)
filipino_then_english = (
    (parallel_corpora['language2'] == 'english')
    & (parallel_corpora['language1'] == 'filipino')
)
parallel_corpora = parallel_corpora.loc[english_then_filipino | filipino_then_english]

In [3]:
# Preview the first five rows of the filtered corpus.
parallel_corpora.head(5)

Unnamed: 0.1,Unnamed: 0,language1,language2,book,chapter,verse,language1_text,language2_text
965446,965446,english,filipino,GEN,1,1,In the beginning God created the heaven and th...,"Nang pasimula, nilikha ng Diyos ang langit at ..."
965447,965447,english,filipino,GEN,1,2,"And the earth was without form, and void; and ...","Ang lupa ay walang anyo at walang laman, at bi..."
965448,965448,english,filipino,GEN,1,3,"And God said, Let there be light: and there wa...","At sinabi ng Diyos, ""Magkaroon ng liwanag,"" at..."
965449,965449,english,filipino,GEN,1,4,"And God saw the light, that it was good: and G...","Nakita ng Diyos na ang liwanag ay mabuti, at i..."
965450,965450,english,filipino,GEN,1,5,"And God called the light Day, and the darkness...","Tinawag ng Diyos ang liwanag na Araw, at ang k..."


In [4]:
# Characters that are treated as word separators: every occurrence is replaced
# by a space before the sentence is split.
_WORD_SEPARATOR_CHARS = ',.\'-"!;&$?()[]*«»¡¿/~='
_WORD_SEPARATOR_PATTERN = re.compile('[' + re.escape(_WORD_SEPARATOR_CHARS) + ']')


def sentence_to_words(x):
    """Split a sentence into a list of word tokens.

    The value is converted with ``str`` first, so non-string inputs are
    accepted. Each separator character listed in ``_WORD_SEPARATOR_CHARS`` is
    replaced by a space, and the text is then split on runs of whitespace
    (spaces, tabs and line breaks alike).

    Splitting on whitespace runs means no empty-string tokens are produced,
    whatever mix of spaces and newlines the sentence contains, and an empty or
    all-punctuation sentence yields an empty list.

    Args:
        x: The sentence to tokenise.

    Returns:
        list[str]: The words of the sentence, in order.
    """
    without_punctuation = _WORD_SEPARATOR_PATTERN.sub(' ', str(x))
    return without_punctuation.split()

# Tokenise both text columns and keep the word lists next to the raw sentences.
for text_column in ('language1_text', 'language2_text'):
    parallel_corpora[text_column + '_words'] = parallel_corpora[text_column].apply(sentence_to_words)

# Words Class
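Based on the PyTorch tutorial linked above, the `Language` class stores all of the words in one language's vocabulary, together with their integer indices and occurrence counts.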

In [5]:
# Indices reserved for the start-of-sentence and end-of-sentence markers.
start_token = 0
end_token = 1

class Language:
    """Vocabulary of one language: word <-> index maps plus word counts.

    Attributes:
        word_index_map: Maps each word to its integer index. ``'__SOS__'`` and
            ``'__EOS__'`` are pre-registered at ``start_token`` and ``end_token``.
        index_word_map: The inverse mapping, index -> word.
        word_counts: How many times each word has been passed to ``add_word``.
    """

    def __init__(self):
        self.word_index_map = {'__SOS__': start_token, '__EOS__': end_token}
        self.index_word_map = {start_token: '__SOS__', end_token: '__EOS__'}
        self.word_counts = {}

    def add_word(self, word: str):
        """Register one word, assigning it the next free index if it is new.

        Raises:
            ValueError: If ``word`` contains a space or a newline.
        """
        if ' ' in word or '\n' in word:
            raise ValueError('Word contains whitespace!')

        if word not in self.word_index_map:
            index = len(self.word_index_map)
            self.word_index_map[word] = index
            self.index_word_map[index] = word
            self.word_counts[word] = 1
        else:
            # The reserved tokens are in the index map but start without a
            # count, so fall back to 0 instead of assuming the key exists.
            self.word_counts[word] = self.word_counts.get(word, 0) + 1

    def add_sentence(self, sentence: str):
        """Tokenise ``sentence`` with ``sentence_to_words`` and add every word."""
        for word in sentence_to_words(sentence):
            self.add_word(word)

    def indices_from_sentence(self, sentence: str):
        """Return the vocabulary indices of the words in a raw sentence.

        Raises:
            KeyError: If a word of the sentence is not in the vocabulary.
        """
        return self.indices_from_sentence_array(sentence_to_words(sentence))

    def indices_from_sentence_array(self, sentence: list[str]):
        """Return the vocabulary indices of an already tokenised sentence.

        Raises:
            KeyError: If a word is not in the vocabulary; the message names it.
        """
        output = []
        for word in sentence:
            try:
                output.append(self.word_index_map[word])
            except KeyError:
                raise KeyError(f'Word {word!r} is not in the vocabulary') from None
        return output

    def tensors_from_sentence(self, sentence: str):
        """Encode a raw sentence as a ``(1, n_words + 1)`` long tensor ending in ``end_token``."""
        return self._indices_to_tensor(self.indices_from_sentence(sentence))

    def tensors_from_sentence_array(self, sentence: list[str]):
        """Encode a tokenised sentence as a ``(1, n_words + 1)`` long tensor ending in ``end_token``."""
        return self._indices_to_tensor(self.indices_from_sentence_array(sentence))

    def _indices_to_tensor(self, indices):
        """Append ``end_token`` and build a single-row long tensor on ``device``."""
        return torch.tensor(indices + [end_token], dtype=torch.long, device=device).view(1, -1)

def create_language_tensor_pair(left_language: Language, right_language: Language, left_sentence: str, right_sentence: str):
    """Encode one sentence per language and return the two tensors as a tuple.

    Each sentence is converted by its own language's ``tensors_from_sentence``;
    the left tensor comes first in the returned pair.
    """
    left_tensor = left_language.tensors_from_sentence(left_sentence)
    right_tensor = right_language.tensors_from_sentence(right_sentence)
    return left_tensor, right_tensor

def prepare_languages(corpus=None):
    """Build one ``Language`` vocabulary per language present in the corpus.

    A language may appear in the ``language1`` column, the ``language2``
    column, or both. Its sentences from both columns are added to the same
    ``Language`` object, so words seen in one column are not lost when the
    language also shows up in the other.

    Args:
        corpus: DataFrame with ``language1``, ``language2``,
            ``language1_text`` and ``language2_text`` columns. Defaults to the
            notebook-level ``parallel_corpora`` frame.

    Returns:
        dict: Maps each language name to its populated ``Language``.
    """
    if corpus is None:
        corpus = parallel_corpora

    languages = {}
    for language_column, text_column in (('language1', 'language1_text'),
                                         ('language2', 'language2_text')):
        for language in corpus[language_column].unique():
            # Reuse the vocabulary if this language was already seen in the
            # other column instead of replacing it with an empty one.
            if language not in languages:
                languages[language] = Language()
            texts = corpus.loc[corpus[language_column] == language, text_column]
            # np.unique de-duplicates and sorts the sentences.
            for sentence in np.unique(texts.values):
                languages[language].add_sentence(sentence)
    return languages

In [6]:
# Build the per-language vocabularies from the filtered corpus.
languages = prepare_languages()

In [7]:
class EncoderRNN(nn.Module):
    """GRU encoder: embeds token ids, applies dropout, and runs a GRU over them.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of the embeddings and of the GRU state.
        dropout_p: Dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of token ids.

        Returns:
            tuple: ``(output, hidden)`` exactly as returned by the GRU.
        """
        token_vectors = self.embedding(input)
        gru_output, final_hidden = self.gru(self.dropout(token_vectors))
        return gru_output, final_hidden

In [8]:
class DecoderRNN(nn.Module):
    """GRU decoder that emits ``max_length`` steps of log-probabilities.

    Args:
        hidden_size: Width of the embeddings and of the GRU state.
        output_size: Number of rows in the embedding table and width of the
            output layer.
        max_length: Number of decoding steps performed by ``forward``.
    """

    def __init__(self, hidden_size, output_size, max_length):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.max_length = max_length

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``max_length`` steps starting from ``start_token``.

        Args:
            encoder_outputs: Encoder outputs; only the batch size (dimension 0)
                and the device are read from it.
            encoder_hidden: Used as the initial hidden state of the GRU.
            target_tensor: Optional target ids indexed as ``[:, step]``. When
                given, the target at each step is fed as the next input
                (teacher forcing); otherwise the decoder feeds back its own
                most likely token.

        Returns:
            tuple: ``(log_probs, hidden, None)`` where ``log_probs`` is the
            log-softmax of the per-step outputs concatenated along dimension 1.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate the start tokens on the same device as the incoming tensors
        # rather than on a notebook-level global, so the module also works when
        # it is run on a device other than the one that global points to.
        decoder_input = torch.full(
            (batch_size, 1), start_token, dtype=torch.long, device=encoder_outputs.device
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(self.max_length):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None  # `None` keeps the return shape consistent in the training loop

    def forward_step(self, input, hidden):
        """Run one decoding step: embed, ReLU, GRU, then the output layer."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [9]:
# Number of tokens per sentence, on both sides of every pair.
for side in ('language1', 'language2'):
    parallel_corpora[f'{side}_text_word_count'] = parallel_corpora[f'{side}_text_words'].apply(len)

# Longest sentences first.
length_columns = ['language1_text_word_count', 'language2_text_word_count']
parallel_corpora.sort_values(by=length_columns, ascending=False)

Unnamed: 0.1,Unnamed: 0,language1,language2,book,chapter,verse,language1_text,language2_text,language1_text_words,language2_text_words,language1_text_word_count,language2_text_word_count
978272,978272,english,filipino,EST,8,9,Then were the king's scribes called at that ti...,Ang mga kalihim ng hari ay ipinatawag nang pan...,"[Then, were, the, king, s, scribes, called, at...","[Ang, mga, kalihim, ng, hari, ay, ipinatawag, ...",91,93
982977,982977,english,filipino,JER,21,7,"And afterward, saith the LORD, I will deliver ...","At pagkatapos, sabi ng Panginoon, ibibigay ko ...","[And, afterward, saith, the, LORD, I, will, de...","[At, pagkatapos, sabi, ng, Panginoon, ibibigay...",83,76
985194,985194,english,filipino,EZK,46,9,But when the people of the land shall come bef...,"""Kapag ang bayan ng lupain ay haharap sa Pangi...","[But, when, the, people, of, the, land, shall,...","[Kapag, ang, bayan, ng, lupain, ay, haharap, s...",81,54
975424,975424,english,filipino,2KI,16,15,"And king Ahaz commanded Urijah the priest, say...",At inutusan ni Haring Ahaz ang paring si Urias...,"[And, king, Ahaz, commanded, Urijah, the, prie...","[At, inutusan, ni, Haring, Ahaz, ang, paring, ...",80,102
971481,971481,english,filipino,JOS,8,33,"And all Israel, and their elders, and officers...","At ang buong Israel, maging dayuhan o katutubo...","[And, all, Israel, and, their, elders, and, of...","[At, ang, buong, Israel, maging, dayuhan, o, k...",80,80
...,...,...,...,...,...,...,...,...,...,...,...,...
978073,978073,english,filipino,NEH,12,3,"Shechaniah, Rehum, Meremoth,","Shecanias, Rehum, Meremot;","[Shechaniah, Rehum, Meremoth]","[Shecanias, Rehum, Meremot]",3,3
978074,978074,english,filipino,NEH,12,4,"Iddo, Ginnetho, Abijah,","Iddo, Ginetoi, Abias;","[Iddo, Ginnetho, Abijah]","[Iddo, Ginetoi, Abias]",3,3
978075,978075,english,filipino,NEH,12,5,"Miamin, Maadiah, Bilgah,","Mijamin, Maadias, Bilga;","[Miamin, Maadiah, Bilgah]","[Mijamin, Maadias, Bilga]",3,3
990086,990086,english,filipino,JHN,11,35,Jesus wept.,Umiyak si Jesus.,"[Jesus, wept]","[Umiyak, si, Jesus]",2,3


In [10]:
# Disabled alternative: derive max_length from the data instead of hard-coding it.
# As written it refers to `pairs`, which is only created in a later cell.
#max_length = max(
#    pairs['language1_text'].apply(lambda x: x.count(' ') + 1).max(),
#    pairs['language2_text'].apply(lambda x: x.count(' ') + 1).max()
#)
# Fixed sequence length in tokens: the id matrices are padded to this width and
# the decoder is constructed with it as its number of decoding steps.
max_length = 200

In [11]:
# Each sequence gets an end token appended before it is written into a
# max_length-wide row, so at most max_length - 1 words fit on either side.
max_words = max_length - 1

pairs = parallel_corpora.loc[
    (parallel_corpora['language1'] == 'english')
      & (parallel_corpora['language2'] == 'filipino')
      & (parallel_corpora['language1_text_word_count'] <= max_words)
      & (parallel_corpora['language2_text_word_count'] <= max_words),
    ['language1_text', 'language2_text']
]

In [12]:
# Preview the first five selected sentence pairs.
pairs.head(5)

Unnamed: 0,language1_text,language2_text
965446,In the beginning God created the heaven and th...,"Nang pasimula, nilikha ng Diyos ang langit at ..."
965447,"And the earth was without form, and void; and ...","Ang lupa ay walang anyo at walang laman, at bi..."
965448,"And God said, Let there be light: and there wa...","At sinabi ng Diyos, ""Magkaroon ng liwanag,"" at..."
965449,"And God saw the light, that it was good: and G...","Nakita ng Diyos na ang liwanag ay mabuti, at i..."
965450,"And God called the light Day, and the darkness...","Tinawag ng Diyos ang liwanag na Araw, at ang k..."


In [13]:
# Number of sentence pairs that passed the filter.
len(pairs)

29184

In [14]:
# One zero-initialised row of max_length token ids per sentence pair.
n = len(pairs)
id_matrix_shape = (n, max_length)
input_ids = np.zeros(id_matrix_shape, dtype=np.int32)
target_ids = np.zeros(id_matrix_shape, dtype=np.int32)

In [15]:
# Write each pair's token ids, followed by the end token, into the start of its
# row; the rest of the row keeps its initial zeros.
for row_number, row in enumerate(pairs.itertuples()):
    source_ids = languages['english'].indices_from_sentence(row.language1_text) + [end_token]
    reference_ids = languages['filipino'].indices_from_sentence(row.language2_text) + [end_token]
    input_ids[row_number, :len(source_ids)] = source_ids
    target_ids[row_number, :len(reference_ids)] = reference_ids

In [16]:
train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                           torch.LongTensor(target_ids).to(device))

train_sampler = RandomSampler(train_data)
# The dataset tensors already live on `device`, so batches are just slices of
# them. Load in the main process (num_workers=0): serving CUDA tensors from
# DataLoader worker processes is discouraged by PyTorch and gives no speed-up
# when there is nothing left to load or transfer.
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32, num_workers=0)

In [None]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one pass over ``dataloader`` and return the mean batch loss.

    For every ``(input, target)`` batch the gradients are cleared, the encoder
    and decoder are run with the target passed to the decoder, the loss is
    computed over the flattened outputs and targets, and both optimizers take a
    step. A progress line is printed after every tenth batch.

    Returns:
        float: Sum of the batch losses divided by ``len(dataloader)``.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    running_loss = 0

    for batch_number, (input_tensor, target_tensor) in enumerate(dataloader, start=1):
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten (batch, steps, vocab) against (batch, steps) for the criterion.
        vocab_size = decoder_outputs.size(-1)
        loss = criterion(decoder_outputs.view(-1, vocab_size), target_tensor.view(-1))
        loss.backward()
        if batch_number % 10 == 0:
            print(f'Processed {batch_number} batches')

        for optimizer in optimizers:
            optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

In [18]:
import time
import math

def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``.

    Both parts are rendered with ``%d``, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Return elapsed and estimated remaining time as ``'<elapsed> (- <remaining>)'``.

    Args:
        since: Start timestamp, as returned by ``time.time()``.
        percent: Fraction of the work completed so far; the total duration is
            extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [19]:
import matplotlib.pyplot as plt
# Use matplotlib's non-interactive Agg backend.
# NOTE(review): with Agg, figures are rendered off-screen; showPlot does not
# save its figure, so confirm the loss plot is actually visible anywhere.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot ``points`` against their index with y-axis ticks every 0.2.

    Only one figure is created per call; there is no separate ``plt.figure()``
    call, which would leave an extra, empty figure open each time.

    Args:
        points: Sequence of values to plot.

    Returns:
        The matplotlib figure containing the plot, so that callers can display
        or save it.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)
    return fig

In [20]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.003,
               print_every=100, plot_every=100):
    """Train the encoder/decoder pair for ``n_epochs`` epochs.

    Each network gets its own Adam optimizer and the loss is ``nn.NLLLoss``.
    Every ``print_every`` epochs a progress line with timing and the average
    loss since the last report is printed; every ``plot_every`` epochs the
    average loss since the last sample is recorded. The recorded averages are
    passed to ``showPlot`` once training finishes.
    """
    started_at = time.time()
    averaged_losses = []
    loss_since_print = 0  # cleared after every progress report
    loss_since_plot = 0   # cleared after every plot sample

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        epoch_loss = train_epoch(train_dataloader, encoder, decoder,
                                 encoder_optimizer, decoder_optimizer, criterion)
        loss_since_print += epoch_loss
        loss_since_plot += epoch_loss

        if epoch % print_every == 0:
            average_for_report = loss_since_print / print_every
            loss_since_print = 0
            progress = epoch / n_epochs
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                        epoch, progress * 100, average_for_report))

        if epoch % plot_every == 0:
            averaged_losses.append(loss_since_plot / plot_every)
            loss_since_plot = 0

    showPlot(averaged_losses)

In [21]:
def evaluate(encoder, decoder, sentence, input_lang: Language, output_lang: Language):
    """Translate one sentence and return the decoded words.

    The sentence is encoded with ``input_lang``, run through the encoder and
    the decoder (without a target, so the decoder feeds back its own
    predictions), and the most likely token of every step is mapped back to a
    word with ``output_lang``. Decoding stops at the first ``end_token``, which
    is reported as ``'<EOS>'``.

    Args:
        encoder: Callable returning ``(outputs, hidden)`` for the input tensor.
        decoder: Callable returning ``(outputs, hidden, attention)``.
        sentence: Raw sentence in the input language.
        input_lang: Vocabulary used to encode ``sentence``.
        output_lang: Vocabulary used to turn predicted ids back into words.

    Returns:
        tuple: ``(decoded_words, decoder_attn)`` where ``decoder_attn`` is the
        third value returned by the decoder.
    """
    with torch.no_grad():
        input_tensor = input_lang.tensors_from_sentence(sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        # Flatten explicitly instead of squeeze(): squeeze() would collapse a
        # single-step result to a 0-d tensor that cannot be iterated. tolist()
        # also fetches all ids at once rather than one .item() call per token.
        decoded_ids = topi.reshape(-1).tolist()

    decoded_words = []
    for token_id in decoded_ids:
        if token_id == end_token:
            decoded_words.append('<EOS>')
            break
        decoded_words.append(output_lang.index_word_map[token_id])
    return decoded_words, decoder_attn

In [22]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly chosen pairs next to the model's translation.

    For each sample three lines are printed: ``>`` the source sentence, ``=``
    the reference sentence, and ``<`` the model output, followed by a blank line.
    """
    for _ in range(n):
        sample = random.choice(pairs.values)
        source_sentence = sample[0]
        reference_sentence = sample[1]
        print('>', source_sentence)
        print('=', reference_sentence)
        translated_words, _attention = evaluate(
            encoder, decoder, source_sentence, languages['english'], languages['filipino']
        )
        print('<', ' '.join(translated_words))
        print('')

In [25]:
hidden_size = 256

# The vocabulary sizes fix the embedding and output dimensions of the networks.
english_vocab_size = len(languages['english'].index_word_map)
filipino_vocab_size = len(languages['filipino'].index_word_map)

encoder = EncoderRNN(english_vocab_size, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, filipino_vocab_size, max_length).to(device)

print(len(pairs))
# 60 epochs, reporting and sampling the loss after every epoch.
train(train_dataloader, encoder, decoder, 60, print_every=1, plot_every=1)

29184


KeyboardInterrupt: 

In [None]:
# Put both networks in evaluation mode (this turns off the encoder's dropout)
# before sampling translations.
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

In [None]:
# state_dict must be called: passing the bound method would pickle the method
# object (and with it the whole module) instead of the parameter dictionary.
torch.save(encoder.state_dict(), 'encoder.model')

In [None]:
# state_dict must be called: passing the bound method would pickle the method
# object (and with it the whole module) instead of the parameter dictionary.
torch.save(decoder.state_dict(), 'decoder.model')

In [None]:
# Translate a single hand-written sentence and show the result.
# NOTE(review): the sentence looks like Filipino, but it is tokenised with
# languages['english'] and decoded with languages['filipino']; any word missing
# from the English vocabulary raises KeyError. Confirm the intended direction.
output_words, _ = evaluate(encoder, decoder, "Sila ay galing sa Laguna", languages['english'], languages['filipino'])
output_sentence = ' '.join(output_words)
output_sentence