In [1]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch import optim
import numpy as np

import random
import os

seed = 10

random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

In [2]:
TEST_FILE = "data/test.txt"
TRAIN_FILE = "data/train.txt"
WHOLE_FILE = "data/whole.txt"
F_VOCAB_FILE = "data/vocab.f.txt"
Q_VOCAB_FILE = "data/vocab.q.txt"

In [3]:
class LSTM(nn.Module):
    def __init__(self, opt):
        super(LSTM, self).__init__()
        self.opt = opt
        self.i2h = nn.Linear(opt.rnn_size, 4 * opt.rnn_size)
        self.h2h = nn.Linear(opt.rnn_size, 4 * opt.rnn_size)
        
    def forward(self, x, prev_c, prev_h):
        gates = self.i2h(x) + self.h2h(prev_h)
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = torch.tanh(cellgate)
        outgate = torch.sigmoid(outgate)
        cy = (forgetgate * prev_c) + (ingate * cellgate)
        hy = outgate * torch.tanh(cy)  # n_b x hidden_dim
        return cy, hy


In [4]:
class EncoderRNN(nn.Module):
    def __init__(self, opt, input_size):
        super(EncoderRNN, self).__init__()
        self.opt = opt
        self.hidden_size = opt.rnn_size
        self.embedding = nn.Embedding(input_size, self.hidden_size)
        self.lstm = LSTM(self.opt)
        self.__initParameters()
        
    def __initParameters(self):
        for name, param in self.named_parameters():
            if param.requires_grad:
                init.uniform_(param, -opt.init_weight, opt.init_weight)
                
    def forward(self, input_src, prev_c, prev_h):
        src_emb = self.embedding(input_src) # batch_size x src_length x emb_size
        prev_cy, prev_hy = self.lstm(src_emb, prev_c, prev_h)
        return prev_cy, prev_hy

In [5]:
class DecoderRNN(nn.Module):
    def __init__(self, opt, output_size):
        super(DecoderRNN, self).__init__()
        self.opt = opt
        self.hidden_size = opt.rnn_size

        self.embedding = nn.Embedding(output_size, self.hidden_size)
        self.lstm = LSTM(self.opt)
        self.linear = nn.Linear(self.hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.__initParameters()

    def __initParameters(self):
        for name, param in self.named_parameters():
            if param.requires_grad:
                init.uniform_(param, -opt.init_weight, opt.init_weight)
                
    def forward(self, input, prev_c, prev_h):
        output = self.embedding(input)
        next_c, next_h = self.lstm(output, prev_c, prev_h)
        h2y = self.linear(next_h)
        pred = self.softmax(h2y)
        return pred, next_c, next_h

In [6]:
class Options:
    def __init__(self):
        self.rnn_size = 50
        self.dropout = False
        self.init_weight = 0.08
        self.decay_rate = 0.985
        self.learning_rate = 0.01
        self.plot_every = 50
        self.print_every = 50
        
opt = Options()

In [7]:
def read_data(fh):
    for line in fh:
        sentence, lf = line.strip().split("\t")
        sentence = sentence.split()
        lf = lf.split()
        yield sentence, lf

def read_vocab(filename):
    t2i = {"<s>": 0, "</s>":1, "UNK": 2}
    with open(filename) as target:
        for line in target:
            token = line.strip().split()[0]
            if token not in t2i:
                t2i[token] = len(t2i)
    return t2i

def is_equal(gold, predictions):
    total_correct = 0.0
    if len(gold) == len(predictions):
        equal = True
        for g, p in zip(gold, predictions):
            if g != p:
                equal = False
        return equal
    return False

In [8]:
def preprare_data(file_name):
    shuffledData = None
    with open(TRAIN_FILE, 'r') as train:
        shuffledData = list(read_data(train))
        random.shuffle(shuffledData)
    sentence_index_tensors = []
    form_index_tensors = []
    for sentence in shuffledData:
        text_tensor = torch.zeros((1, len(sentence[0]) + 2), dtype=torch.long)
        text_tensor[0][0] = w2i["<s>"]
        for idx, word in enumerate(sentence[0]):
            word_index = w2i[word] if word in w2i else w2i["UNK"]
            text_tensor[0][idx+1] = word_index
        text_tensor[0][-1] = w2i["</s>"]
        sentence_index_tensors.append(text_tensor)
        form_tensor = torch.zeros((1, len(sentence[1]) + 2), dtype=torch.long)
        form_tensor[0][0] = lf2i["<s>"]
        for idx, form in enumerate(sentence[1]):
            form_index = lf2i[form] if form in lf2i else lf2i["UNK"]
            form_tensor[0][idx+1] = form_index
        form_tensor[0][-1] = lf2i["</s>"]
        form_index_tensors.append(form_tensor)
    return shuffledData, sentence_index_tensors, form_index_tensors

In [9]:
def train(opt, criterion, encoder_optimizer, decoder_optimizer, encoder, decoder, s1, f1):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    c = torch.zeros((1, opt.rnn_size), dtype=torch.float, requires_grad=True)
    h = torch.zeros((1, opt.rnn_size), dtype=torch.float, requires_grad=True)
    for i in range(s1.size(1)):
        c, h = encoder(s1[:, i], c, h)

    #for dec_in in f1:
    loss = 0
    for i in range(f1.size(1)-1):
        pred, c, h = decoder(f1[:, i], c, h)
        loss += criterion(pred, f1[:, i+1])
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss

In [10]:
def predict(opt, s1, lf2i, encoder, decoder):
    c = torch.zeros((1, opt.rnn_size), dtype=torch.float, requires_grad=True)
    h = torch.zeros((1, opt.rnn_size), dtype=torch.float, requires_grad=True)

    for i in range(s1.size(1)):
        c, h = encoder(s1[:, i], c, h)

    prev = torch.tensor([lf2i['<s>']], dtype=torch.long)
    predicted_form = []
    counter = 0
    while True:
        counter += 1
        pred, c, h = decoder(prev, c, h)
        form_id = pred.argmax().item()
        prev = torch.tensor([form_id], dtype=torch.long)
        if form_id == lf2i["</s>"] or counter >= 100:
            break
        predicted_form.append(form_id)
    return predicted_form

In [11]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker


def showPlot(points, fig_name):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt.savefig("{}.png".format(fig_name))

In [12]:
def train_and_test(epoch_num, directory):
    train_data, sentence_index_tensors_train, form_index_tensors_train = preprare_data(TRAIN_FILE)
    test_data, sentence_index_tensors_test, form_index_tensors_test = preprare_data(TEST_FILE)
    
    encoder = EncoderRNN(opt, len(w2i))
    decoder = DecoderRNN(opt, len(lf2i))

    optim_state = {"learningRate" : opt.learning_rate, "alpha" :  opt.decay_rate}
    encoder_optimizer = optim.RMSprop(encoder.parameters(),  lr=optim_state["learningRate"], alpha=optim_state["alpha"])
    decoder_optimizer = optim.RMSprop(decoder.parameters(),  lr=optim_state["learningRate"], alpha=optim_state["alpha"])
    criterion = nn.NLLLoss(ignore_index=0)

    losses = []
    for epoch in range(epoch_num):
        print("---Epoch {}---\n".format(epoch+1))
        print("Training...")
        encoder.train()
        decoder.train()
        plot_data = []
        for index, (sentence, form) in enumerate(zip(sentence_index_tensors_train, form_index_tensors_train)):
            loss = train(opt, criterion, encoder_optimizer, decoder_optimizer, encoder, decoder, sentence, form)
            if index != 0:
                if index % opt.plot_every == 0:     
                    plot_data.append(np.mean(losses[index-opt.plot_every:]))
                if index % opt.print_every == 0:
                    print("Index {} Loss {}".format(index, np.mean(losses[index-opt.print_every:])))
            losses.append(loss.item())

        print("Predicting..")
        encoder.eval()
        decoder.eval()
        correct = 0.0
        for index, (sentence, form) in enumerate(zip(sentence_index_tensors_test, form_index_tensors_test)):
            prediction = predict(opt, sentence, lf2i, encoder, decoder)
            prediction = [i2lf[p] for p in prediction]
            #print(test_data[index][1])
            #print(prediction)
            same = True
            for g, p in zip(test_data[index][1], prediction):
                if g != p:
                    same = False
            if same:
                correct += 1
                #print("Correct match ", prediction)
        accuracy = 100*(correct/len(test_data))
        print("Accuracy: ", accuracy)
        
        if not os.path.exists(directory):
            os.makedirs(directory)
        file_name = "{}/epoch.{}.acc.{}".format(directory, epoch, accuracy)
        showPlot(plot_data, file_name)
    


In [None]:
w2i = read_vocab(Q_VOCAB_FILE)
lf2i = read_vocab(F_VOCAB_FILE)
i2lf = {lf2i[i] : i for i in lf2i}

In [None]:
train_and_test(20, "out/BaseExperiment")

---Epoch 1---

Training...
Index 50 Loss 37.03196257591247
Index 100 Loss 19.301327800750734
Index 150 Loss 18.62754457950592
Index 200 Loss 14.915855112075805
Index 250 Loss 10.929440026283265
Index 300 Loss 13.538310189247131
Index 350 Loss 13.100777382850646
Index 400 Loss 15.060738210678101
Index 450 Loss 13.375358805656433
Index 500 Loss 13.504880437850952
Index 550 Loss 10.511771326065064
Predicting..
Accuracy:  0.0
---Epoch 2---

Training...
Index 50 Loss 15.789523844718934
Index 100 Loss 13.757406987410326
Index 150 Loss 13.202313708892236
Index 200 Loss 12.556065191122201
Index 250 Loss 12.000034168316768
Index 300 Loss 11.933456575320317
Index 350 Loss 11.683809581536513
Index 400 Loss 11.57232490172753
Index 450 Loss 11.210556427882269
Index 500 Loss 11.002169471887441
Index 550 Loss 10.640501579137949
Predicting..
Accuracy:  10.166666666666666
---Epoch 3---

Training...
Index 50 Loss 13.156478379058838
Index 100 Loss 12.020983039283752
Index 150 Loss 11.634139756774902
Inde