In [44]:
import numpy as np
from tqdm import tqdm_notebook

import torch
from torch import nn, optim
import torch.nn.functional as F

In [98]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a 1-D ``torch.long`` tensor of indices.

    Tokens present in ``to_ix`` map to their stored index; every other token
    maps to ``len(to_ix)``, which therefore acts as a shared "unknown" index.
    """
    unknown_ix = len(to_ix)
    indices = [to_ix[token] if token in to_ix else unknown_ix for token in seq]
    return torch.tensor(indices, dtype=torch.long)

def produce_tags(seq, to_tag):
    """Look up every element of ``seq`` in ``to_tag`` and return the results as a list.

    The lookup is a plain ``to_tag[element]``, so a missing entry raises
    whatever ``to_tag`` raises for it (``KeyError`` for a dict).
    """
    looked_up = []
    for key in seq:
        looked_up.append(to_tag[key])
    return looked_up

In [94]:
# Which corpus to use; one of {SG, CN, EN, FR}.
dataset = "EN"

# Both files are looked up under data/<dataset>/.
train_filename, validation_filename = f"data/{dataset}/train", f"data/{dataset}/dev.out"

In [88]:
# Prepare training data
# Each non-blank line is read as "word tag" (split on single spaces); a line
# that yields a single field marks the end of a sentence.
# The encoding is given explicitly because open()'s default depends on the
# platform locale. NOTE(review): assumes the data files are UTF-8 - confirm.
with open(train_filename, "r", encoding="utf-8") as f:
    lines = f.readlines()

train_data = []
sentence = []
tags = []

word_to_ix = {}
tag_to_ix = {}

for line in lines:
    line_split = line.strip().split(" ")

    if len(line_split) == 1:
        # End of sentence. Only store non-empty sentences so that leading or
        # consecutive blank lines do not produce empty training examples,
        # which the model cannot process.
        if sentence:
            train_data.append([sentence, tags])
        sentence = []
        tags = []
    else:
        word, tag = line_split[0], line_split[1]

        # Indices are assigned in order of first appearance.
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

        sentence.append(word)
        tags.append(tag)

# Keep the last sentence when the file does not end with a blank line.
if sentence:
    train_data.append([sentence, tags])
    sentence = []
    tags = []

# Reverse lookup used to turn predicted tag indices back into tag strings.
idx_to_tags = {v: k for k, v in tag_to_ix.items()}

In [21]:
# Longest training sentence, taken from train_data so that the cell runs on a
# fresh kernel (the name `data` is not defined anywhere in this notebook).
# Note: despite its name, max_sentence_length holds the longest sentence itself
# (a list of words); its length is what gets printed.
max_sentence_length = max((words for words, _ in train_data), key=len, default=[])
print(len(max_sentence_length))

39


In [33]:
# TODO: make these dimensions bigger.
# Size of the LSTM hidden state and of each word-embedding vector; both are
# passed to LSTMTagger when the model is built.
HIDDEN_DIM = 6
EMBEDDING_DIM = 6

class LSTMTagger(nn.Module):
    """Sequence tagger: word embedding -> LSTM -> linear layer -> log-softmax.

    The LSTM state is stored on the instance as ``self.hidden`` and is fed back
    into the LSTM on every ``forward`` call. Callers must therefore assign
    ``model.hidden = model.init_hidden()`` before each independent sentence,
    otherwise the state of the previous sentence carries over.

    Args:
        embedding_dim: size of each word-embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # Must come after the layers exist: init_hidden reads a parameter to
        # pick the device and dtype.
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero ``(h_0, c_0)`` pair for the LSTM.

        Each tensor has shape ``(1, 1, hidden_dim)``; the axes are
        (num_layers, minibatch_size, hidden_dim). The tensors are created on
        the same device and with the same dtype as the model parameters, so
        the state stays usable after ``model.to(device)`` or ``model.double()``.
        """
        reference = self.hidden2tag.weight
        shape = (1, 1, self.hidden_dim)
        return (torch.zeros(shape, dtype=reference.dtype, device=reference.device),
                torch.zeros(shape, dtype=reference.dtype, device=reference.device))

    def forward(self, sentence):
        """Score every tag for every position of one sentence.

        Args:
            sentence: 1-D tensor of word indices.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)`` holding
            log-probabilities over tags for each position.

        Side effect: ``self.hidden`` is replaced by the LSTM state after the
        last position.
        """
        seq_len = len(sentence)
        embeds = self.word_embeddings(sentence)
        # The LSTM expects (seq_len, batch, features); the batch size is 1.
        lstm_out, self.hidden = self.lstm(
            embeds.view(seq_len, 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [102]:
# Training hyperparameters.
NUM_EPOCHS = 10
LEARNING_RATE = 0.1

# The vocabulary gets one extra embedding row because prepare_sequence maps
# every out-of-vocabulary word to index len(word_to_ix).
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix)+1, len(tag_to_ix))
# NLLLoss matches the log-softmax scores returned by the model.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

for epoch in tqdm_notebook(range(NUM_EPOCHS)):
    total_loss = 0
    num_batches = 0  # sentences actually trained on in this epoch
    for sentence, tags in train_data:
        if not sentence:
            # An empty sentence cannot be reshaped in the model's forward pass.
            continue

        # Clear accumulated gradients and start from a zero LSTM state so that
        # every sentence is treated independently.
        model.zero_grad()
        model.hidden = model.init_hidden()

        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        preds = model(sentence_in)

        loss = loss_function(preds, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # max(..., 1) avoids a ZeroDivisionError when there is no training data.
    print("Average loss: {}".format(total_loss / max(num_batches, 1)))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Average loss: 2.156482021951416
Average loss: 1.9685045199472113
Average loss: 1.8808772772712845
Average loss: 1.8131939323108555
Average loss: 1.7660102441825798
Average loss: 1.726295224335146
Average loss: 1.6878789453454546
Average loss: 1.650927442820665
Average loss: 1.615967040702349
Average loss: 1.5832934842568345



In [105]:
# Prepare validation data
# Same layout as the training file: "word tag" per line (split on single
# spaces), with a single-field line marking the end of a sentence.
# The encoding is given explicitly because open()'s default depends on the
# platform locale. NOTE(review): assumes the data files are UTF-8 - confirm.
with open(validation_filename, "r", encoding="utf-8") as f:
    lines = f.readlines()

val_data = []
sentence = []
tags = []

for line in lines:
    line_split = line.strip().split(" ")

    if len(line_split) == 1:
        # End of sentence. Only store non-empty sentences so that leading or
        # consecutive blank lines do not produce empty examples, which the
        # model cannot process.
        if sentence:
            val_data.append([sentence, tags])
        sentence = []
        tags = []
    else:
        word, tag = line_split[0], line_split[1]

        sentence.append(word)
        tags.append(tag)

# Keep the last sentence when the file does not end with a blank line.
if sentence:
    val_data.append([sentence, tags])
    sentence = []
    tags = []

In [107]:
val_preds = []

# Tag every validation sentence with the trained model; no gradients needed.
with torch.no_grad():
    for sentence, gold_tags in val_data:
        # Start each sentence from a zero LSTM state, as the training loop
        # does. Without this the state left by the previous sentence is fed
        # into the next one.
        model.hidden = model.init_hidden()

        inputs = prepare_sequence(sentence, word_to_ix)
        preds = model(inputs)
        # Index of the highest-scoring tag at each position, as Python ints.
        tags = preds.argmax(dim=1).tolist()

        val_preds.append(produce_tags(tags, idx_to_tags))

In [108]:
# Number of validation sentences that received a prediction.
len(val_preds)

78

In [116]:
# Write the predictions as "word tag" per line, with a blank line after every
# sentence. The output path is the validation path with ".out" replaced by
# "_lstm.pred".
assert len(val_preds) == len(val_data), "need exactly one prediction list per validation sentence"

# Explicit encoding: open()'s default depends on the platform locale.
with open(validation_filename.replace(".out", "_lstm.pred"), "w", encoding="utf-8") as f:
    for (sentence, gold_tags), pred in zip(val_data, val_preds):
        assert len(sentence) == len(pred)

        for word, tag in zip(sentence, pred):
            f.write(word + " " + tag + "\n")

        f.write("\n")