# Word Inflection Task

## First import the data

In [145]:
import os.path

CONLL_SIGMORPHON_DATA_PATH="./conll2017/"

## Reading in Data using Torchtext


In [146]:
import torchtext
from torchtext.data import Field, ReversibleField
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator

# Padding symbol for batching, unknown symbol used for input characters, which were not attested in the training 
# set, and start & end of sequence symbols.
PAD="<pad>"
UNK="<unk>"
START="<start>"
END="<end>"

### Defining Data Fields

The CoNLL-SIGMORPHON shared task datasets are in tsv (tab-separated values) format. Each line in the files has three fields: a lemma, a word form and a morphosyntactic description. Here are the ten first lines from the file `conll2017/all/task1/english-train-high`

```
malinvest       malinvest        V;NFIN
engender        engender         V;NFIN
stodge          stodged          V;PST
oversew         oversew          V;NFIN
psychoanalyse   psychoanalysing  V;V.PTCP;PRS
foray           forayed          V;V.PTCP;PST
DIY             DIYs             V;3;SG;PRS
pearl           pearling         V;V.PTCP;PRS
use             uses             V;3;SG;PRS
strong-arm      strong-armed     V;PST
```

Our task is to read shared task datasets into `torchtext.data.Dataset` objects. The first step toward this goal is to define tokenizers for the datasets. We need to define two tokenizers `word_tok` and `msd_tok`. These should split words and lemmas into character sequences and split morphosyntactic descriptors into sequences of features like `FEAT=V` and `FEAT=V.PTCP`. Note that `msd_tok` should also prepend `FEAT=` to each feature. 

In [136]:
# Define two tokenizers word_tok and tag_tok. word_tok is used for words and lemmas. msd_tok 
# is used for morphosyntactic descriptors.                                                                                                                                                  

def word_tok(word):
    '''
    split words/lemmas into character sequences
    '''
    return list(word)

def msd_tok(mor_des):
    '''
    split morphosyntactic descriptors into sequences of features
    '''
    
    return ["FEAT="+d for d in mor_des.split(";")]


Now we need to define `torchtext.data.Field` objects for reading in word form, lemma and MSD fields in the datasets. We should define two pytorch fields `WORD` and `MSD`. The `WORD` field describes both lemmas and word forms and the `MSD` field describes morphosyntactic descriptors.

In [137]:
# Define two torchtext fields WORD and MSD for handling words, lemmas and 
# morphosyntactic descriptors. These will take word_tok and msd_tok as arguments.

WORD = Field(sequential=True,
             tokenize = word_tok,
            pad_token = PAD,
            unk_token = UNK,
        init_token = START,
            eos_token = END,
            include_lengths=True)
MSD = Field(sequential=True,
             tokenize = msd_tok)


# These assertions check that your Field objects work correctly when reading the line:
# stodge    stodged    V;PST
assert(WORD.preprocess("stodged") == ["s","t","o","d","g","e","d"])
assert(MSD.preprocess("V;PST") == ["FEAT=V","FEAT=PST"])

# Our data sets have three fields: lemma (input), word form (output) and MSD.                                                                                                                            
datafields = [("input", WORD), ("output", WORD), ("msd", MSD)]

### A Function for Reading in Data

Define the function `read_data` which will read in the shared task datasets. The function takes three arguments: the target language, the data setting ("high" for 10,000 training examples, "medium" for 1,000 training examples or "low" for 100 training examples) and the batch size. 

Note that `train_iter` should shuffle examples between epochs but `dev_iter` and `test_iter` should not in order to retain the correct order of development and test examples for evaluation of accuracy. None of the iterators should repeat over multiple epochs. As `device` you should use `"cpu"` unless you have access to a GPU.

After defining the `read_data` function, use it to read in the English shared task data for the "medium" setting which should give you three data iterators: `train_iter` (iterator over 1,000 English training examples), `dev_iter` (iterator over the English development data) and `test_iter` (iterator over the English test data).

In [138]:
def read_data(language,setting,batch_size=1):
    """Read shared task training, development and test sets for a particular language and
       return torchtext Iterators to the data. 
    """
    train, dev = TabularDataset.splits(
        path="%s/all/task1/" % CONLL_SIGMORPHON_DATA_PATH,
        train='%s-train-%s' % (language,setting),
        validation="%s-dev" % language,
        format='tsv',
        skip_header=True,
        fields=datafields)

    test = TabularDataset(
        path="%s/answers/task1/%s-uncovered-test" % (CONLL_SIGMORPHON_DATA_PATH,language),
        format='tsv',
        skip_header=True,
        fields=datafields)
 
    # Concatenate the lemma and MSD fields into a joint input field.
    for data in [train, dev, test]:
        for ex in data:
            ex.input = ex.input + ex.msd
    
    # Build vocabularies                                                        
    WORD.build_vocab(train)
    MSD.build_vocab(train)

    # Define train_iter, dev_iter and test_iter iterators over the training data, 
    # development data and test data, respectively.

    train_iter = BucketIterator(train,
                          batch_size=batch_size,
                          sort_key=len,
                          shuffle=True,
                         device = "cpu")
    
    dev_iter, test_iter = Iterator.splits((dev, test),
                                     batch_sizes=(batch_size, batch_size),
                                     sort=False,
                                     shuffle=False,
                                    device = "cpu")

    return train_iter, dev_iter, test_iter

# Now call read_data to initialize train, dev and test data iterators for English. 
# Use batch_size 1.

train_iter, dev_iter, test_iter = read_data(language="english",
                                            setting="medium",
                                            batch_size=1)

In [139]:
# Print the first training example
example = next(iter(train_iter))
print(example)


[torchtext.data.batch.Batch of size 1]
	[.input]:('[torch.LongTensor of size 14x1]', '[torch.LongTensor of size 1]')
	[.output]:('[torch.LongTensor of size 13x1]', '[torch.LongTensor of size 1]')
	[.msd]:[torch.LongTensor of size 2x1]


## Basic Encoder-Decoder Model

This first version of the model does not implement attention. It simply uses a bidirectional encoder to encode the entire input sequence (for example, `<start> s t o d g e FEAT=V FEAT=PST <end>`) into a single hidden state vector which is then fed into a decoder network which generates the output word form (`s t o d g e d`) one symbol at a time. 

Let's start by loading some important requirements and defining model hyperparameters.

In [140]:
import numpy as np

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn.functional import log_softmax
from torch.optim import Adam, SGD

from random import random, seed, shuffle

# Ensure reproducible results.
seed(0)
torch.manual_seed(0)
np.random.seed(0)

import re

# Hyperparameters
EMBEDDING_DIM=50
RNN_HIDDEN_DIM=50
RNN_LAYERS=1
BATCH_SIZE=10
CHAR_DROPOUT=0.0
EPOCHS=10

# Maximum length of generated output word forms.
MAXWFLEN=40

def accuracy(sys,gold):
    assert(len(sys) == len(gold))
    return sum([1 if x==y else 0 for x,y in zip(sys,gold)])*100.0/len(gold)

#### Encoder Network


In [97]:
class Encoder(nn.Module):
        def __init__(self,alphabet):
                super(Encoder,self).__init__()
                self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
                self.rnn = nn.LSTM(EMBEDDING_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=True)

        def forward(self,ex):
            input, _ = ex.input
            
            embedded = self.embedding(input)
            # embedded = [seq_len, 1, emd_dim]
            
            _, hn = self.rnn(embedded)
            
            hn = hn[0]
            
            cat = hn.view(1, 1, 2 * hn.size()[2])
            
            return cat

# An assertion to test the implementation returns a tensor of the correct size. 
assert(Encoder(WORD.vocab.stoi)(example).size() == torch.Size([1,1,2*RNN_HIDDEN_DIM]))

#### Decoder Network


In [99]:
class Decoder(nn.Module):
    def __init__(self, alphabet):
        super(Decoder,self).__init__()
        self.alphabet = alphabet
        self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
        self.rnn = nn.LSTM(EMBEDDING_DIM+2*RNN_HIDDEN_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=False)
        self.hidden2char = nn.Linear(RNN_HIDDEN_DIM, len(alphabet))
        
    def forward(self,ex,encoder_hs):
        # encoder_hs = [1, 1, 2 * hid_dim]
        
        output, output_length = ex.output
        
        embedded_output = self.embedding(output[:-1])
        # embedded_output = [out_len - 1, 1, emb_dim]
        
        encoder_to_cat = encoder_hs.expand(output_length -1, 1, 2 * RNN_HIDDEN_DIM)
                
        decoder_input = torch.cat((embedded_output, encoder_to_cat), dim = 2)
        # decoder_input = [out_len - 1, 1, emb_dim + 2 * hid_dim]
        
        decoder_hidden_states, _ = self.rnn(decoder_input)
        # decoder_hidden_states = [out_len - 1, 1, hid_dim]
        
        hidden2char = self.hidden2char(decoder_hidden_states)
        # hidden2char = [out_len -1, 1, len(alphabet)]
        
        distr = nn.functional.log_softmax(hidden2char, dim = 2)
        # distr = [out_len -1, 1, len(alphabet)]
        
        return distr, output[1:]
        
    def generate(self,encoder_hs):
        # We're not accumulating gradients during test time.
        with torch.no_grad():
            decoder_state = (torch.zeros(1,1,RNN_HIDDEN_DIM), torch.zeros(1,1,RNN_HIDDEN_DIM))
            output_char = torch.LongTensor([[self.alphabet[START]]])
            result = []
            for _ in range(MAXWFLEN):

                output_embedding = self.embedding(output_char)
                # output_embedding = [1, 1, emd_dim]
                
                # encoder_hs = [1, 1, 2 * hid_dim]
                
                cat = torch.cat((output_embedding, encoder_hs), dim = 2)
                
                _, decoder_state = self.rnn(cat, decoder_state)
                
                hs = decoder_state[0]
                
                # hs = [1, 1, hid_dim]
                
                hid2char = self.hidden2char(hs)
                
                # hid2char = [1, 1, len(alphabet)]
                
                hid2char = nn.functional.log_softmax(hid2char, dim = 2)
                
                output_char = torch.argmax(hid2char, dim = 2)

                result.append(output_char.numpy().tolist()[0][0])
            
            return result
            
# Assertions to test the implementation returns objects of the correct size. 
alphabet = WORD.vocab.stoi
encoder_hs = Encoder(alphabet)(example)
_, output_length = example.output
alphabet_size = len(alphabet)

assert(Decoder(alphabet)(example,encoder_hs)[0].size() == torch.Size([output_length - 1,1,alphabet_size]))
assert(len(Decoder(alphabet).generate(encoder_hs)) == MAXWFLEN)

#### Training the Model

In [100]:
class WordInflector(nn.Module):
    def __init__(self, alphabet):
        super(WordInflector, self).__init__()
        self.alphabet = alphabet.stoi
        self.integer2char = alphabet.itos
        alphabet_size = len(self.alphabet)
        
        self.encoder = Encoder(self.alphabet)
        self.decoder = Decoder(self.alphabet)
    
    def get_string(self,ids):
        string = ''.join([self.integer2char[i] for i in ids])
        return re.sub("%s.*" % END,"",string)

    def forward(self, example):
        encoder_hs = self.encoder(example)
        return self.decoder(example,encoder_hs)
            
    def generate(self, data):
        all_results = []
        with torch.no_grad():
            for example in data:
                encoder_hs = self.encoder(example)
                output = self.decoder.generate(encoder_hs)
                all_results.append(self.get_string(output))
        return all_results
    
if __name__=="__main__":
    # Read the Spanish medium data set.
    train_iter, dev_iter, test_iter = read_data(language="spanish",
                                            setting="medium",
                                            batch_size=1)
    
    inflector = WordInflector(WORD.vocab)

    loss_function = nn.NLLLoss(ignore_index=inflector.alphabet[PAD],reduction='mean')
    optimizer = Adam(inflector.parameters())
    gold_dev_words = [''.join(w.output) for w in dev_iter.dataset]

    for epoch in range(EPOCHS):
        tot_loss = 0 

        # Update parameters
        for i, batch in enumerate(train_iter):
            print("Example %u of %u" % (i+1,len(train_iter)),end="\r")
            inflector.zero_grad()
            tag_scores, tgt = inflector(batch)
            tgt = tgt.permute(1,0)
            tag_scores = tag_scores.permute(1,2,0)
            loss = loss_function(tag_scores,tgt) 
            loss.backward()
            optimizer.step()
            tot_loss += len(batch)*loss.detach().numpy()
        print()
        avg_loss = tot_loss/len(train_iter)
        print("EPOCH %u: AVG LOSS PER EX: %.5f" % (epoch+1,avg_loss))        

        # Evaluate on dev data.
        sys_dev_words = inflector.generate(dev_iter)
        print("DEV ACC: %.2f%%" % accuracy(sys_dev_words,gold_dev_words))

Example 999 of 999
EPOCH 1: AVG LOSS PER EX: 2.48444
DEV ACC: 0.00%
Example 999 of 999
EPOCH 2: AVG LOSS PER EX: 1.85903
DEV ACC: 0.20%
Example 999 of 999
EPOCH 3: AVG LOSS PER EX: 1.48870
DEV ACC: 0.30%
Example 999 of 999
EPOCH 4: AVG LOSS PER EX: 1.20146
DEV ACC: 1.50%
Example 999 of 999
EPOCH 5: AVG LOSS PER EX: 0.99420
DEV ACC: 2.50%
Example 999 of 999
EPOCH 6: AVG LOSS PER EX: 0.83096
DEV ACC: 3.70%
Example 999 of 999
EPOCH 7: AVG LOSS PER EX: 0.70516
DEV ACC: 5.81%
Example 999 of 999
EPOCH 8: AVG LOSS PER EX: 0.60533
DEV ACC: 5.71%
Example 999 of 999
EPOCH 9: AVG LOSS PER EX: 0.53115
DEV ACC: 8.21%
Example 999 of 999
EPOCH 10: AVG LOSS PER EX: 0.45551
DEV ACC: 9.91%


## Attention

Augment the model with attention!

#### Encoder


In [147]:
class Encoder(nn.Module):
        def __init__(self,alphabet):
                super(Encoder,self).__init__()
                self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
                self.rnn = nn.LSTM(EMBEDDING_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=True)

        def forward(self,ex):
            input, _ = ex.input
            
            embedded = self.embedding(input)
            # embedded = [seq_len, 1, emd_dim]
            
            hs, _ = self.rnn(embedded)
            
            return hs

# An assertion to test that your implementation returns an object of the correct size. 
input, input_length = example.input
assert(Encoder(WORD.vocab.stoi)(example).size() == torch.Size([input_length,1,2*RNN_HIDDEN_DIM]))

#### Add Attention


The `Attention` class implements a version of [Bahdanau attention](https://blog.floydhub.com/attention-mechanism/). Its `forward` function takes two inputs: a tensor of encoder hidden states `encoder_hss` of dimension `(sequence_length, 1, 2*RNN_HIDDEN_DIM)` and a decoder hidden state `dec_state` of dimension `(1,1,RNN_HIDDEN_DIM)`. It computes a context weight for each of the encoder hidden states and the decoder hidden state using a feed-forward neural network with one hidden layer and a ReLU non-linearity. These weights are then normalized into a probability distribution $p_1, ..., p_T$ using a softmax layer. Finally, `forward` will return the weighted mean $p_1 e_1 + ... + p_T e_T$.      


In [148]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention,self).__init__()

        self.linear1 = nn.Linear(3*RNN_HIDDEN_DIM,RNN_HIDDEN_DIM)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(RNN_HIDDEN_DIM,1)
    
    def forward(self,encoder_hss,decoder_hs):
        
        # encoder_hss = [seq_len, 1, 2 * hid_dim]
        # decoder_hs = [1, 1, hid_dim]
        
        decoder_hss = decoder_hs.expand(encoder_hss.size()[0], 1, RNN_HIDDEN_DIM)
        
        conditioned = torch.cat((encoder_hss, decoder_hss), dim = 2)
        
        # conditioned = [seq_len, 1, 3 * hid_dim]
        
        att = self.linear1(conditioned)
        
        att1 = self.relu(att)
        
        # att1 = [seq_len, 1, hid_dim]
        
        att2 = self.linear2(att1)
        
        # att2 = [seq_len, 1, 1]
        att2 = nn.functional.softmax(att2, dim = 0)
        weights = att2.expand(att2.size()[0], 1, 2 * RNN_HIDDEN_DIM)
        
        # weights = [seq_len, 1, 2 * RNN_HIDDEN_DIM]
        
        product = weights * encoder_hss
        
        # product = [seq_len, 1, 2 * RNN_HIDDEN_DIM]
        
        weighted_mean = torch.sum(product, dim = 0).unsqueeze(0)
        # weight_mean = [1, 1, 2 * RNN_HIDDEN_DIM]
        
        
        return weighted_mean

# An assertion to test that your implementation returns an object of the correct size. 
input, input_length = example.input
encoder_hss = Encoder(WORD.vocab.stoi)(example)
decoder_hs = torch.randn(1,1,RNN_HIDDEN_DIM)

assert(Attention()(encoder_hss,decoder_hs).size() == torch.Size([1,1,2*RNN_HIDDEN_DIM]))

#### Decoder


In [150]:
class Decoder(nn.Module):
    def __init__(self, alphabet):
        super(Decoder,self).__init__()
        self.alphabet = alphabet
        self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
        self.attention = Attention()
        self.rnn = nn.LSTM(EMBEDDING_DIM+2*RNN_HIDDEN_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=False)
        self.hidden2char = nn.Linear(RNN_HIDDEN_DIM, len(alphabet))
    
    def forward(self,ex,encoder_hss):
        output, output_length = ex.output
        embedded_output = self.embedding(output[:-1])
        # embedded_output = [output_length - 1,1,EMBEDDING_DIM]
        
        results = []
        decoder_state = (torch.zeros(1,1,RNN_HIDDEN_DIM,requires_grad=False), 
                         torch.zeros(1,1,RNN_HIDDEN_DIM,requires_grad=False))
        
            
        for i in range(output_length - 1):

            context = self.attention(encoder_hss, decoder_state[0])
            # context = [1,1,2*RNN_HIDDEN_DIM]

            cat = torch.cat((embedded_output[i, :, :].unsqueeze(0), context), dim = 2)

            _, decoder_state = self.rnn(cat, decoder_state)

            hs = decoder_state[0]

            hid2char = self.hidden2char(hs)
            
            hid2char = nn.functional.log_softmax(hid2char, dim = 2)

            results.append(hid2char)
                
        
        out = torch.cat(results, dim = 0)
        # out = [seq_len - 1, 1, alphabet_size]
        
        return out, output[1:]

    def generate(self,encoder_hss):
        with torch.no_grad():
            decoder_state = (torch.zeros(1,1,RNN_HIDDEN_DIM), torch.zeros(1,1,RNN_HIDDEN_DIM))
            output_char = torch.LongTensor([[self.alphabet[START]]])
            result = []
            for _ in range(MAXWFLEN):
                
                output_embedding = self.embedding(output_char)
                
                # output_embedding = [1, 1, emb_dim]
                
                context = self.attention(encoder_hss, decoder_state[0])
                
                # context = [1, 1, 2 * hid_dim]
                
                cat = torch.cat((output_embedding, context), dim = 2)
                
                # cat = [1, 1, emd_dim + 2 * hid_dim]
                
                _, decoder_state = self.rnn(cat, decoder_state)
                
                hs = decoder_state[0]
                
                # hs = [1, 1, hid_dim]
                
                hid2char = self.hidden2char(hs)
                
                # hid2char = [1, 1, len(alphabet)]
                
                hid2char = nn.functional.log_softmax(hid2char, dim = 2)
                
                output_char = torch.argmax(hid2char, dim = 2)
                               
                result.append(output_char.numpy().tolist()[0][0])
                
        return result
            
# Assertions to test the implementation returns objects of the correct size. 
encoder_hs = Encoder(WORD.vocab.stoi)(example)
_, output_length = example.output
alphabet = WORD.vocab.stoi
alphabet_size = len(alphabet)
assert(Decoder(alphabet)(example,encoder_hs)[0].size() == torch.Size([output_length - 1,1,alphabet_size]))
assert(len(Decoder(alphabet).generate(encoder_hs)) == MAXWFLEN)

#### Training the Model

In [151]:
class WordInflector(nn.Module):
    def __init__(self, alphabet):
        super(WordInflector, self).__init__()
        self.c2i = alphabet.stoi
        self.i2c = alphabet.itos
        alphabet_size = len(self.c2i)
        
        self.encoder = Encoder(self.c2i)
        self.decoder = Decoder(self.c2i)
    
    def get_string(self,ids):
        string = ''.join([self.i2c[i] for i in ids])
        return re.sub("%s.*" % END,"",string)

    def forward(self, example):
        encoder_hs = self.encoder(example)
        return self.decoder(example,encoder_hs)
            
    def generate(self, data):
        all_results = []
        with torch.no_grad():
            for example in data:
                encoder_hs = self.encoder(example)
                output = self.decoder.generate(encoder_hs)
                all_results.append(self.get_string(output))
        return all_results
    
if __name__=="__main__":
    train_iter, dev_iter, test_iter = read_data(language="spanish",
                                            setting="medium",
                                            batch_size=1)
    
    inflector = WordInflector(WORD.vocab)

    loss_function = nn.NLLLoss(ignore_index=inflector.c2i[PAD],reduction='mean')
    optimizer = Adam(inflector.parameters())
    gold_dev_words = [''.join(w.output) for w in dev_iter.dataset]

    for epoch in range(EPOCHS):
        tot_loss = 0 

        # Update parameters
        for i, batch in enumerate(train_iter):
            print("Example %u of %u" % (i+1,len(train_iter)),end="\r")
            inflector.zero_grad()
            tag_scores, tgt = inflector(batch)
            tgt = tgt.permute(1,0)
            tag_scores = tag_scores.permute(1,2,0)
            loss = loss_function(tag_scores,tgt) 
            tot_loss += loss.detach().numpy()
            loss.backward()
            optimizer.step()
        print()
        avg_loss = tot_loss/len(train_iter)
        print("EPOCH %u: AVG LOSS PER EX: %.5f" % (epoch+1,avg_loss))        

        # Evaluate on dev data.
        sys_dev_words = inflector.generate(dev_iter)
        print("DEV ACC: %.2f%%" % accuracy(sys_dev_words,gold_dev_words))

Example 999 of 999
EPOCH 1: AVG LOSS PER EX: 2.44053
DEV ACC: 0.00%
Example 999 of 999
EPOCH 2: AVG LOSS PER EX: 1.43504
DEV ACC: 1.80%
Example 999 of 999
EPOCH 3: AVG LOSS PER EX: 0.83666
DEV ACC: 18.32%
Example 999 of 999
EPOCH 4: AVG LOSS PER EX: 0.52703
DEV ACC: 43.34%
Example 999 of 999
EPOCH 5: AVG LOSS PER EX: 0.38147
DEV ACC: 47.85%
Example 999 of 999
EPOCH 6: AVG LOSS PER EX: 0.31022
DEV ACC: 41.34%
Example 999 of 999
EPOCH 7: AVG LOSS PER EX: 0.25495
DEV ACC: 39.84%
Example 999 of 999
EPOCH 8: AVG LOSS PER EX: 0.21282
DEV ACC: 63.16%
Example 999 of 999
EPOCH 9: AVG LOSS PER EX: 0.17004
DEV ACC: 51.55%
Example 999 of 999
EPOCH 10: AVG LOSS PER EX: 0.15390
DEV ACC: 69.37%
