In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import fasttext
import sys
from collections import namedtuple, defaultdict
from sklearn.metrics import classification_report
from embedding import EmbeddingReader
from crf.crf import ConditionalRandomField
from lstm import LSTMEncoder

## Data preparation: CONLL Dataset

In [3]:
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

In [4]:
def build_vocab(train_sentences):
    vocab = namedtuple('vocab', ['word2idx', 'idx2word'])
    vocab.word2idx, vocab.idx2word = dict(), dict()
    vocab.word2idx["<oov>"] = 0
    vocab.word2idx["<pad>"] = 1
    for sent in train_sentences:
        for word_tup in sent:
            word = word_tup[0]
            if word not in vocab.word2idx:
                vocab.word2idx[word] = len(vocab.word2idx)
    vocab.word2idx = defaultdict(lambda : vocab.word2idx["<oov>"], vocab.word2idx)
    vocab.idx2word = {v:k for k, v in vocab.word2idx.items()}
    return vocab

def build_tagmap(train_sentences):
    tagmap = namedtuple('tagmap', ['tag2idx', 'idx2tag'])
    tagmap.tag2idx, tagmap.idx2tag = dict(), dict()
    for sent in train_sentences:
        for word_tup in sent:
            tag = word_tup[2]
            if tag not in tagmap.tag2idx:
                tagmap.tag2idx[tag] = len(tagmap.tag2idx)
    tagmap.idx2tag = {v:k for k,v in tagmap.tag2idx.items()}
    return tagmap
                

def pad_sequence(lst_of_lsts, token):
    max_length = max(len(x) for x in lst_of_lsts)
    result = []
    for lst in lst_of_lsts:
        result.append(lst + [token] * (max_length - len(lst)))
    return result

def get_words_and_tags(batch, vocab, tagmap):
    batch_sent, batch_tags = [], []
    for sent in batch:
        words, tags = [], []
        for word_tup in sent:
            words.append(vocab.word2idx[word_tup[0]])
            tags.append(tagmap.tag2idx[word_tup[2]])
        batch_sent.append(words)
        batch_tags.append(tags)
    return batch_sent, batch_tags

def get_batch(sentences, vocab, tagmap, batch_size):
    num_sentences, i = len(sentences), 0
    while i < num_sentences:
        batch = sentences[i:i+batch_size]
        i += batch_size
        batch_sent, batch_tags = get_words_and_tags(batch, vocab, tagmap)
        batch_sent = pad_sequence(batch_sent, vocab.word2idx["<pad>"])
        batch_tags = pad_sequence(batch_tags, 0)
        yield batch_sent, batch_tags

def train_model(model, dataset, num_epochs, learning_rate, vocab, tagmap, batch_size):
    optimizer = optim.Adam(model.parameters(), learning_rate)

    for epoch in range(num_epochs):
        model.train()
        losses = []
        for batch_sent, batch_tags in get_batch(dataset, vocab, tagmap, batch_size):
            torch_batch_sent, torch_batch_tags = torch.LongTensor(batch_sent).to(device), torch.LongTensor(batch_tags).to(device)
            output = model(torch_batch_sent, torch_batch_tags)
            output["loss"].backward()
            optimizer.step()
            optimizer.zero_grad()
            losses.append(output["loss"].item())
        print(f"Epoch {epoch} is complete, Avg Loss = {np.mean(losses)}")
    return model

def evaluate_model(model, dataset, vocab, tagmap, batch_size):
    model.eval()
    def unpack_sequence_to_instance(prediction, gnd_list, tagmap, mask):
        pred_list = prediction.tolist()
        mask = mask.tolist()
        pred_tags, gnd_tags = [], []
        for i in range(len(mask)):
            for j in range(len(mask[0])):
                if mask[i][j] == 1:
                    pred_tags.append(tagmap.idx2tag[pred_list[i][j]])
                    gnd_tags.append(tagmap.idx2tag[gnd_list[i][j]])
            
        assert len(pred_tags) == len(gnd_tags)
        return pred_tags, gnd_tags
    
    with torch.no_grad():
        outputs = []
        y_pred, y_gnd = [], []
        for batch_sent, batch_tags in get_batch(dataset, vocab, tagmap, batch_size):
            torch_batch_sent, torch_batch_tags = torch.LongTensor(batch_sent).to(device), torch.LongTensor(batch_tags).to(device)
            mask = torch_batch_sent != vocab.word2idx["<pad>"]
            output = model(torch_batch_sent, torch_batch_tags)
            predictions = output["out_sequence"].argmax(1)
            pred, gnd = unpack_sequence_to_instance(predictions, batch_tags, tagmap, mask)
            y_pred += pred
            y_gnd += gnd
        model.train()
        return classification_report(y_pred, y_gnd)

## LSTM model 

In [11]:
class LSTMModel(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_dim,
                 num_layers,
                 batch_first,
                 dropout,
                 num_directions,
                 device,
                 embedding_dim,
                 embedding_path,
                 vocab,
                 target_map):
        super(LSTMModel, self).__init__()
        embedding_reader = EmbeddingReader(embedding_dim,
                                           embedding_path,
                                           vocab)
        embedding_matrix = embedding_reader.get_embedding_matrix()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.lstm_encoder = LSTMEncoder(input_size,
                                        hidden_dim,
                                        num_layers,
                                        batch_first,
                                        dropout,
                                        bidirectional= True if num_directions == 2 else False,
                                        device=device)
        self.dropout = nn.Dropout(dropout)
        n_class = len(target_map.tag2idx)
        self.out = nn.Linear(num_directions * hidden_dim, n_class)
        self.criterion = nn.CrossEntropyLoss(reduction="none")
        self.vocab = vocab

        
    def forward(self, sentences, targets):
        mask = (sentences != self.vocab.word2idx["<pad>"]).float()
        lengths = torch.sum(mask, dim=1)
        embedding = self.embedding(sentences)
        dropped_embeddings = self.dropout(embedding)
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(dropped_embeddings,
                                                                lengths,
                                                                batch_first=True,
                                                                enforce_sorted=False)
        output, (h_n, c_n) = self.lstm_encoder(packed_embeddings, lengths.shape[0])
        unpacked_output, lengths = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        out = self.out(unpacked_output)
        out = out.transpose(2, 1)
        output = {"loss": self.loss_fn(out, targets, mask), "out_sequence": out}
        return output

    def loss_fn(self, predicted, target, mask):
        loss = self.criterion(predicted, target)
        loss = (loss * mask).sum()
        nnz = (mask != 0).sum()
        return loss / nnz

## LSTM CRF Model

We initialize the CRF model in the init method.

```
        self.crf = ConditionalRandomField(n_class, label_encoding="BIO", idx2tag=tagmap.idx2tag)
          
```

We compute log likelihood of the CRF model in the forward pass.

```
        log_likelihood = self.crf(logits, target, mask)
```

In [12]:
class LSTMCRFModel(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_dim,
                 num_layers,
                 batch_first,
                 dropout,
                 num_directions,
                 device,
                 embedding_dim,
                 embedding_path,
                 vocab,
                 target_map):
        super(LSTMCRFModel, self).__init__()
        embedding_reader = EmbeddingReader(embedding_dim,
                                           embedding_path,
                                           vocab)
        embedding_matrix = embedding_reader.get_embedding_matrix()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.lstm_encoder = LSTMEncoder(input_size,
                                        hidden_dim,
                                        num_layers,
                                        batch_first,
                                        dropout,
                                        bidirectional= True if num_directions == 2 else False,
                                        device=device)
        self.dropout = nn.Dropout(dropout)
        n_class = len(target_map.tag2idx)
        self.out = nn.Linear(num_directions * hidden_dim, n_class)
        self.criterion = nn.CrossEntropyLoss(reduction="none")
        self.vocab = vocab
        self.crf = ConditionalRandomField(n_class, label_encoding="BIO", idx2tag=tagmap.idx2tag)

        
    def forward(self, sentences, targets=None):
        mask = (sentences != self.vocab.word2idx["<pad>"]).int()
        lengths = torch.sum(mask, dim=1)
        embedding = self.embedding(sentences)
        dropped_embeddings = self.dropout(embedding)
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(dropped_embeddings,
                                                                lengths,
                                                                batch_first=True,
                                                                enforce_sorted=False)
        output, (h_n, c_n) = self.lstm_encoder(packed_embeddings, lengths.shape[0])
        unpacked_output, lengths = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        out = self.out(unpacked_output)
        
        best_tag_sequence = self.crf.best_viterbi_tag(out, mask)
        
        class_probabilities = out * 0.0
        for i, instance_tags in enumerate(best_tag_sequence):
            for j, tag_id in enumerate(instance_tags[0][0]):
                class_probabilities[i, j, int(tag_id)] = 1        

        output = {"out_sequence": class_probabilities.transpose(2, 1)}
        
        
        if targets is not None:
            output["loss"] = self.loss_fn(out, targets, mask)
        return output

    def loss_fn(self, logits, target, mask):
        log_likelihood = self.crf(logits, target, mask)
        return -log_likelihood / logits.shape[0]

In [13]:
vocab = build_vocab(train_sents)
tagmap = build_tagmap(train_sents)

In [27]:
batch_size = 32
embedding_dim = 300
input_size = embedding_dim
hidden_dim = 64
num_layers = 2
learning_rate = 0.01
batch_first = True
dropout = 0.1
num_directions = 2
num_epochs = 10
embedding_path = "/users/talurj/Downloads/cc.en.300.bin" # Path to fasttext embedding
device='cpu'

In [28]:
lstm_model = LSTMModel(input_size,
                       hidden_dim,
                       num_layers,
                       batch_first,
                       dropout,
                       num_directions,
                       device,
                       embedding_dim,
                       embedding_path,
                       vocab,
                       tagmap)




In [29]:
lstm_model = train_model(lstm_model, train_sents, num_epochs, learning_rate, vocab, tagmap, batch_size)
print(evaluate_model(lstm_model, test_sents, vocab, tagmap, batch_size))

Epoch 0 is complete, Avg Loss = 0.3515281883921203
Epoch 1 is complete, Avg Loss = 0.1767707201455973
Epoch 2 is complete, Avg Loss = 0.1331593554780972
Epoch 3 is complete, Avg Loss = 0.10572671983391047
Epoch 4 is complete, Avg Loss = 0.08466701770479652
Epoch 5 is complete, Avg Loss = 0.07157210325572454
Epoch 6 is complete, Avg Loss = 0.06412825188367588
Epoch 7 is complete, Avg Loss = 0.05590231993977851
Epoch 8 is complete, Avg Loss = 0.050978807256096735
Epoch 9 is complete, Avg Loss = 0.04405360433390771
              precision    recall  f1-score   support

       B-LOC       0.63      0.86      0.73       784
      B-MISC       0.57      0.63      0.60       307
       B-ORG       0.77      0.83      0.79      1295
       B-PER       0.72      0.89      0.80       593
       I-LOC       0.55      0.78      0.65       229
      I-MISC       0.60      0.70      0.64       476
       I-ORG       0.66      0.92      0.77       796
       I-PER       0.77      0.97      0.86      

In [30]:
lstm_crf_model = LSTMCRFModel(input_size,
                       hidden_dim,
                       num_layers,
                       batch_first,
                       dropout,
                       num_directions,
                       device,
                       embedding_dim,
                       embedding_path,
                       vocab,
                       tagmap)




In [None]:
lstm_crf_model = train_model(lstm_crf_model, train_sents, num_epochs, learning_rate, vocab, tagmap, batch_size)
print(evaluate_model(lstm_crf_model, test_sents, vocab, tagmap, batch_size))

### We can see that LSTM-CRF model improves macro avg f1 from 0.73 to 0.74