This notebook follows [the PyTorch sequence models tutorial](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html).

In [1]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)


<torch._C.Generator at 0x7f1413835f50>

In [2]:
# Toy dimensions for the warm-up example. They happen to be equal here, but
# the code below keeps the two roles separate so either can be changed.
embedding_dim = 3
hidden_size = 3
sequence_length = 5

lstm = nn.LSTM(embedding_dim, hidden_size)
# One (1, embedding_dim) tensor per time step.
inputs = [torch.randn(1, embedding_dim) for _ in range(sequence_length)]

# Initial (h_0, c_0). Each is (num_layers, batch, hidden_size), so the last
# dimension must follow hidden_size, not embedding_dim.
hidden = (torch.randn(1, 1, hidden_size), torch.randn(1, 1, hidden_size))


# Feed the sequence one step at a time, threading the state through.
for step_input in inputs:
    # nn.LSTM takes (seq_len, batch, input_size); here (1, 1, embedding_dim).
    out, hidden = lstm(step_input.view(1, 1, -1), hidden)

In [3]:
# Alternatively, run the whole sequence through the LSTM in one call.
# nn.LSTM returns two things:
#   * "out": the hidden state at every time step of the sequence
#   * "(h_n, c_n)": the final hidden state and the final cell state
# The last slice of "out" is therefore the same as "h_n" (compare the printed
# values below). "out" gives access to every step; the (h_n, c_n) pair can be
# passed back into the LSTM later to continue the sequence and backpropagate
# through it.

# Stack the per-step inputs and add the batch dimension in the middle:
# (seq_len, batch=1, embedding_dim). list(...) keeps this line valid when the
# cell is re-run and `inputs` is already the stacked tensor.
inputs = torch.cat(list(inputs)).view(len(inputs), 1, -1)
# Fresh random initial state, sized by hidden_size rather than a literal 3.
hidden = (torch.randn(1, 1, hidden_size), torch.randn(1, 1, hidden_size))
out, (h_n, c_n) = lstm(inputs, hidden)
print(out)
print(h_n)
print(c_n)

tensor([[[-0.0187,  0.1713, -0.2944]],

        [[-0.3521,  0.1026, -0.2971]],

        [[-0.3191,  0.0781, -0.1957]],

        [[-0.1634,  0.0941, -0.1637]],

        [[-0.3368,  0.0959, -0.0538]]], grad_fn=<MkldnnRnnLayerBackward0>)
tensor([[[-0.3368,  0.0959, -0.0538]]], grad_fn=<StackBackward0>)
tensor([[[-0.9825,  0.4715, -0.0633]]], grad_fn=<StackBackward0>)


# LSTM for Part-of-Speech Tagging

The training and test data (`en_gum-ud-train.conllu`, `en_gum-ud-test.conllu`) come from the [Universal Dependencies English GUM treebank](https://github.com/UniversalDependencies/UD_English-GUM).

In [4]:
# Load training and testing data

from typing import List, Tuple, Callable, Dict
import conllu


# Locations of the UD English-GUM train/test splits, relative to the
# directory the notebook is run from.
_training_file = "./data/en_gum-ud-train.conllu"
_testing_file = "./data/en_gum-ud-test.conllu"


def _load_data(f: str) -> List[Tuple[List[str], List[str]]]:
    """Read a CoNLL-U file into per-sentence (words, tags) pairs.

    Args:
        f: Path of the CoNLL-U file to read.

    Returns:
        One ``(words, tags)`` tuple per sentence, where ``words`` holds each
        token's ``form`` and ``tags`` the matching ``upos`` value, both in
        sentence order.
    """
    # CoNLL-U files are UTF-8; do not depend on the platform default encoding.
    # The handle gets its own name instead of shadowing the path argument.
    with open(f, encoding="utf-8") as conllu_file:
        sentences = conllu.parse(conllu_file.read())

    # Build the two parallel lists explicitly. A zip(*...) transpose would
    # collapse to an empty list for a sentence with no tokens, which cannot
    # be unpacked as `words, tags` by the code that consumes this data.
    return [
        (
            [token["form"] for token in sentence],
            [token["upos"] for token in sentence],
        )
        for sentence in sentences
    ]

training_data = _load_data(_training_file)
testing_data = _load_data(_testing_file)

# define word_to_idx, tag_to_idx, char_to_idx

# The vocabularies cover the train AND test splits, so every word, tag and
# character seen at evaluation time has an index.
# very memory inefficient, however it's ok because we're only
# dealing with a little bit of data
_all_sentences = training_data + testing_data

vocab = {word for words, _ in _all_sentences for word in words}
tags = {tag for _, sentence_tags in _all_sentences for tag in sentence_tags}
# Every character is kept, whitespace included: prep_data looks up each
# character of each word in char_to_idx, so a filtered-out character would
# have no index when a word containing it is encoded.
chars = {
    letter
    for words, _ in _all_sentences
    for word in words
    for letter in word
}

vocab_size = len(vocab)
tag_count = len(tags)
char_count = len(chars)

# Sets of strings iterate in an order that can change between interpreter
# sessions (string hash randomisation), so sort before numbering to give
# every item the same index on every run.
word_to_idx = {w: i for i, w in enumerate(sorted(vocab))}
# key=str keeps the sort well-defined should the parser ever hand back a
# non-string tag (e.g. None) -- NOTE(review): confirm whether that can occur.
tag_to_idx = {t: i for i, t in enumerate(sorted(tags, key=str))}
char_to_idx = {c: i for i, c in enumerate(sorted(chars))}


def prep_sequence(sent: List[str], to_idx: Dict[str, int]) -> torch.Tensor:
    """Convert a sequence of symbols into a 1-D tensor of integer indices.

    Args:
        sent: Symbols to encode (words, tags, or the characters of a word).
        to_idx: Mapping from each symbol to its integer index.

    Returns:
        A ``torch.long`` tensor with one index per element of ``sent``.

    Raises:
        KeyError: If a symbol is missing from ``to_idx``.
    """
    # Index directly rather than via .get(): a missing symbol then raises a
    # KeyError naming it, instead of putting None into the list and failing
    # inside torch.tensor with an unrelated-looking error.
    return torch.tensor([to_idx[symbol] for symbol in sent], dtype=torch.long)

def prep_data(
    words: List[str],
    tags: List[str],
    word_to_idx: Dict[str, int],
    tag_to_idx: Dict[str, int],
    char_to_idx: Dict[str, int],
) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
    """Encode one tagged sentence as index tensors.

    Args:
        words: The sentence's tokens.
        tags: The tag of each token, aligned with ``words``.
        word_to_idx: Index of every word.
        tag_to_idx: Index of every tag.
        char_to_idx: Index of every character.

    Returns:
        ``(word_idx, char_idx_list, tag_idx)``, in that order: the word
        indices, one character-index tensor per word, and the tag indices.
    """
    word_idx = prep_sequence(words, word_to_idx)
    # One tensor per word, because words differ in length.
    char_idx_list = [prep_sequence(list(word), char_to_idx) for word in words]
    tag_idx = prep_sequence(tags, tag_to_idx)
    return word_idx, char_idx_list, tag_idx

In [31]:
class LstmPosTagger(nn.Module):
    """Part-of-speech tagger combining word embeddings with a character LSTM.

    Each word is represented by its word embedding concatenated with the
    final hidden state of a character-level LSTM run over the word's
    characters. A word-level LSTM reads those vectors and a linear layer maps
    each of its hidden states to one score per tag.

    The model processes a single, unbatched sentence per call.
    """

    def __init__(
        self,
        embedding_dim: int,
        hidden_size: int,
        vocab_size: int,
        tag_count: int,
        char_count: int,
        char_embedding_dim: int,
        char_hidden_size: int,
        num_layers: int = 1,
    ):
        """Create the embedding, LSTM and output layers.

        Args:
            embedding_dim: Size of each word embedding.
            hidden_size: Hidden size of the word-level LSTM.
            vocab_size: Number of distinct words (rows of the word embedding).
            tag_count: Number of distinct tags (width of the output layer).
            char_count: Number of distinct characters (rows of the character
                embedding).
            char_embedding_dim: Size of each character embedding.
            char_hidden_size: Hidden size of the character-level LSTM.
            num_layers: Number of stacked layers, used for both LSTMs.
        """
        super().__init__()

        self._embedding_dim = embedding_dim
        self._hidden_size = hidden_size
        self._vocab_size = vocab_size
        self._tag_count = tag_count
        self._num_layers = num_layers
        self._char_count = char_count
        self._char_embedding_dim = char_embedding_dim
        self._char_hidden_size = char_hidden_size

        self.emb = nn.Embedding(
            num_embeddings=self._vocab_size,
            embedding_dim=self._embedding_dim,
        )

        self.char_emb = nn.Embedding(
            num_embeddings=self._char_count,
            embedding_dim=self._char_embedding_dim,
        )

        self.char_lstm = nn.LSTM(
            input_size=self._char_embedding_dim,
            hidden_size=self._char_hidden_size,
            num_layers=self._num_layers,
        )

        # The word-level LSTM sees the word embedding and the character
        # summary side by side, hence the summed input width.
        self.lstm = nn.LSTM(
            input_size=self._embedding_dim + self._char_hidden_size,
            hidden_size=self._hidden_size,
            num_layers=self._num_layers,
        )

        self.fc = nn.Linear(
            in_features=self._hidden_size,
            out_features=self._tag_count,
        )

    def forward(
        self,
        x: torch.Tensor,  # (seq_length,) word indices of one sentence
        x_chars: List[torch.Tensor],  # one (word_length,) index tensor per word
    ):
        """Score every tag for every word of one sentence.

        Args:
            x: Word indices of the sentence, shape ``(seq_length,)``.
            x_chars: For each word, in order, the indices of its characters.

        Returns:
            Unnormalised tag scores of shape ``(seq_length, tag_count)``.
        """
        word_vectors = self.emb(x)  # (seq_length, embedding_dim)

        # Summarise each word's spelling with the character LSTM. Words have
        # different lengths, so they are run one at a time.
        char_vectors = []
        for word_chars in x_chars:
            _, (h_n, _) = self.char_lstm(self.char_emb(word_chars))
            # h_n holds one final state per layer; keep the last layer's.
            char_vectors.append(h_n[-1])
        char_features = torch.stack(char_vectors)  # (seq_length, char_hidden_size)

        # Concatenating (rather than filling a pre-allocated float32 buffer)
        # lets the result follow the module's dtype and device.
        x_combined = torch.cat([word_vectors, char_features], dim=-1)

        all_hidden_states, _ = self.lstm(x_combined)
        # all_hidden_states: (seq_length, hidden_size)

        output = self.fc(all_hidden_states)  # (seq_length, tag_count)
        return output


def calculate_accuracy(
    model: LstmPosTagger,
    data: List[Tuple[List[str], List[str]]],
    word_to_idx: Dict[str, int],
    tag_to_idx: Dict[str, int],
) -> float:
    """Return the fraction of tokens in ``data`` that the model tags correctly.

    Args:
        model: The tagger to evaluate.
        data: Sentences as ``(words, tags)`` pairs.
        word_to_idx: Index of every word.
        tag_to_idx: Index of every tag.

    Returns:
        Correctly tagged tokens divided by total tokens, or ``0.0`` when
        ``data`` contains no tokens.

    Note:
        The character vocabulary is read from the notebook-level
        ``char_to_idx`` rather than being passed in, so that name must be
        defined before this function is called.
    """
    total = 0
    correct = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x_raw, tags_raw in data:
            x, x_chars, tags = prep_data(
                words=x_raw,
                tags=tags_raw,
                word_to_idx=word_to_idx,
                tag_to_idx=tag_to_idx,
                char_to_idx=char_to_idx,
            )
            scores = model(x, x_chars)
            # softmax is monotonic, so the highest raw score is already the
            # predicted tag; normalising first would not change the arg-max.
            tags_pred = scores.argmax(dim=-1)
            correct += (tags_pred == tags).sum().item()
            total += x.size(0)
    # Avoid a ZeroDivisionError when there is nothing to score.
    if total == 0:
        return 0.0
    return correct / total


In [34]:
# Word-level model sizes.
EMBEDDING_DIM = 30
HIDDEN_SIZE = 40

# Character-level model sizes.
CHAR_EMBEDDING_DIM = 5
CHAR_HIDDEN_SIZE = 5

# Optimisation settings.
LEARNING_RATE = 0.01
N_EPOCHS = 25


In [35]:
# Build the tagger from the hyper-parameter constants and the vocabulary
# sizes computed when the data was loaded.
model = LstmPosTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_size=HIDDEN_SIZE,
    vocab_size=vocab_size,
    tag_count=tag_count,
    char_count=char_count,
    char_embedding_dim=CHAR_EMBEDDING_DIM,
    char_hidden_size=CHAR_HIDDEN_SIZE,
)
# cross_entropy takes the model's raw scores and the integer tag indices.
loss_function = F.cross_entropy
optimiser = optim.SGD(model.parameters(), lr=LEARNING_RATE)

n_training_points = len(training_data)

# NOTE: the loop variables below are notebook globals and stay bound to the
# last sentence once the loop ends. In particular `tags` is rebound here from
# the tag set built with the vocabularies to a tensor of tag indices.
for epoch in range(N_EPOCHS):
    cum_loss = 0  # sum of the per-sentence losses in this epoch
    # One optimiser step per sentence (no batching).
    for x_raw, tags_raw in training_data:
        x, x_chars, tags = prep_data(
            words=x_raw,
            tags=tags_raw,
            word_to_idx=word_to_idx,
            tag_to_idx=tag_to_idx,
            char_to_idx=char_to_idx,
        )
        optimiser.zero_grad()
        tags_pred = model(x, x_chars)
        loss = loss_function(tags_pred, tags)
        loss.backward()
        optimiser.step()
        cum_loss += loss.item()

    # Re-score both splits in full after every epoch.
    train_acc = calculate_accuracy(
        model,
        training_data,
        word_to_idx,
        tag_to_idx,
    )
    test_acc = calculate_accuracy(
        model,
        testing_data,
        word_to_idx,
        tag_to_idx,
    )
    # The reported loss is the mean per-sentence loss for the epoch.
    print(
        f"epoch: {epoch}, "
        f"loss: {cum_loss/n_training_points: 0.2f}, "
        f"test acc: {test_acc:0.2f}, "
        f"train acc: {train_acc:0.2f}"
    )


In [27]:
# Scratch: embed one sentence by hand (presumably to inspect the shapes used
# in LstmPosTagger.forward). x_raw is not defined in this cell; it is whatever
# the training loop's last iteration left behind -- NOTE(review): set it
# explicitly if this cell should run on a fresh kernel.
x_idx = prep_sequence(x_raw, word_to_idx)
x = model.emb(x_idx)

In [44]:
# Scratch: allocate the combined (word + character) feature matrix for the
# embedded sentence in x, one row per word.
seq, _ = x.size()
# The character part's width comes from CHAR_HIDDEN_SIZE rather than a bare 5,
# so it tracks the model configuration.
x_comb = torch.zeros((seq, EMBEDDING_DIM + CHAR_HIDDEN_SIZE))

In [45]:
# Scratch: copy the word embeddings into the first EMBEDDING_DIM columns of
# the zero-initialised x_comb; the remaining columns are left untouched.
x_comb[:, :EMBEDDING_DIM] = x