Following [this tutorial](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html)

In [18]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)


<torch._C.Generator at 0x7f7a3f7fdf70>

In [24]:
embedding_dim = 3
hidden_size = 3
sequence_length = 5

lstm = nn.LSTM(embedding_dim, hidden_size)
inputs = [torch.randn(1, embedding_dim) for _ in range(sequence_length)]

hidden = (torch.randn(1, 1, embedding_dim), torch.randn(1, 1, embedding_dim))


for i in inputs:
    out, hidden = lstm(i.view(1, 1, -1), hidden)

In [25]:
# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
out, (h_n, c_n) = lstm(inputs, hidden)
print(out)
print(h_n)
print(c_n)

tensor([[[ 0.2490, -0.0525,  0.3253]],

        [[ 0.1655, -0.0304,  0.3348]],

        [[-0.1104, -0.1085,  0.7568]],

        [[-0.0148, -0.0855,  0.4162]],

        [[ 0.0703, -0.1089,  0.2071]]], grad_fn=<MkldnnRnnLayerBackward0>)
tensor([[[ 0.0703, -0.1089,  0.2071]]], grad_fn=<StackBackward0>)
tensor([[[ 0.2099, -0.3541,  0.9947]]], grad_fn=<StackBackward0>)


In [21]:
out.size()

torch.Size([5, 1, 3])

In [22]:
hidden.size()

AttributeError: 'tuple' object has no attribute 'size'

# LSTM for Part-of-Speech Tagging

Yoinked the UniversalDependencies training data set from [here](https://github.com/UniversalDependencies/UD_English-GUM)

In [85]:
# Load training and testing data

from typing import List, Tuple, Callable, Dict
import conllu


_training_file = "./data/en_gum-ud-train.conllu"
_testing_file = "./data/en_gum-ud-test.conllu"


def _load_data(f: str) -> List[Tuple[List[str], List[str]]]:
    with open(f) as f:
        data = conllu.parse(f.read())

    return [
        list(zip(
            *[(t["form"], t["upos"])
            for t in token_list]
        ))
        for token_list in data
    ]

training_data = _load_data(_training_file)
testing_data = _load_data(_testing_file)

# define word_to_idx, tag_to_idx

# very memory inefficient, however it's ok because we're only 
# dealing with a little bit of data
vocab = set(
    [x for words, _ in training_data for x in words]
    + [x for words, _ in testing_data for x in words]
)
tags = set(
    [x for _, tags in training_data for x in tags]
    + [x for _, tags in testing_data for x in tags]
)

vocab_size = len(vocab)
tag_count = len(tags)

word_to_idx = {w: i for i, w in enumerate(vocab)}
tag_to_idx = {t: i for i, t in enumerate(tags)}


def prep_sequence(sent: List[str], to_idx: Callable[[str], int]):
    return torch.tensor(list(map(to_idx.get, sent)), dtype=torch.long)

In [86]:
class LstmPosTagger(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        hidden_size: int,
        vocab_size: int,
        tag_count: int,
        num_layers: int = 1,
    ):
        super().__init__()

        self._embedding_dim = embedding_dim
        self._hidden_size = hidden_size
        self._vocab_size = vocab_size
        self._tag_count = tag_count
        self._num_layers = num_layers

        self.emb = nn.Embedding(
            num_embeddings=self._vocab_size,
            embedding_dim=self._embedding_dim,
        )

        self.lstm = nn.LSTM(
            input_size=self._embedding_dim,
            hidden_size=self._hidden_size,
            num_layers=self._num_layers,
        )

        self.fc = nn.Linear(
            in_features=self._hidden_size,
            out_features=self._tag_count,
        )

    def forward(
        self,
        x: torch.Tensor,  # (seq_length, batch_size, 1)
    ):
        x = self.emb(x)  # (seq_length, batch_size, emb_dim)
        all_hidden_states, (_final_hidden_state, _final_cell_state) = self.lstm(x)
        # all_hidden_states (seq_length, batch_size, hidden_size)

        output = self.fc(all_hidden_states)  # (seq_length, batch_size, tag_count)
        return output


def calculate_accuracy(
    model: LstmPosTagger,
    data: List[Tuple[List[str], List[str]]],
    word_to_idx: Dict[str, int],
    tag_to_idx: Dict[str, int],
) -> float:
    total = 0
    correct = 0
    with torch.no_grad():
        for x_raw, tags_raw in data:
            x = prep_sequence(x_raw, word_to_idx)
            tags = prep_sequence(tags_raw, tag_to_idx)
            output = model(x)
            logits = F.softmax(output, dim=-1)
            _, tags_pred = torch.max(logits, dim=-1)
            correct += (tags_pred == tags).sum().item()
            total += x.size(0)
    return correct / total



In [91]:
EMBEDDING_DIM = 30
HIDDEN_DIM = 40
LEARNING_RATE = 0.01
N_EPOCHS = 25


In [92]:
model = LstmPosTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_size=HIDDEN_DIM,
    vocab_size=vocab_size,
    tag_count=tag_count,
)
loss_function = F.cross_entropy
optimiser = optim.SGD(model.parameters(), lr=LEARNING_RATE)

n_training_points = len(training_data)

for epoch in range(N_EPOCHS):
    cum_loss = 0
    for x_raw, tags_raw in training_data:
        x = prep_sequence(x_raw, word_to_idx)
        tags = prep_sequence(tags_raw, tag_to_idx)

        optimiser.zero_grad()
        tags_pred = model(x)
        loss = loss_function(tags_pred, tags)
        loss.backward()
        optimiser.step()
        cum_loss += loss.item()

    train_acc = calculate_accuracy(
        model,
        training_data,
        word_to_idx,
        tag_to_idx,
    )
    test_acc = calculate_accuracy(
        model,
        testing_data,
        word_to_idx,
        tag_to_idx,
    )
    print(
        f"epoch: {epoch}, "
        f"loss: {cum_loss/n_training_points: 0.2f}, "
        f"test acc: {test_acc:0.2f}, "
        f"train acc: {train_acc:0.2f}"
    )


epoch: 0, loss:  2.25, test acc: 0.38, train acc: 0.37
epoch: 1, loss:  1.77, test acc: 0.51, train acc: 0.51
epoch: 2, loss:  1.50, test acc: 0.56, train acc: 0.56
epoch: 3, loss:  1.33, test acc: 0.59, train acc: 0.60
epoch: 4, loss:  1.21, test acc: 0.62, train acc: 0.62
epoch: 5, loss:  1.12, test acc: 0.63, train acc: 0.65
epoch: 6, loss:  1.06, test acc: 0.65, train acc: 0.66
epoch: 7, loss:  1.00, test acc: 0.66, train acc: 0.68
epoch: 8, loss:  0.96, test acc: 0.67, train acc: 0.69
epoch: 9, loss:  0.93, test acc: 0.68, train acc: 0.70
epoch: 10, loss:  0.90, test acc: 0.69, train acc: 0.70
epoch: 11, loss:  0.87, test acc: 0.69, train acc: 0.71
epoch: 12, loss:  0.84, test acc: 0.70, train acc: 0.71
epoch: 13, loss:  0.82, test acc: 0.70, train acc: 0.72
epoch: 14, loss:  0.80, test acc: 0.70, train acc: 0.72
epoch: 15, loss:  0.78, test acc: 0.71, train acc: 0.73
epoch: 16, loss:  0.77, test acc: 0.71, train acc: 0.73
epoch: 17, loss:  0.75, test acc: 0.71, train acc: 0.74
ep