In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

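A quick refresher on `torch.zeros`. The arguments used below create a tensor of shape (1, 3): a single row holding three zeros. Calling `view(1, 1, -1)` on it then gives a 3-D tensor of shape (1, 1, 3) with the same three elements.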

In [2]:
# A (1, 3) tensor of zeros, and the same three elements viewed as (1, 1, 3).
test_input = torch.zeros(1, 3)
test_input_view = test_input.view(1, 1, -1)

# Report each tensor with its shape and number of dimensions; the two reports
# are separated by print('\n'), exactly as in a hand-written sequence of prints.
for report_index, shown_tensor in enumerate((test_input, test_input_view)):
    if report_index > 0:
        print('\n')
    print(shown_tensor)
    print('Shape:', shown_tensor.shape)
    print('ndim', shown_tensor.ndim)


tensor([[0., 0., 0.]])
Shape: torch.Size([1, 3])
ndim 2


tensor([[[0., 0., 0.]]])
Shape: torch.Size([1, 1, 3])
ndim 3


Create the LSTM.<br><br> Note: PyTorch has a random number generator (RNG) whose state changes every time a function or method that requires a random number is called. It turns out that the LSTM class uses random numbers. So, to get an LSTM object with the same state each time we test, we will use the torch.manual_seed function. Note: this function must be called prior to every PyTorch function or method that uses random numbers if you want the same results each time. This is because a call to the RNG changes its internal state, so the next caller will not get the same results. That is why I chose not to use torch.randn() when creating the test inputs and initializing the hidden state. If I did, I would have to call the torch.manual_seed function before every single call to torch.randn(), which would be messy and is also hard to do in the comprehension that is used to create my sample inputs. You can get an object that represents the state of the random number generator by calling torch.get_rng_state().

In [3]:
# Create the LSTM. The first argument is the size of each input vector, the
# second is the size of the hidden state. Seed first so the randomly
# initialized LSTM is the same on every run.
torch.manual_seed(42)
lstm = nn.LSTM(3, 3)

# Create the input: a list of length 5 whose elements are (1, 3) tensors of zeros.
inputs = [torch.zeros(1, 3) for _ in range(5)]

# Initialize the state handed to the LSTM: a tuple of two (1, 1, 3) zero tensors.
hidden = (torch.zeros(1, 1, 3), torch.zeros(1, 1, 3))

# Step through the sequence one element at a time. Each element is reshaped to
# (1, 1, 3) before the call, and the state returned by one step is fed into the
# next, so after the loop `hidden` holds the state that follows the last element.
for step_input in inputs:
    out, hidden = lstm(step_input.view(1, 1, -1), hidden)

print('out:', out)
print('\n')
print('hidden:', hidden)

out: tensor([[[0.1731, 0.0875, 0.0087]]], grad_fn=<StackBackward0>)


hidden: (tensor([[[0.1731, 0.0875, 0.0087]]], grad_fn=<StackBackward0>), tensor([[[0.3653, 0.1827, 0.0237]]], grad_fn=<StackBackward0>))


In [None]:
# Alternatively, we can feed the entire sequence to the LSTM in a single call.
# The first value returned by the LSTM ("out") holds the hidden states for all
# steps of the sequence; the second ("hidden") is just the most recent state.
# Compare the last slice of "out" with "hidden" below: they are the same.
# Both are returned because:
#   * "out" gives you access to all hidden states in the sequence;
#   * "hidden" lets you continue the sequence and backpropagate, by passing it
#     as an argument to the LSTM at a later time.

# Add the extra 2nd dimension: five (1, 3) tensors become one (5, 1, 3) tensor.
# list(...) keeps this cell re-runnable: once `inputs` is already the stacked
# tensor, iterating it yields the same per-step (1, 3) slices, whereas
# torch.cat() applied directly to a tensor raises.
inputs = torch.cat(list(inputs)).view(len(inputs), 1, -1)

# Clean out the hidden state. Zeros rather than torch.randn: an unseeded random
# state would make this cell's output differ on every run, and zeros match the
# initial state used by the step-by-step loop over the same inputs.
hidden = (torch.zeros(1, 1, 3), torch.zeros(1, 1, 3))
out, hidden = lstm(inputs, hidden)

print('out:', out)
print('\n')
print('hidden:', hidden)

### Part of Speech Tagging Sample

In [4]:
# Hyperparameters for the part-of-speech tagger.
# The two dimensions will usually be more like 32 or 64; they are kept small
# here so we can see how the weights change as we train.
EMBEDDING_DIM = 6  # size of each word-embedding vector
HIDDEN_DIM = 6     # size of the LSTM hidden state
LR = 0.1           # learning rate given to the optimizer
EPOCHS = 100       # number of passes over the training data

In [5]:
# Toy corpus: every entry pairs a tokenized sentence with one tag per token.
training_data = [
    (
        "The dog ate the apple".split(),
        ['Determiner', 'Noun', 'Verb', 'Determiner', 'Noun'],
    ),
    (
        "Everybody read that book".split(),
        ['Noun', 'Verb', 'Determiner', 'Noun'],
    ),
]

# Give each distinct word the next free index, in order of first appearance.
# setdefault() only inserts when the word has not been assigned an index yet.
word_to_ix = {}
for tokens, _token_tags in training_data:
    for token in tokens:
        word_to_ix.setdefault(token, len(word_to_ix))

# Tag vocabulary: ix_to_tag maps index -> tag name, tag_to_ix is its inverse.
ix_to_tag = ['Determiner', 'Noun', 'Verb']
tag_to_ix = {tag_name: tag_index for tag_index, tag_name in enumerate(ix_to_tag)}


def prepare_sequence(seq: list, word_to_ix: dict) -> torch.Tensor:
    """Encode the items of ``seq`` as an int64 tensor of their indices.

    Args:
        seq: Items to encode, in order.
        word_to_ix: Mapping from item to integer index. Despite the name, the
            notebook also passes the tag mapping here to encode target tags.

    Returns:
        A tensor with dtype ``torch.int64`` holding one index per item.

    Raises:
        KeyError: If an item of ``seq`` is not a key of ``word_to_ix``.
    """
    indices = []
    for item in seq:
        indices.append(word_to_ix[item])
    return torch.tensor(indices, dtype=torch.int64)


def translate_predictions(tag_scores):
    """Return the name of the highest-scoring tag for each row of ``tag_scores``.

    The argmax is taken along dim 1, so each row is treated as the scores of one
    word across all tags. The winning column index is translated to a tag name
    through the module-level ``ix_to_tag`` list.
    """
    best_tag_per_row = tag_scores.argmax(dim=1)
    return [ix_to_tag[tag_index] for tag_index in best_tag_per_row.tolist()]
    

# Show the three lookup tables, one per line.
print(word_to_ix, tag_to_ix, ix_to_tag, sep='\n')

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
{'Determiner': 0, 'Noun': 1, 'Verb': 2}
['Determiner', 'Noun', 'Verb']


In [6]:
class LSTMTagger(nn.Module):
    """Sequence tagger: word embedding -> LSTM -> linear layer -> log-softmax.

    Args:
        embedding_dim: Size of each word-embedding vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of entries in the embedding table.
        tagset_size: Number of scores produced per word.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # The layers are created in a fixed order (embedding, LSTM, linear):
        # each constructor draws random initial weights, so reordering them
        # would change the weights obtained after a given torch.manual_seed().

        # Lookup table from word index to embedding vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes word embeddings and produces hidden states of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects each hidden state from hidden space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Score every tag for every word of ``sentence``.

        Args:
            sentence: Tensor of word indices, as consumed by the embedding layer.

        Returns:
            Tensor with one row per word and one column per tag, holding
            log-softmax scores normalized across each row.
        """
        seq_len = len(sentence)
        embeds = self.word_embeddings(sentence)

        # The LSTM is fed the sentence as (seq_len, 1, embedding_dim); the final
        # LSTM state it also returns is not needed here.
        lstm_out, _ = self.lstm(embeds.view(seq_len, 1, -1))

        # Drop the middle dimension again so there is one row per word.
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
        return F.log_softmax(tag_space, dim=1)

### Create the Model, Loss Function, and Optimizer

In [None]:
# Seed before building the model so its random initial weights are repeatable.
torch.manual_seed(42)
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
)

# The model returns log-softmax scores, so negative log-likelihood is the loss.
loss_function = nn.NLLLoss()

# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=LR)


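### Predictions Before Training

> **Note:** the saved output of the cell below is stale. Its execution count (`In [11]`) is later than that of the training cell (`In [8]`), and the scores it shows are identical to the post-training scores. Restart the kernel and run all cells in order to see the predictions of the untrained model.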

In [11]:

def print_prediction(sentence: list) -> None:
    """Print ``sentence``, the tags the model predicts for it, and the raw scores.

    Uses the module-level ``model`` and ``word_to_ix``. The forward pass runs
    under ``torch.no_grad()`` because nothing is trained here.

    Args:
        sentence: List of words; each must be a key of ``word_to_ix``.
    """
    with torch.no_grad():
        tag_scores = model(prepare_sequence(sentence, word_to_ix))
        predicted_tags = translate_predictions(tag_scores)

    # One item per line: the words, the predicted tag names, the score tensor.
    print(sentence, predicted_tags, tag_scores, sep='\n')

# See what the scores are before training, using the first training sentence.
# Element [i, j] of the printed tensor is the score of tag j for word i.
# Nothing is trained here: print_prediction runs the model under torch.no_grad().
first_sentence = training_data[0][0]
print_prediction(first_sentence)

['The', 'dog', 'ate', 'the', 'apple']
['The', 'dog', 'ate', 'the', 'apple']
['Determiner', 'Noun', 'Verb', 'Determiner', 'Noun']
tensor([[-0.0259, -4.5027, -4.2358],
        [-4.5316, -0.0617, -3.0154],
        [-2.6551, -2.9622, -0.1301],
        [-0.1583, -3.9917, -2.0562],
        [-4.2915, -0.0241, -4.5954]])


### Train the Model

In [8]:
# Again, normally you would NOT run this many epochs (EPOCHS); it is toy data.
for epoch in range(EPOCHS):
    for words, gold_tags in training_data:
        # PyTorch accumulates gradients across backward() calls, so they are
        # cleared before every training instance.
        model.zero_grad()

        # Turn the words and their gold tags into tensors of indices.
        sentence_in = prepare_sequence(words, word_to_ix)
        targets = prepare_sequence(gold_tags, tag_to_ix)

        # Forward pass: one row of tag scores per word.
        tag_scores = model(sentence_in)

        # Compute the loss, backpropagate, and let the optimizer update the
        # parameters.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    

### Predictions After Training

In [9]:
# Score the first training sentence again, now that the model has been trained.
# Inference only, so the forward pass runs under torch.no_grad().
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    predictions = translate_predictions(tag_scores)

# One item per line: the words, the predicted tag names, the score tensor.
print(training_data[0][0], predictions, tag_scores, sep='\n')

['The', 'dog', 'ate', 'the', 'apple']
['Determiner', 'Noun', 'Verb', 'Determiner', 'Noun']
tensor([[-0.0259, -4.5027, -4.2358],
        [-4.5316, -0.0617, -3.0154],
        [-2.6551, -2.9622, -0.1301],
        [-0.1583, -3.9917, -2.0562],
        [-4.2915, -0.0241, -4.5954]])
