# Sequence Models

In [85]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def num_parameters(model):
    """Return the total number of scalar values in the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is False (frozen weights) are not counted.
    """
    total = 0
    for weight in model.parameters():
        if weight.requires_grad:
            total += weight.numel()
    return total

#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Getting started

Let's start with an input sequence.

In [75]:
sentence = "The quick brown fox jumped over the lazy dogs."

Let's turn it into numbers by using the Unicode code point for each character.

In [76]:
# One code point per character, then add a leading batch dimension of size 1.
sentence_tensor = torch.tensor([ord(ch) for ch in sentence]).unsqueeze(0)
sentence_tensor

tensor([[ 84, 104, 101,  32, 113, 117, 105,  99, 107,  32,  98, 114, 111, 119,
         110,  32, 102, 111, 120,  32, 106, 117, 109, 112, 101, 100,  32, 111,
         118, 101, 114,  32, 116, 104, 101,  32, 108,  97, 122, 121,  32, 100,
         111, 103, 115,  46]])

In [77]:
def decode(x):
    """Turn a 1-D tensor of Unicode code points back into a string.

    Args:
        x: 1-D integer tensor; each element is the code point of one character.

    Returns:
        The string made of the corresponding characters, in order.
    """
    # .tolist() yields plain Python ints and, unlike .numpy(), does not require
    # the tensor to live on the CPU or NumPy to be installed.
    return ''.join(chr(code_point) for code_point in x.tolist())
decode(sentence_tensor[0])

'The quick brown fox jumped over the lazy dogs.'

We'll make this an autoregressive language model: at each position, the goal is to predict the next character. So the targets are the input sequence shifted left by one, which pairs each input character with the character that follows it.

In [80]:
# Inputs drop the last character and targets drop the first, so position i of
# `targets` holds the character that follows position i of `input_ids`.
input_ids, targets = sentence_tensor[:, :-1], sentence_tensor[:, 1:]
assert targets.shape == input_ids.shape

Now let's make those numbers into vectors using an embedding. Note that we're just going to use the random initialization right now; we're not yet training this model.

In [81]:
n_vocab, emb_dim = 256, 5
# The embedding table has one row of length emb_dim per vocabulary entry.
embedder = nn.Embedding(num_embeddings=n_vocab, embedding_dim=emb_dim)
embedder.weight.shape

torch.Size([256, 5])

Now we compute the embeddings for our string. **Make sure you can explain the shape of this result.**

In [82]:
embeddings = embedder(input_ids)
embeddings.shape

torch.Size([1, 45, 5])

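Now we'll define the *output* linear layer. We'll *tie the weights* with the embedding layer, i.e., the output layer reuses the embedding matrix as its own weight matrix instead of learning a separate one.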

In [83]:
lm_head = nn.Linear(in_features=emb_dim, out_features=n_vocab)
# The Linear weight and the embedding table must have the same shape before
# they can share a single Parameter.
assert embedder.weight.shape == lm_head.weight.shape
lm_head.weight = embedder.weight

Here's what the output of the model will look like. We haven't trained anything yet, though, so this will be garbage.

In [84]:
x = embeddings # pretend that this is the model...
logits = lm_head(x)
logits.shape

torch.Size([1, 45, 256])

Then we'll compute the cross-entropy loss as usual:

In [89]:
# F.cross_entropy expects the class dimension second, so move n_vocab ahead of
# the sequence axis. reduction='none' keeps one loss value per position.
loss = F.cross_entropy(logits.permute(0, 2, 1), targets, reduction='none')
loss.shape

torch.Size([1, 45])

## Feed-Forward Network

Here's the simplest model we can make: a multi-layer perceptron. Fill in the blanks here so our model has *one hidden layer* with a *relu activation* (`nn.ReLU`).

- The output should have the same dimensionality as the embeddings
- Set `n_hidden` so that there are about 180 parameters.

In [70]:
# One hidden layer: emb_dim -> n_hidden -> emb_dim, with a ReLU in between.
# Parameter count: (emb_dim * n_hidden + n_hidden) + (n_hidden * emb_dim + emb_dim),
# which is 11 * n_hidden + 5 for emb_dim = 5, so n_hidden = 16 gives 181.
n_hidden = 16
mlp = nn.Sequential(
    nn.Linear(emb_dim, n_hidden),
    nn.ReLU(),
    nn.Linear(n_hidden, emb_dim),
)

num_parameters(mlp)

181

In [71]:
output = mlp(embeddings)
output.shape

torch.Size([1, 46, 5])

How fast is this model?

In [72]:
%timeit mlp(embeddings)

44.1 µs ± 513 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## GRU

In [6]:
# batch_first=True makes the GRU take (batch, sequence, feature) inputs, the
# same layout as `embeddings`.
gru = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)

In [7]:
output, hidden = gru(embeddings)
output.shape, hidden.shape

(torch.Size([1, 46, 5]), torch.Size([1, 1, 5]))

Note that `hidden` is just the output at the last time step (this holds because this GRU has a single layer and is unidirectional).

In [8]:
(output[:, -1, :] == hidden).all()

tensor(True)

In [9]:
%timeit gru(embeddings)

1.76 ms ± 61.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
num_parameters(gru)

180

## Convolution

In [25]:
# Each of the emb_dim output channels looks at a window of 7 positions across
# all emb_dim input channels.
conv = nn.Conv1d(in_channels=emb_dim, out_channels=emb_dim, kernel_size=7)

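Unfortunately, `Conv1d` expects its input laid out as (batch size, channels, sequence length), with channels *before* the sequence dimension, so we have to transpose our (batch size, sequence length, channels) embeddings first.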

In [18]:
# Swap the sequence and channel axes: (batch, seq, emb) -> (batch, emb, seq).
embeddings_for_conv = embeddings.permute(0, 2, 1)
embeddings_for_conv.shape

torch.Size([1, 5, 46])

In [19]:
output = conv(embeddings_for_conv)

In [20]:
output.shape

torch.Size([1, 5, 43])

In [21]:
%timeit conv(embeddings_for_conv)

39.9 µs ± 3.27 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [26]:
# Use the shared helper so every model in this notebook is counted the same way
# (trainable parameters only).
num_parameters(conv)

180

## Transformer

In [27]:
# A single encoder layer with one attention head; the feed-forward width is set
# to emb_dim rather than left at the library default.
xformer = nn.TransformerEncoderLayer(
    d_model=emb_dim, nhead=1, dim_feedforward=emb_dim, batch_first=True
)

In [28]:
output = xformer(embeddings)
output.shape

torch.Size([1, 46, 5])

In [29]:
%timeit xformer(embeddings)

592 µs ± 47.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [31]:
# Use the shared helper so every model in this notebook is counted the same way
# (trainable parameters only).
num_parameters(xformer)

200