# Sequence Models

In [92]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from IPython.display import display

def highlight_values(x):
    """Display a 2D tensor as a table with a background color gradient.

    Singleton dimensions (such as a batch dimension of size 1) are collapsed
    first; what remains must be exactly two-dimensional.

    Args:
        x: a tensor that is 2D once its singleton dimensions are removed. It may
            require grad or live on a non-CPU device: it is detached and moved
            to the CPU before being converted to NumPy.

    Raises:
        AssertionError: if the squeezed tensor is not 2D.
    """
    x = x.squeeze()  # collapse singleton dimensions, like batch.
    assert len(x.shape) == 2, "Can only handle 2D data"
    # Tensor.numpy() refuses tensors that require grad or live off-CPU, so detach and
    # move to the CPU first (both are cheap for the plain CPU tensors used here).
    values = x.detach().cpu().numpy()
    # axis=None scales the colors over the whole table rather than per row or column.
    display(pd.DataFrame(values).style.background_gradient(axis=None))

def num_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters whose `requires_grad` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def time_trial(model, embeddings, concat_on_axis=1):
    """Time the forward pass of `model` on inputs of increasing sequence length.

    For each of 1, 2, 4, 8 and 16 repetitions, `embeddings` is concatenated with
    itself along `concat_on_axis`, the resulting shape is printed, and `model(x)`
    is timed with the IPython `%timeit` magic (`-r 3`: three runs).

    Args:
        model: a callable applied to the concatenated tensor.
        embeddings: the tensor to repeat.
        concat_on_axis: the axis along which the copies are concatenated; callers
            pass whichever axis holds the sequence for the model being timed.

    Note: relies on `%timeit`, so this only runs under IPython/Jupyter.
    """
    for i in range(5):
        num_reps = 2 ** i  # 1, 2, 4, 8, 16
        x = torch.cat([embeddings] * num_reps, axis=concat_on_axis)
        print(f'{num_reps} repetitions of the original sequence, shape = {tuple(x.shape)}')
        # %timeit prints its own report; this function returns nothing.
        %timeit -r 3 model(x)


## Getting started

Let's start with an input sequence.

In [78]:
sentence = "The quick brown fox jumped over the lazy dogs."

Let's turn it into numbers by using the Unicode code point for each character.

In [79]:
# One row (a batch of size 1) holding the Unicode code point of every character.
sentence_tensor = torch.tensor([list(map(ord, sentence))])
sentence_tensor

tensor([[ 84, 104, 101,  32, 113, 117, 105,  99, 107,  32,  98, 114, 111, 119,
         110,  32, 102, 111, 120,  32, 106, 117, 109, 112, 101, 100,  32, 111,
         118, 101, 114,  32, 116, 104, 101,  32, 108,  97, 122, 121,  32, 100,
         111, 103, 115,  46]])

In [80]:
def decode(x):
    """Turn a 1D tensor of Unicode code points back into a string.

    Args:
        x: a 1D integer tensor; each element is one character's code point.

    Returns:
        The string made of the corresponding characters, in order.
    """
    # tolist() yields plain Python ints on any device, and a separate loop name avoids
    # shadowing the argument `x`.
    return ''.join(chr(code_point) for code_point in x.tolist())
decode(sentence_tensor[0])

'The quick brown fox jumped over the lazy dogs.'

We'll make this an autoregressive language model, so our goal will be to predict the next character. To do that, we shift the targets one position to the left, so that at each position the model should output the character that comes next.

In [81]:
# The target at each position is the character that follows the input at that position:
# the targets drop the first character, the inputs drop the last one.
targets = sentence_tensor[:, 1:]
input_ids = sentence_tensor[:, :-1]
assert targets.shape == input_ids.shape

Now let's make those numbers into vectors using an embedding. Note that we're just going to use the random initialization right now; we're not yet training this model.

In [82]:
# Seed the global RNG so that the random initialization (and every number derived from
# it further down) is the same on each Restart & Run All.
torch.manual_seed(0)

n_vocab = 256  # number of rows in the embedding table: one per code point 0..255
emb_dim = 5    # size of each embedding vector
embedder = nn.Embedding(n_vocab, emb_dim)
embedder.weight.shape

torch.Size([256, 5])

Now we compute the embeddings for our string. **Make sure you can explain the shape of this result.**

In [83]:
# Look up one embedding vector for each input character.
embeddings = embedder(input_ids)
# Keep the gradient on this intermediate (non-leaf) tensor so it can be inspected after backward().
embeddings.retain_grad()
embeddings.shape

torch.Size([1, 45, 5])

Now we'll define the *output* linear layer. We'll *tie the weights* with the embedding layer.

In [84]:
# nn.Linear stores its weight as (out_features, in_features), which is the same layout
# as the embedding table, so the two layers can share a single Parameter.
lm_head = nn.Linear(in_features=emb_dim, out_features=n_vocab)
assert embedder.weight.shape == lm_head.weight.shape
lm_head.weight = embedder.weight

Here's what the output of the model will look like. We haven't trained anything yet, though, so this will be garbage.

In [85]:
x = embeddings # pretend that this is the model...
logits = lm_head(x)
logits.shape

torch.Size([1, 45, 256])

Then we'll compute the cross-entropy loss as usual.

Note: `F.cross_entropy` expects the class scores on dimension 1, i.e., `logits` shaped (batch, classes, time). So we need to `transpose` the `logits` to put the time steps on the last dimension, where they line up with `targets`. I suspect PyTorch uses this convention because it extends to `targets` with more dimensions (e.g., 2D images), but I admit I'm not entirely sure why.

In [86]:
loss = F.cross_entropy(logits.transpose(1, 2), targets, reduction='none')
loss.shape

torch.Size([1, 45])

## Feed-Forward Network

Here's the simplest model we can make: a multi-layer perceptron. Fill in the blanks here so our model has *one hidden layer* with a *relu activation* (`nn.ReLU`).

- The output should have the same dimensionality as the embeddings
- Set `n_hidden` so that there are about 180 parameters.

### Create the model

In [87]:
# n_hidden = ...
# With emb_dim = 5, a hidden size of 16 gives (5*16 + 16) + (16*5 + 5) = 181 parameters.
n_hidden = 16
mlp = nn.Sequential(
    # nn.Linear(in_features=..., out_features=n_hidden),
    nn.Linear(emb_dim, n_hidden),  # embedding -> hidden
    # nn...,
    nn.ReLU(),
    # nn...
    nn.Linear(n_hidden, emb_dim),  # hidden -> back to the embedding size
)

num_parameters(mlp)

181

### Check its output shape

In [88]:
output = mlp(embeddings)
output.shape

torch.Size([1, 45, 5])

### Check its speed

In [89]:
time_trial(mlp, embeddings)

1 repetitions of the original sequence, shape = (1, 45, 5)
46.7 µs ± 2.04 µs per loop (mean ± std. dev. of 3 runs, 10000 loops each)
2 repetitions of the original sequence, shape = (1, 90, 5)
49 µs ± 2.06 µs per loop (mean ± std. dev. of 3 runs, 10000 loops each)
4 repetitions of the original sequence, shape = (1, 180, 5)
50.6 µs ± 1.94 µs per loop (mean ± std. dev. of 3 runs, 10000 loops each)
8 repetitions of the original sequence, shape = (1, 360, 5)
55.3 µs ± 1.23 µs per loop (mean ± std. dev. of 3 runs, 10000 loops each)
16 repetitions of the original sequence, shape = (1, 720, 5)
71.1 µs ± 1.8 µs per loop (mean ± std. dev. of 3 runs, 10000 loops each)


### Check how gradients flow

In [93]:
# Recreate the embeddings so this cell starts from a fresh tensor with no stored gradient.
embeddings = embedder(input_ids)
embeddings.retain_grad() # Tell Torch we want to know what the gradients are here.
# Pass the embeddings through the model and language modeling head.
output = mlp(embeddings)
logits = lm_head(output)
# Compute the loss, keeping one value per position (reduction='none') so that we can
# backpropagate from a single position below.
loss = F.cross_entropy(logits.transpose(1, 2), targets, reduction='none')
# Let the model learn from one single character (the one at index 20)
loss[0, 20].backward()
# Show the gradient with respect to each input embedding: one row per position.
highlight_values(embeddings.grad)
# Show how big the gradient is at each position (the norm over the embedding dimension).
embeddings.grad.norm(dim=2)

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0


tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.2818, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])

## GRU

In [14]:
gru = nn.GRU(emb_dim, emb_dim, batch_first=True)

In [15]:
output, hidden = gru(embeddings)
output.shape, hidden.shape

(torch.Size([1, 45, 5]), torch.Size([1, 1, 5]))

Note that `hidden` is just another name for the output at the last time step (this holds because this GRU has a single layer and runs in one direction).

In [16]:
(output[:, -1, :] == hidden).all()

tensor(True)

In [30]:
time_trial(gru, embeddings)

1 reps, shape = (1, 45, 5)
1.61 ms ± 9.9 µs per loop (mean ± std. dev. of 3 runs, 1000 loops each)
2 reps, shape = (1, 90, 5)
3.14 ms ± 32.1 µs per loop (mean ± std. dev. of 3 runs, 100 loops each)
4 reps, shape = (1, 180, 5)
6.51 ms ± 67.5 µs per loop (mean ± std. dev. of 3 runs, 100 loops each)
8 reps, shape = (1, 360, 5)
14.7 ms ± 92.2 µs per loop (mean ± std. dev. of 3 runs, 100 loops each)
16 reps, shape = (1, 720, 5)
32.6 ms ± 658 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


In [18]:
num_parameters(gru)

180

## Convolution

In [19]:
conv = nn.Conv1d(emb_dim, emb_dim, kernel_size=7)

Unfortunately, `nn.Conv1d` expects its input to be shaped (batch size, *channels*, sequence length): it treats the embedding dimension as the "channels", which is confusing here. Fortunately, this is easily solved by a `transpose`.

In [20]:
embeddings_for_conv = embeddings.transpose(1, 2)
embeddings_for_conv.shape

torch.Size([1, 5, 45])

In [21]:
output = conv(embeddings_for_conv)

In [22]:
output.shape

torch.Size([1, 5, 39])

In [23]:
time_trial(conv, embeddings_for_conv, concat_on_axis=2)

1 reps, shape = (1, 5, 45)
29 µs ± 774 ns per loop (mean ± std. dev. of 3 runs, 10000 loops each)
2 reps, shape = (1, 5, 90)
28.5 µs ± 305 ns per loop (mean ± std. dev. of 3 runs, 10000 loops each)
4 reps, shape = (1, 5, 180)
29.8 µs ± 528 ns per loop (mean ± std. dev. of 3 runs, 10000 loops each)
8 reps, shape = (1, 5, 360)
32.6 µs ± 255 ns per loop (mean ± std. dev. of 3 runs, 10000 loops each)
16 reps, shape = (1, 5, 720)
35.7 µs ± 354 ns per loop (mean ± std. dev. of 3 runs, 10000 loops each)


In [24]:
# Count trainable parameters with the same helper used for the other models.
num_parameters(conv)

180

## Transformer

In [25]:
# A single Transformer encoder layer sized like the other models: one attention head
# and a feed-forward width equal to the embedding size.
# batch_first=True: inputs are (batch, sequence, embedding), matching `embeddings`.
# NOTE(review): the call below passes no attention mask, so each position can presumably
# attend to later characters too -- confirm whether a causal mask is wanted for the
# autoregressive setup.
xformer = nn.TransformerEncoderLayer(
    d_model=emb_dim,
    nhead=1,
    dim_feedforward=emb_dim,
    batch_first=True)

In [26]:
output = xformer(embeddings)
output.shape

torch.Size([1, 45, 5])

In [29]:
time_trial(xformer, embeddings)

1 reps, shape = (1, 45, 5)
576 µs ± 7.28 µs per loop (mean ± std. dev. of 3 runs, 1000 loops each)
2 reps, shape = (1, 90, 5)
674 µs ± 6.94 µs per loop (mean ± std. dev. of 3 runs, 1000 loops each)
4 reps, shape = (1, 180, 5)
866 µs ± 45.3 µs per loop (mean ± std. dev. of 3 runs, 1000 loops each)
8 reps, shape = (1, 360, 5)
1.07 ms ± 24.6 µs per loop (mean ± std. dev. of 3 runs, 1000 loops each)
16 reps, shape = (1, 720, 5)
1.97 ms ± 109 µs per loop (mean ± std. dev. of 3 runs, 1000 loops each)


In [None]:
num_parameters(xformer)

200