In [1]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# hardware acceleration: prefer CUDA, then Apple's MPS backend, then fall back to the CPU
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)
print(device)

cuda


In [3]:
# hyperparameters
# NOTE: `device` is intentionally NOT redefined here. It is selected once in the
# hardware-acceleration cell above (CUDA -> MPS -> CPU); re-assigning it to a
# 'cuda'/'cpu' string in this cell would silently discard the MPS choice.
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 5000 # number of optimizer steps
eval_interval = 500 # evaluate train/val loss every this many steps
learning_rate = 3e-4
eval_iters = 200 # number of batches averaged per loss estimate
n_embd = 204 # embedding width; split evenly across the attention heads
n_head = 6 # attention heads per block
n_layer = 6 # number of transformer blocks
dropout = 0.2 # dropout probability used throughout the model

In [4]:
# get input: download the tiny-shakespeare corpus once and cache it as input.txt
if not os.path.exists('input.txt'):
    import requests
    response = requests.get(
        'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt',
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # fail loudly on an HTTP error instead of caching an error page as the dataset
    response.raise_for_status()
    # write as UTF-8 so the file matches the encoding used when it is read back
    with open('input.txt', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print('finished downloading input data')
else:
    print('already have input data')

already have input data


In [5]:
# load the whole corpus into memory as a single string
with open('input.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print('n_chars:', len(text))

n_chars: 1115394


In [6]:
# the vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(str().join(chars))
print('vocab_size:', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab_size: 65


In [7]:
# character <-> integer lookup tables built from the sorted vocabulary
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a list of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)

In [8]:
# turn the whole corpus into one 1-D tensor of integer token ids
data = torch.tensor(encode(text), dtype=torch.long)

# the first 90% of the tokens are for training, the remainder for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [9]:
# data loader
def get_batch(split):
    """Sample a random batch of (input, target) windows.

    `split` selects the training data when it equals 'train' and the validation
    data otherwise. Returns two (batch_size, block_size) tensors moved to
    `device`; the targets are the inputs shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # random start offsets, leaving room for a full window plus its shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])  # stack along dim 0
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# sanity check: draw one batch and show its shapes
xb, yb = get_batch('train')
print(xb.shape, yb.shape)

torch.Size([64, 128]) torch.Size([64, 128])


In [10]:
# estimate loss by taking an average loss over several batches
@torch.no_grad()
def estimate_loss(model, eval_iters=100):
    """Estimate the mean loss of `model` on the train and validation splits.

    Args:
        model: module called as ``model(X, Y)`` and returning ``(logits, loss)``.
        eval_iters: number of random batches averaged per split.

    Returns:
        dict with keys 'train' and 'val' mapping to the mean loss as a float.

    The model is switched to eval mode for the measurement and afterwards put
    back into whichever mode (train or eval) it was in when the function was
    called, even if evaluation raises.
    """
    out = {}
    was_training = model.training  # remember the caller's mode so we can restore it
    model.eval() # set model to eval phase
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean().item()
    finally:
        model.train(was_training) # restore the original phase
    return out

---
### Motivation for Self-Attention
In an *auto-regressive* model, we want to model each time step in terms of its previous time steps. However, we do not want each time step to be dependent on any future time step as our model is tasked with predicting future time steps. One way to model this relationship is by writing the channels of each time step as a linear combination, henceforth called *aggregation*, of the channels corresponding to the previous time steps. 

We can easily do this by multiplying each sample, which is a $[T, C]$ matrix, by a *lower triangular* weight matrix $\mathrm{L}[T, T]$ whose rows sum to $1$. Initially, the weightings will be uniform, so the aggregation is really just the mean of previous channels. However, the weightings will later be learned by the model. The matrix multiplication $\mathrm{L} \times x$ has the same dimension as $x$; however, each row of the result, corresponding to the channels of one time step, is now a weighted average of that time step's channels and the channels of the previous time steps.

eg: $$\begin{bmatrix}1.0 & 0.0 & 0.0 \\ 0.5 & 0.5 & 0.0 \\ 0.33 & 0.33 & 0.33\end{bmatrix} \times \begin{bmatrix}a_1 & a_2 \\ b_1 & b_2\\ c_1 & c_2\end{bmatrix}=\begin{bmatrix}a_1 & a_2 \\ 0.5a_1 + 0.5b_1 & 0.5a_2 + 0.5b_2 \\ 0.33a_1 + 0.33b_1 + 0.33c_1 & 0.33a_2 + 0.33b_2 + 0.33c_2\end{bmatrix}$$

Although this does model each time step as a linear combination of its previous time steps, it does not retain any knowledge of the sequence, which makes it not ideal at its current state. In general, an Attention mechanism does not retain any spatial information, which is why a position embedding must be used for auto-regressive models.

In [11]:
# Once the vocabulary is embedded, a batch xb becomes a [B, T, C] tensor:
#   B - number of samples in the batch
#   T - length of each sample, a time-varying sequence of tokens
#   C - length of the channel of information at each time step (depends on the embedding)
B, T, C = (4, 8, 2) # batch, time, channel

x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [12]:
# lower-triangular ones, then each row divided by its sum so every row adds up to 1
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
# xbow is short for "x bag-of-words", i.e. a model that disregards ordering
xbow = torch.matmul(wei, x)

In [13]:
# Same bag-of-words average, computed the way self-attention will compute it:
# mask the upper triangle with -inf, then let softmax turn each row into weights.
tril = torch.ones(T, T).tril()
wei = torch.ones((T, T)).masked_fill(tril == 0, float('-inf')) # upper triangle becomes -inf
wei = wei.softmax(dim=1) # exponentiate and normalize each row: same matrix as before
xbow = torch.matmul(wei, x)
print(xbow.shape)

torch.Size([4, 8, 2])


## Attention
The main idea behind attention is that the weights $W$ of the expression relating the tokens $X$ with one another, $W \times X$, should be learned by the model.
**Attention** does this by doing the following.
1. From each input token, whose count is denoted by `block_size`, `context_size`, or `T`, we create a *query* and *key* vector (we will use a simple linear layer, however there may be advantages to using more complex models to generate the key and query). The query vector, in abstract terms, describes a 'question' asked by that token. The key vector describes an 'answer'.
2. We then get *affinities* between two different tokens by taking a dot product between their query and key vector.
$$\bf w = \bf q \times \bf k^T$$
Of course this operation is vectorized for all possible pairs by matrix multiplication.

$$W = Q \times K^T$$
So $W$ is a $T\times T$ matrix representing the relationship between each token in the context. Larger values indicate two tokens whose query and key share some significant relationship.

However, we do not want past tokens interacting with future tokens, so we will use the lower triangular technique established earlier on $W$. eg: for the *first* token in the context, its only relationship is with itself; this will always be the case.

Then, we will softmax $W$ so that it becomes a *weighted* matrix, which we can aggregate with $X$. These values in $W$ are called **attention-scores** which tell us how much *attention* a token should be giving each token. 

However, we will not actually *attend* to the token embeddings directly. Rather, we will map the token embeddings to value vectors, which we will then attend to. There are several reasons to do this:
1. We may want to map the embedding vector to a lower dimension. This is especially useful for **multi-head attention**, which runs several heads in parallel whose results we then concatenate. If we were to use $6$ heads with no dimensionality reduction, then the output of such a layer will be `6*n_embd` which is too large. In the paper, $Q,K,V$ maps to a `n_embd//n_heads` space, so the computational cost is very similar to a single `n_embd` dimensional head.
2. A value *projection* allows each head to learn a perhaps more useful representation of each token for their own specific task.

$$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
where $d_k$ is the dimension of the key vectors. We additionally divide by $\sqrt{d_k}$, because for large values of $d_k$ the variance of $QK^T$ grows very large, pushing the softmax function into regions where the gradient is extremely small. 

#### Multi-Head Attention
Multi-head attention is beneficial because it allows each head to learn their own $Q,K,V$. So the information we are aggregating is more diverse than if we had used single head attention.

#### Attention Block
An attention block is a stack of layers starting with a multi-head attention layer followed by a multi-layer perceptron. 
First, each head in the multi-head layer performs self-attention, then the outputs are aggregated and learned by the MLP, which crucially provides non-linearity to the model. In abstract terms, the heads learn the relationship between tokens, and the MLP learns the meaning of such relationships with respect to the task.

#### Benefits of Attention
Attention is so effective because it does not favor any token more than the others, which is a problem of previous models for handling sequential data, like RNNs. Each self-attention block also processes the whole context in a single pass, while a recurrent layer must run $\mathcal O(n)$ sequential steps, where $n$ is the length of the context, `block_size`.

### Positional Encoding
Unlike convolutional layers and recurrent networks, attention blocks by themselves do not contain any information about the relative or absolute positions of each token, so we will add a *positional encoding* to the model. Hence, the model will also learn the positional relationship between different tokens in addition to their semantic  relationship learned by the *token embedding* layer. This layer is crucial for the transformer model. The positional encodings can be either learned or fixed. The original paper notes that both give similar results, but we will use *learned* encodings here.

### Encoder & Decoder Block
The attention block defined above is called a *decoder block*, because it masks the self-attention layer to prevent tokens from interacting with subsequent tokens. An attention block that allows all tokens to interact regardless of position is called an *encoder block*. 

The decoder block is named as such because it is auto-regressive. Thus its purpose is to decode the structure of a sequence so the model can predict the future of the sequence. On the other hand, an encoder block encodes the structure of the whole sequence to something more meaningful to the model, which can be used in something like sentiment analysis.

### Self-Attention & Cross-Attention
Self-Attention is when the query, keys, and values all come from the same source. Cross-Attention is when the key and values come from a different source from the query. So cross attention relates the tokens in one source to the tokens in another source. 

In the original paper, cross attention was used to translate between two languages. For example, the encoder would encode a sequence of French, then its output would be passed as key and value for the decoder block whose purpose was to predict English tokens. Here, the decoder block has contextual information from the previous English tokens in the predicted sequence, but it also has information from the *entire* French sequence. Usually special tokens like `<START>` and `<END>` are used to denote the start of a decode sequence and its end.

### Residual Connections
As the number of layers in a NN increase, the NN will often come across the **vanishing gradient problem**, which is when the gradients get progressively smaller as they are backpropagated through the network. In effect, the layers closer to the input begin to update very slowly or not at all, which prevents the network from learning effectively. This can happen for several reasons, 
1. Activation functions like $\tanh$ and sigmoid have derivatives that are very small in certain ranges (which is remedied by normalization layers or activation functions like ReLU).
2. For deeper networks, if the gradient of a node is less than $1$, then the gradient of the connected nodes in the previous layer shrinks. As the network grows deeper, the gradients will shrink exponentially towards zero.

**Residual connections** use skip connections to allow gradients to flow more directly through the network. A skip connection is created by simply adding the output at one point of the network to the output of a deeper layer, provided that they both have the same dimensions. Recall that the backpropagation of addition is just the gradient of the current layer, so even if the gradient of $x$ in the *main connection* vanishes, the gradient being propagated through the *residual connection* (the connection made through the addition) is still meaningful. 

There are many methods of laying out a *residual block*, but the one that works best in practice is to project the output of a block to the same dimension as $x$. Then we add $x$ to that projection, which establishes the residual connection. It may be beneficial to add a nonlinearity and/or norm layer before or after the addition. In general, any stack of layers with a residual connection from the input to the output is called a **residual block**.

### Regularization
#### LayerNorm
All activation functions are centered about the origin. We want our activations to remain stable so we want to have, at least initially, a mean at $0$ so the activation is not biased towards any particular region. Furthermore, we want unit variance, so the weights aren't taking on extreme values early on. 

Recall that BatchNorm normalizes (at initialization) the values along the batch dimension, e.g. if the mini-batch is $X=[B, C]$, then $BN(X)$ normalizes about the $0$ dimension. LayerNorm normalizes the values along the $1$ dimension. In a Transformer, this means that the distribution of each token is normalized at initialization.

LayerNorm is preferable over BatchNorm in a Transformer model.

#### Dropout
Dropout is a regularization technique that randomly sets a specified fraction of the nodes in a layer to zero on each training forward pass. The masked nodes do not learn during that step. However, the nodes that are still 'on' learn differently because of the mask. Abstractly, this forces the model to learn an ensemble of models which are then combined when no nodes are masked. This helps in preventing overfitting as it would take much longer for the weights to memorize the training dataset under such conditions. 

#### Early Stopping
Early stopping ends training early when the change in the validation loss is smaller than a certain $\delta$.

In [14]:
# Worked example: a single head of self-attention on random data.
B,T,C = 32, 8, 16 # batch, time, channels
x = torch.randn(B,T,C)

# A single Head performs self-attention
head_size = C // 1
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # [B, T, hs]
q = query(x) # [B, T, hs]
# scaled dot-product scores: the 1/sqrt(head_size) factor belongs on the scores,
# BEFORE the softmax, so that the softmax is not pushed into saturation
wei = q @ k.transpose(-2, -1) * head_size**-0.5 # [B,T,hs] @ [B,hs,T] = [B,T,T]

# causal mask: a token may only attend to itself and to earlier tokens
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1) # each row is now a distribution over visible tokens

v = value(x)
out = wei @ v # [B,T,T] @ [B,T,hs] = [B,T,hs]
print(out.shape)

torch.Size([32, 8, 16])


In [15]:
# attention weights of the first sample in the batch: row t is token t's weighting
# over itself and the earlier tokens (masked future positions come out as 0)
print(wei[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.7927, 0.2073, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1009, 0.1969, 0.7022, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0476, 0.1376, 0.1814, 0.6334, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1358, 0.5664, 0.0762, 0.0603, 0.1612, 0.0000, 0.0000, 0.0000],
        [0.0207, 0.2973, 0.2778, 0.2459, 0.0387, 0.1195, 0.0000, 0.0000],
        [0.1891, 0.1309, 0.0590, 0.1273, 0.3027, 0.0179, 0.1732, 0.0000],
        [0.2296, 0.0428, 0.0288, 0.0121, 0.0369, 0.1511, 0.0177, 0.4810]],
       grad_fn=<SelectBackward0>)


---
### Transformer Model
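The cells below assemble the pieces described above (single attention heads, multi-head attention, a feed-forward network, and residual blocks with LayerNorm) into a decoder-only GPT-style language model.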

In [16]:
class Head(nn.Module):
    """A single causally-masked self-attention head.

    Maps an input of shape (batch, time, n_embd) to an output of shape
    (batch, time, head_size).
    """

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections from the embedding width to the head width
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask stored as a buffer: saved with the module, not trained
        causal_mask = torch.ones(block_size, block_size).tril()
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, seq_len, _ = x.shape
        keys = self.key(x)       # (B,T,hs)
        queries = self.query(x)  # (B,T,hs)
        values = self.value(x)   # (B,T,hs)

        # attention scores ("affinities"), scaled by 1/sqrt(head size)
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale  # (B,T,hs) @ (B,hs,T) -> (B,T,T)

        # hide future positions, then normalise each row into attention weights
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))  # (B,T,T)
        weights = self.dropout(F.softmax(scores, dim=-1))      # (B,T,T)

        # weighted aggregation of the values
        return weights @ values  # (B,T,T) @ (B,T,hs) -> (B,T,hs)

In [17]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, concatenated and projected."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_list)
        # width of the concatenated head outputs (equals n_embd when built by Block)
        concat_width = head_size * num_heads
        self.proj = nn.Linear(concat_width, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)  # [B,T,num_heads*head_size]
        projected = self.proj(merged)         # [B,T,n_embd]
        return self.dropout(projected)

In [18]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

In [19]:
class Block(nn.Module):
    """Transformer block: self-attention (communication) then an MLP (computation).

    Both sub-layers are applied to a LayerNorm-ed input and added back onto
    their input through a residual connection.
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # attention sub-layer with its residual connection
        attended = self.sa(self.ln1(x))
        x = x + attended
        # feed-forward sub-layer with its residual connection
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [20]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Token and learned position embeddings are summed, passed through a stack
    of transformer blocks and a final LayerNorm, and mapped to next-token
    logits over the vocabulary.
    """

    def __init__(self):
        super().__init__()
        # learned embeddings: one vector per token id and one per position in the context
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B,T) tensor of integer token ids.
            targets: optional (B,T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B,T,vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy against targets.
        """
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # build the position indices on the same device as the input rather than
        # on a global, so the model works wherever its inputs and weights live
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building an autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# build the model and move its parameters to the selected device
model = GPTLanguageModel()
m = model.to(device)
# report the total number of parameters in the model
n_params = sum(p.numel() for p in m.parameters())
print(n_params, 'parameters')

3061697 parameters


In [21]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`: a module-level `iter` would shadow the builtin for
# every cell that runs afterwards in this kernel
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        # pass the configured number of evaluation batches explicitly; otherwise
        # the `eval_iters` hyperparameter is silently ignored in favour of the
        # function's own default
        losses = estimate_loss(m, eval_iters=eval_iters)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.1663, val loss 4.1663
step 500: train loss 1.9024, val loss 1.9957
step 1000: train loss 1.5639, val loss 1.7436
step 1500: train loss 1.4437, val loss 1.6480
step 2000: train loss 1.3807, val loss 1.5906
step 2500: train loss 1.3294, val loss 1.5590
step 3000: train loss 1.2984, val loss 1.5477
step 3500: train loss 1.2709, val loss 1.5186
step 4000: train loss 1.2490, val loss 1.5073
step 4500: train loss 1.2231, val loss 1.5017
step 4999: train loss 1.2046, val loss 1.4853


In [22]:
# generate from the model
m.eval()  # disable dropout while sampling
# start from a single token with id 0 as the context
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():  # sampling needs no gradients, so skip building the autograd graph
    generated = m.generate(context, max_new_tokens=500)
print(decode(generated[0].tolist()))
# put the model back into training mode; the trailing `;` hides the module repr
m.train();
# To keep a longer sample, write the decoded output of m.generate(...) to a file such as 'more.txt'.


HASTINGS:
Fairly servant, I live a world,
But Isabel, the brave mally her violent
But, therefore at his waffords. Lay his Gaunt's learn:
His own invade?

JULIET:
Nay, then all wilt our house.

FLORIZEL:
I'll we give a lineast:
Go; you are going to what the earl of you:
Discover me respected
In safety. You have been not his innumation care.

AUTOLYCUS:
There's words, why deven thy hearts and this?

CORIOLANUS:
O, O Bianca, I am the countympanished;
Yet his beaul against this give; so long much hi


### Pretraining
In the pretraining stage, we train the decoder using some dataset. The result is a random text generator that simply produces text resembling what it has learned from the dataset (which is what we did).

### Finetuning
The finetuning stage follows from the pretraining stage and its purpose is to align the model to whatever task we want it to do, eg: rewrite a paragraph in a different tone, answer questions, interact with user, etc. This stage requires labeled data and several stages of supervised learning.