In [56]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [57]:
# hardware acceleration
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)
print(device)

cuda


In [58]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 200
n_head = 6
n_layer = 4
dropout = 0.2

In [59]:
# get input
if not os.path.exists('input.txt'):
    import requests
    data = requests.get('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
    with open('input.txt', 'w') as f:
        f.write(data.text)
    print('finished downloading input data')
else:
    print('already have input data')

already have input data


In [60]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    print('n_chars:', len(text))

n_chars: 1115394


In [61]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print('vocab_size:', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab_size: 65


In [62]:
# create mapping from characters to integers
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # takes a string: outputs a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # takes a list of integers, output a string

In [63]:
# encode text
data = torch.tensor(encode(text), dtype=torch.long)

# create training and validation splits
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [64]:
# loads data
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix]) # stack along dim 0
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

xb, yb = get_batch('train')
print(xb.shape, yb.shape)

torch.Size([64, 128]) torch.Size([64, 128])


In [65]:
# estimate loss by taking an average loss over several batches
@torch.no_grad()
def estimate_loss(model, eval_iters=100):
    out = {}
    model.eval() # set model to eval phase
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train() # set model to train phase
    return out

---
### Precursor for Self-Attention
In an *auto-regressive* model, we want to model each time step in terms of its previous time steps. However, we do not want each time step to be dependent on any future time step as our model is tasked with predicting future time steps. One way to model this relationship is by writing the channels of each time step as a linear combination, henceforth called *aggregation*, of the channels corresponding to the previous time steps. 

We can easily do this by multiplying each sample, which is $[T, C]$ matrix, by a *lower triangular* weighted matrix $\mathrm{L}[T, T]$ whose rows sum to $1$. Initially, the weightings will be uniform, so the aggregation is really just the mean of previous channels. However, the weightings will later be learned by the model. The matrix multiplication $\mathrm{L} \times x$ has the same dimension as $x$, however each row of $x$ corresponding to the channels of each time step is now a weighted average of itself and the channels of the previous time steps.

eg: $$\begin{bmatrix}1.0 & 0.0 & 0.0 \\ 0.5 & 0.5 & 0.0 \\ 0.33 & 0.33 & 0.33\end{bmatrix} \times \begin{bmatrix}a_1 & a_2 \\ b_1 & b_2\\ c_1 & c_2\end{bmatrix}=\begin{bmatrix}a_1 & a_2 \\ 0.5a_1 + 0.5b_1 & 0.5a_2 + 0.5b_2 \\ 0.33a_1 + 0.33b_1 + 0.33c_1 & 0.33a_2 + 0.33b_2 + 0.33c_2\end{bmatrix}$$

Although this does model each time step as a linear combination of its previous time steps, it does not retain any knowledge of the sequence, which makes it not ideal at its current state. In general, an Attention mechanism does not retain any spatial information, which is why a position embedding must be used for auto-regressive models.

In [66]:
# when we encode our vocabulary, our batch samples xb become a [B,T,C] tensor
# where each sample is in a batch of size B containing.
# each sample is a time varying sequence of length T (tokens)
# and each time step contains a channel of information with length C (where C is dependent on the embedding)
B, T, C = 4, 8, 2 # batch, time, channel

x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [67]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow = wei @ x # xbow short for x bag-of-words which refers to a model that disregards ordering

In [68]:
# Alternative calculation for x bag-of-words which will be useful for self-attention
tril = torch.tril(torch.ones(T, T))
wei = torch.ones((T, T))
wei = wei.masked_fill(tril == 0, float('-inf')) # fills upper triangle with -inf
wei = F.softmax(wei, dim=1) # exponentiate and normalize which will results in the same matrix as before
xbow = wei @ x
print(xbow.shape)

torch.Size([4, 8, 2])


### Attention
The main idea behind attention is that the weights $W$ of the expression relating the tokens $X$ with one another, $W \times X$, should be learned by the model.
**Attention** does this by doing the following.
1. From each input token, whose count is denoted by `block_size`, `context_size`, or `T`, we create a *query* and *key* vector (we will use a simple linear layer, however there may be advantages to using more complex models to generate the key and query). The query vector, in abstract terms, describe a 'question' asked by that token. The key vector describes an 'answer'.
2. We then get *affinities* between two different tokens by taking a dot product between their query and key vector.
$$\bf w = \bf q \times \bf k^T$$
Of course this operation is vectorized for all possible pairs by matrix multiplication.

$$W = Q \times K^T$$
So $W$ represents the relationship between each token in the context. Larger values indicate two tokens whose query and key share some significant relationship.

However, we do not want past tokens interacting with future tokens, so we will use the lower triangular technique established earlier on $W$. So for the *first* token in the context, its only relationship is with itself; this will always be the case.

Then, we will softmax $W$ so that it becomes a *weighted* matrix, which we can aggregate with $X$. These values in $W$ are called **attention-scores** which tell us how much *attention* a token should be giving each token. 

However, we will not actually *attend* to the token embeddings directly. Rather, we will map the token embeddings to value vectors, which we will then attend to. There are several reasons to do this:
1. We may want to map the embedding vector to a lower dimension. This is especially useful for **multi-head attention**, which runs several heads in parallel whose results we then concatenate. So if we use $6$ heads with no dimensionality reduction, then the output of such a layer will be `6*n_embd` which is too large. In the paper, $Q,K,V$ maps to a `n_embd//n_heads` space.
2. A value *projection* allows each head to learn a perhpas more useful representation of each token for their own specific task.

$$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
where $d_k$ is the dimension of the key vectors. We additionally divide by $\sqrt{d_k}$, because for large values of $d_k$ the variance of $QK^T$ grows very large, pushing the softmax function into regions where the gradient is extremely small. 

Attention is so effective because it does not favor any token more than the other, which is a problem of previous models for sequential data, like RNN.

In [69]:
#B, T ,C = batch_size, block_size, vocab_size
B,T,C = 32, 8, 16
x = torch.randn(B,T,C)

# A single Head perform self-attention 
head_size = C // 1
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # [B, T, hs]
q = query(x) # [B, T, hs]
wei = q @ k.transpose(-2, -1) # [B,T,hs] @ [B,hs,T] = [B,T,T]

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf')) 
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v * (head_size**-0.5) # [B,T,T] @ [B,T,hs] = [B,T,hs]
print(out.shape)

torch.Size([32, 8, 16])
torch.Size([32, 8, 16])


In [55]:
print(wei[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2858, 0.7142, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8368, 0.1143, 0.0489, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0493, 0.3293, 0.3197, 0.3017, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1742, 0.3353, 0.0838, 0.2910, 0.1157, 0.0000, 0.0000, 0.0000],
        [0.0025, 0.0048, 0.9139, 0.0244, 0.0172, 0.0373, 0.0000, 0.0000],
        [0.0492, 0.0300, 0.0917, 0.0390, 0.2185, 0.3188, 0.2528, 0.0000],
        [0.5824, 0.0853, 0.0200, 0.0322, 0.0369, 0.1833, 0.0238, 0.0361]],
       grad_fn=<SelectBackward0>)


---
### Transformer Model
m

In [15]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [16]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd) # head_size * num_heads = n_embds SEE: Block
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # [B,T,n_embds]
        out = self.dropout(self.proj(out))
        return out

In [17]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [18]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [19]:
class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters()), 'parameters')

1973665 parameters


In [20]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(m)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.2057, val loss 4.2089
step 500: train loss 1.9105, val loss 2.0045
step 1000: train loss 1.5925, val loss 1.7777
step 1500: train loss 1.4796, val loss 1.6743
step 2000: train loss 1.4117, val loss 1.6086
step 2500: train loss 1.3694, val loss 1.5855
step 3000: train loss 1.3422, val loss 1.5666
step 3500: train loss 1.3110, val loss 1.5522
step 4000: train loss 1.2933, val loss 1.5334
step 4500: train loss 1.2699, val loss 1.5194
step 4999: train loss 1.2626, val loss 1.5138


In [22]:
# generate from the model
m.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
m.train();
#open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens


ive. Come, being to strange all the Lord.
My foe worship, sweet fledly way my polach
Master here baseness breath out where art what thou hast nothing.

BENVOLIO:
Lords, I do yet her; consul. Why, so I will lay a bawd
tender of court? sir, the garlor penfrance,
Which it is rankness to make upon my cousin
Of this gracious belove to love were senself.

Post:
O fear
Whom way over
I meet you not another?

First Servingman:
No, within a peace fortune liqual
With a dest one with that I'll any sake them
