# NanoGPT walkthrough

This notebook follows the [Let's build GPT: from scratch, in code, spelled out](https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=8) video.

## Get dataset

In [7]:
# Read the whole dataset file into a single string
with open('dataset/emma.txt', encoding='utf-8') as dataset_file:
    text = dataset_file.read()

In [8]:
# Report how many characters the dataset contains
num_chars = len(text)
print(f"Dataset length (number of characters): {num_chars}")

Dataset length (number of characters): 880425


## Encoder/decoder

In [10]:
# The vocabulary is the sorted set of unique characters in the dataset
chars = sorted(set(text))
vocab_size = len(chars)

vocab_string = ''.join(chars)
print(f"Vocab: {vocab_string}")
print(f"Vocab size: {vocab_size}")

Vocab: 
 !&(),-.01234678:;?ABCDEFGHIJKLMNOPQRSTUVWXY[]_abcdefghijklmnopqrstuvwxyzàéêï—‘’“”
Vocab size: 83


In [11]:
# Lookup tables between characters and their integer ids
# (the id of a character is its index in the sorted vocabulary)
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Turn a string into the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a list of integer ids back into a string."""
    return ''.join(itos[i] for i in l)


print(encode("jane austen"))

[57, 48, 61, 52, 1, 48, 68, 66, 67, 52, 61]


## Encode dataset

In [14]:
# Encode the entire dataset and store it as a 1-D tensor of 64-bit integer ids
import torch
encoded_text = encode(text)
data = torch.tensor(encoded_text, dtype=torch.int64)
print(data.shape, data.dtype)

torch.Size([880425]) torch.int64


## Split dataset into `train` and `validation` sets

In [17]:
# Split point: the first 90% of the ids go to `train`, the remainder to `val`
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

print(f"Dataset size: {len(data)}")
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")

Dataset size: 880425
Training set size: 792382
Validation set size: 88043


## Context window

In [18]:
# Context length: number of tokens in each input sequence
block_size = 8

In [19]:
# A chunk of block_size + 1 = 9 characters yields 8 training examples:
# every prefix of the first 8 characters is a context, and the character
# right after that prefix is its target.

# Example:
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"Input {context} ==> Output {target}")

Input tensor([24]) ==> Output 60
Input tensor([24, 60]) ==> Output 60
Input tensor([24, 60, 60]) ==> Output 48
Input tensor([24, 60, 60, 48]) ==> Output 0
Input tensor([24, 60, 60, 48,  0]) ==> Output 0
Input tensor([24, 60, 60, 48,  0,  0]) ==> Output 49
Input tensor([24, 60, 60, 48,  0,  0, 49]) ==> Output 72
Input tensor([24, 60, 60, 48,  0,  0, 49, 72]) ==> Output 1


## Batching

In [38]:
# Number of independent sequences processed in parallel
batch_size = 4

def get_batch(split='train', batch_size=4, block_size=8):
    """
    Sample a random mini-batch of input sequences and their targets.

    Args:
        split: 'train' reads from `train_data`; any other value reads
            from `val_data`.
        batch_size: number of sequences in the batch.
        block_size: length of each sequence.

    Returns:
        A tuple (x, y) of tensors with shape (batch_size, block_size);
        y is x shifted one position to the right in the source data.
    """
    source = train_data if split == 'train' else val_data

    # Random start offsets, one per sequence; the upper bound leaves room
    # for the target window, which extends one element past the input window
    upper_bound = len(source) - block_size
    starts = torch.randint(upper_bound, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs), torch.stack(targets)

# Draw one example batch from the training split and show it
xx, yy = get_batch('train')

print("=== Inputs ===")
print(f"Shape: {xx.shape}")
print(xx)

print("=== Targets ===")
print(f"Shape: {yy.shape}")
print(yy)
print()

print('=== What this means ===')

# Spell out every (context, target) pair contained in the batch
for b in range(batch_size): # batch dimension
    row_inputs, row_targets = xx[b], yy[b]
    for t in range(block_size): # time dimension
        context = row_inputs[:t+1]
        target = row_targets[t]
        print(f"  Input {context.tolist()} ==> target {target}")

=== Inputs ===
Shape: torch.Size([4, 8])
tensor([[66,  1, 48, 61, 72, 70, 55, 52],
        [72,  1, 48, 66,  1, 66, 55, 52],
        [ 1, 81, 28,  1, 51, 62,  1, 61],
        [48, 56, 66, 52,  1, 49, 52, 72]])
=== Targets ===
Shape: torch.Size([4, 8])
tensor([[ 1, 48, 61, 72, 70, 55, 52, 65],
        [ 1, 48, 66,  1, 66, 55, 52,  1],
        [81, 28,  1, 51, 62,  1, 61, 62],
        [56, 66, 52,  1, 49, 52, 72, 62]])

=== What this means ===
  Input [66] ==> target 1
  Input [66, 1] ==> target 48
  Input [66, 1, 48] ==> target 61
  Input [66, 1, 48, 61] ==> target 72
  Input [66, 1, 48, 61, 72] ==> target 70
  Input [66, 1, 48, 61, 72, 70] ==> target 55
  Input [66, 1, 48, 61, 72, 70, 55] ==> target 52
  Input [66, 1, 48, 61, 72, 70, 55, 52] ==> target 65
  Input [72] ==> target 1
  Input [72, 1] ==> target 48
  Input [72, 1, 48] ==> target 66
  Input [72, 1, 48, 66] ==> target 1
  Input [72, 1, 48, 66, 1] ==> target 66
  Input [72, 1, 48, 66, 1, 66] ==> target 55
  Input [72, 1, 48, 6

## Bigram model

In [147]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [148]:
# Naming conventions for tensor dimensions used in the model code
print(f"n = vocab_size = {vocab_size}")
print(f"B = batch_size = {batch_size} = how many independent sequences are being processed at once")
print("T = time = length of the running sequence")
print(f"C = channel = {vocab_size} = size of the feature vector at each position = embedding dimension")
print("Right now C = vocab_size")

n = vocab_size = 83
B = batch_size = 32 = how many independent sequences are being processed at once
T = time = length of the running sequence
C = channel = 83 = size of the feature vector at each position = embedding dimension
Right now C = vocab_size


In [149]:
class BigramLanguageModel(nn.Module):
    """
    Bigram language model: the logits for the next token are looked up
    directly from the current token.
    """
    def __init__(self, vocab_size):
        super().__init__()
        # (n,n) lookup table. Row i holds the next-token logits for token i,
        # so this "embedding" carries no separate semantic representation.
        # nn.Embedding starts out with random values.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, indices, targets=None):
        """
        Compute next-token logits and, when `targets` is given, the loss.

        `indices` and `targets` are (B,T) integer tensors. Without targets
        the result is ((B,T,C) logits, None); with targets it is
        ((B*T,C) logits, cross-entropy loss).
        """
        # One row of logits per input token => (B,T,C)
        logits = self.token_embedding_table(indices)

        if targets is None:
            return logits, None

        # Flatten to B*T feature vectors of length C with B*T matching
        # targets, which is the layout cross_entropy is given here
        batch, time, channels = logits.shape
        flat_logits = logits.view(batch * time, channels)
        flat_targets = targets.view(batch * time)

        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, indices, max_new_tokens):
        """
        Extend the (B,T) context `indices` by `max_new_tokens` sampled
        tokens and return the resulting (B,T+max_new_tokens) tensor.
        """
        for _ in range(max_new_tokens):
            # Forward pass without targets => (B,T,C) logits
            logits, _loss = self(indices)

            # Only the last time step is needed to pick the next token
            last_logits = logits[:, -1, :] # (B,C)

            # Turn logits into probabilities and sample one token per row
            probs = F.softmax(last_logits, dim=-1) # (B,C)
            next_idx = torch.multinomial(probs, num_samples=1) # (B,1)

            # Grow the running context by the sampled token
            indices = torch.cat((indices, next_idx), dim=1) # (B,T+1)

        return indices

### Run the model now without training

In [150]:
# Example:
# Build a fresh (untrained) model and look at its loss and its output
m = BigramLanguageModel(vocab_size)

# xx and yy are a batch drawn from the training set
logits, loss = m(xx, yy)
print("Logits shape:", logits.shape)
print("Loss:", loss)

# Generate some output, starting from a single token with id 0
start_context = torch.zeros((1, 1), dtype=torch.long)
gen = m.generate(indices=start_context, max_new_tokens=100)
print("Generated:")
generated_ids = gen[0].tolist()
print(decode(generated_ids))

Logits shape: torch.Size([256, 83])
Loss: tensor(5.1565, grad_fn=<NllLossBackward0>)
Generated:

 p,]]4AzipzpàTBv?(1fPo1 4NW0ORnslK[NH;—Fef
gfo3DIKwq x&LQSCPmGqvfT,;Yh’CrB)OFTJ&W!éd:YV”MT681B’z1“zE


### Train the model

In [151]:
# Start from a freshly initialized model
m = BigramLanguageModel(vocab_size)

# AdamW optimizer over all model parameters
model_params = m.parameters()
optimizer = torch.optim.AdamW(model_params, lr=1e-3)

In [152]:
# Train with a larger batch than the one used in the examples
batch_size = 32

# Number of optimization steps, and how often the loss is reported
iterations = 50000
print_interval = 5000

for step in range(iterations):
    # Draw a fresh random batch from the training split
    xx, yy = get_batch('train', batch_size)

    # Forward pass, then clear old gradients, backpropagate and update
    logits, loss = m(xx, yy)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report on the first step, the last step and every `print_interval` steps
    is_first = step == 0
    is_last = step == iterations - 1
    at_interval = (step + 1) % print_interval == 0
    if is_first or is_last or at_interval:
        print(f"Loss at step {step+1}: {loss.item()}")

Loss at step 1: 4.812380313873291
Loss at step 5000: 2.63284969329834
Loss at step 10000: 2.4653244018554688
Loss at step 15000: 2.539247989654541
Loss at step 20000: 2.39619779586792
Loss at step 25000: 2.4903721809387207
Loss at step 30000: 2.4283287525177
Loss at step 35000: 2.438443183898926
Loss at step 40000: 2.460904121398926
Loss at step 45000: 2.4296138286590576
Loss at step 50000: 2.4623610973358154


### Generate some text

In [153]:
# Sample 200 new tokens from the trained model, starting from token id 0
start_context = torch.zeros((1, 1), dtype=torch.long)
gen = m.generate(indices=start_context, max_new_tokens=200)
print("Generated:")
generated_ids = gen[0].tolist()
print(decode(generated_ids))

Generated:

bo he atht Qu mmanine, at Mitr
Mr bico er ol, ry I am ashetun mofesimige he frionelk f ciopind, wis t s
asulle, e bed avequra cow se it s. Handy.”
Perejey. ther. dfalth ved, pecom thisint t grn
cimu w


## Introducing self-attention

We would like the tokens to start talking to each other.

Information only flows from previous context into the future. A token cannot talk to a future token.

In [154]:
# Toy example: a random tensor with (batch, time, channels) dimensions
B, T, C = 4, 8, 2

x = torch.randn((B, T, C))
print(x.shape)
print(x[0])

torch.Size([4, 8, 2])
tensor([[-0.5892,  0.3504],
        [-1.8511,  1.7745],
        [ 0.2523,  0.7422],
        [-0.3869,  0.0273],
        [ 1.0037, -2.0112],
        [-0.0021,  0.2303],
        [-1.5618, -0.1691],
        [ 0.5601,  2.0332]])


### Self-attention by taking the average

In [155]:
# Simplest way for tokens to communicate: each position takes the *average*
# of itself and all previous positions,
# i.e. xbow[b,t] = mean_{i<=t} x[b,i]

# xbow = x "bag of words";
# "bag of words" here just means the tokens are averaged together

xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        # x[b, :t+1] is (t+1,C); averaging along the time dimension => (C,)
        xbow[b, t] = x[b, :t+1].mean(dim=0)

print(xbow.shape)
print(xbow[0])

torch.Size([4, 8, 2])
tensor([[-0.5892,  0.3504],
        [-1.2201,  1.0625],
        [-0.7293,  0.9557],
        [-0.6437,  0.7236],
        [-0.3142,  0.1767],
        [-0.2622,  0.1856],
        [-0.4479,  0.1349],
        [-0.3219,  0.3722]])


### Trick using matrix multiplication

We can use matrix multiplication with a `wei` array to achieve the same effect of taking the average of all previous tokens.

In [156]:
# Lower-triangular matrix of ones: row t selects positions 0..t
wei = torch.ones(T, T).tril()
wei

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [157]:
# Normalize each row so that it sums to 1 (row t becomes a uniform average
# over positions 0..t)
row_sums = wei.sum(dim=1, keepdim=True)
wei = wei / row_sums
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [158]:
# Note on wei @ x:
# - wei is (T,T) but x is (B,T,C)
# - matrix multiplication broadcasts wei over the batch dimension => (B,T,T)
# - the result is (B,T,C)
#
# xbow2 will be identical to xbow
xbow2 = torch.matmul(wei, x)
xbow2[0]

tensor([[-0.5892,  0.3504],
        [-1.2201,  1.0625],
        [-0.7293,  0.9557],
        [-0.6437,  0.7236],
        [-0.3142,  0.1767],
        [-0.2622,  0.1856],
        [-0.4479,  0.1349],
        [-0.3219,  0.3722]])

### Another way by using Softmax

In [159]:
# Start from a (T,T) matrix of zeros
wei = torch.zeros(T, T)
wei

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [160]:
# Lower-triangular mask: 1 where a position may be seen, 0 for the future
tril = torch.ones(T, T).tril()
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [161]:
# Replace every entry above the diagonal (the future positions) with -inf
future_mask = tril == 0
wei = wei.masked_fill(future_mask, float('-inf'))
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [162]:
# Softmax along each row: -inf entries become 0, the rest become uniform
wei = torch.softmax(wei, dim=-1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [163]:
# xbow3 should be identical to xbow2 and xbow
xbow3 = torch.matmul(wei, x)
xbow3[0]

tensor([[-0.5892,  0.3504],
        [-1.2201,  1.0625],
        [-0.7293,  0.9557],
        [-0.6437,  0.7236],
        [-0.3142,  0.1767],
        [-0.2622,  0.1856],
        [-0.4479,  0.1349],
        [-0.3219,  0.3722]])

**Note:**

We will use softmax because when we do `wei = wei.masked_fill(tril == 0, float('-inf'))`, we can treat `-inf` as saying "these future tokens have no effect on the current token": softmax turns every `-inf` entry into a weight of exactly 0. By extension, the values before `-inf` don't all have to be 0 - these tokens can start talking to each other and take on different weights => self-attention!

### Self-attention!

Instead of just taking the average, we let tokens talk to each other.

In [164]:
# Same toy setup as before, now with 32 channels per token
B, T, C = 4, 8, 32
x = torch.randn((B, T, C))

In [165]:
# Previously we did
# wei = torch.zeros((T,T))

# But now we don't want this to be all uniform; instead,
# we want to be able to gather info from the past.

# Every single token at each position will emit 2 vectors: query + key.
# - Query: "what am I looking for"
# - Key: "what do I contain"
# We use a dot product to get the affinity between tokens,
# i.e. "my query" · "your key" => this becomes wei

In [166]:
#
# A single head performing self-attention
#

head_size = 16

# Three bias-free linear projections from C channels down to head_size
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# Every token emits a key, a query and a value:
# - k: "here's what I have"
# - q: "here's what I'm interested in"
# - v: "if you find me interesting, here's what I will communicate to you"
k = key(x)   # (B,T,head_size)
q = query(x) # (B,T,head_size)
v = value(x) # (B,T,head_size)

In [167]:
# "Affinity" between tokens: dot product of every query with every key.
# The last 2 dimensions of k are transposed so that
# (B,T,head_size) @ (B,head_size,T) => (B,T,T)
k_transposed = k.transpose(-2, -1)
wei = torch.matmul(q, k_transposed)
wei[0]

tensor([[ 0.1468, -0.4214,  0.6370,  0.5305,  0.2232, -0.2876, -0.0923, -0.0155],
        [ 2.0289,  2.2027,  0.8110,  2.9605,  1.2613,  0.6308, -0.1368,  0.8316],
        [-0.7470,  0.8219,  1.7294, -0.5071,  0.6228, -0.9951, -0.3489, -0.4836],
        [ 0.8390,  1.4731,  1.0499,  1.2938, -0.7957, -0.4047, -0.7720,  0.2426],
        [-0.4214,  0.5541,  1.0076,  0.1041, -0.1491, -1.3850, -0.5942,  0.1684],
        [ 0.7089, -4.6129, -2.5074,  1.1034,  1.2286,  0.2548,  0.6364,  1.7539],
        [ 1.4424, -1.4872, -0.2297,  0.4801,  2.0341,  0.0399, -0.1356,  0.9952],
        [ 0.1095,  2.0939,  0.8306,  1.3074, -1.3823, -1.1200, -0.0429,  0.2455]],
       grad_fn=<SelectBackward0>)

In [168]:
# Hide the future positions with -inf, then normalize each row with softmax
tril = torch.ones(T, T).tril()
masked_scores = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(masked_scores, dim=-1)
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4567, 0.5433, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0565, 0.2713, 0.6722, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1756, 0.3310, 0.2168, 0.2767, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0923, 0.2449, 0.3854, 0.1562, 0.1212, 0.0000, 0.0000, 0.0000],
        [0.2064, 0.0010, 0.0083, 0.3062, 0.3470, 0.1311, 0.0000, 0.0000],
        [0.2575, 0.0138, 0.0484, 0.0984, 0.4654, 0.0634, 0.0532, 0.0000],
        [0.0619, 0.4500, 0.1272, 0.2049, 0.0139, 0.0181, 0.0531, 0.0709]],
       grad_fn=<SelectBackward0>)

In [169]:
# Aggregate the values `v` with the attention weights,
# rather than aggregating the raw input (out = wei @ x)
out = torch.matmul(wei, v)
out.shape

torch.Size([4, 8, 16])

**Notes:**
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- Each example across batch dimension is processed completely independently.
- What we have here is a "decoder" attention block because it has triangular masking; this is usually used in autoregressive settings, like language modeling. There is also an "encoder" attention block, which allows all tokens to communicate and is used in situations like sentiment analysis. To get an "encoder" block, just remove the line that applies the `tril` mask.
- "Self-attention" just means that the keys and values are produced from the same source as queries (i.e. `x` in our case). In "cross-attention", the queries still get produced from `x`, but the keys and values come from some other external source (e.g. an encoder module).

### Scaled self-attention

In "scaled" self-attention, we further multiply `wei` by **1/sqrt(head_size)** (i.e. divide it by `sqrt(head_size)`). This way, when the inputs Q and K have unit variance, `wei` has unit variance too. This keeps softmax diffuse so that it does not saturate too much (i.e. does not converge towards a one-hot vector).

In [170]:
# Without scaling: unit-variance k and q give scores whose variance is
# roughly head_size
k = torch.randn((B, T, head_size))
q = torch.randn((B, T, head_size))
wei = torch.matmul(q, k.transpose(-2, -1))

# Printed in order: var(k), var(q), var(wei) ~ head_size
for tensor in (k, q, wei):
    print(tensor.var())

tensor(1.0119)
tensor(1.0122)
tensor(15.9142)


In [171]:
# With scaling by 1/sqrt(head_size): the scores have variance of roughly 1
k = torch.randn((B, T, head_size))
q = torch.randn((B, T, head_size))
scale = head_size**-0.5
wei = (q @ k.transpose(-2, -1)) * scale

# Printed in order: var(k), var(q), var(wei) ~ 1
for tensor in (k, q, wei):
    print(tensor.var())

tensor(1.0559)
tensor(0.9742)
tensor(1.1256)


In [172]:
# Why is low variance good?
example_logits = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

# low variance: the softmax output stays diffuse
print(torch.softmax(example_logits, dim=-1))

# high variance (same values scaled up by 8):
# the softmax output gets too peaky and converges towards one-hot
print(torch.softmax(example_logits * 8, dim=-1))

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


## Add self-attention to `BigramLanguageModel`

### Add self-attention module

In [173]:
class Head(nn.Module):
    """
    One head of causal (masked) self-attention.

    Projects the input to keys, queries and values of size `head_size`,
    computes scaled dot-product attention weights with future positions
    masked out, and returns the weighted sum of the values.

    Reads the notebook-level `n_embd` (input channel count) and
    `block_size` (maximum sequence length) when constructed.

    Args:
        head_size: size of the key/query/value vectors, and therefore of
            the output's last dimension.
    """
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # `tril` is a "buffer", i.e. it's not a parameter of the module,
        # so it is registered with register_buffer instead.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """
        Apply masked self-attention to `x` of shape (B,T,n_embd), with
        T <= block_size, and return a (B,T,head_size) tensor.
        """
        T = x.shape[1]
        k = self.key(x)   # (B,T,head_size)
        q = self.query(x) # (B,T,head_size)

        # Compute attention scores ("affinities").
        # The scale must use the size of the vectors being dotted together
        # (head_size), not the input channel count, so that unit-variance
        # q and k give unit-variance scores.
        head_size = k.shape[-1]
        wei = q @ k.transpose(-2,-1) * head_size**-0.5 # (B,T,hs) @ (B,hs,T) -> (B,T,T)

        # Mask out the future, then normalize each row into weights
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)
        wei = F.softmax(wei, dim=-1) # (B,T,T)

        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,head_size)
        out = wei @ v     # (B,T,T) @ (B,T,hs) -> (B,T,hs)
        return out

### Updating BigramLanguageModel

In [174]:
# Introduce new variable: number of embedding dimensions,
# i.e. the length of the vector each token and each position is mapped to
n_embd = 32

In [175]:
# Naming conventions for tensor dimensions, updated for `n_embd`
print(f"n = vocab_size = {vocab_size}")
print(f"B = batch_size = {batch_size} = how many independent sequences are being processed at once")
print("T = time = length of the running sequence")
print(f"C = channel = {n_embd} = size of the feature vector at each position = embedding dimension")
print("** C will no longer be equal to vocab_size; it will be n_embd instead **")
print(f"n_embd = {n_embd} = number of embedding dimensions")

n = vocab_size = 83
B = batch_size = 32 = how many independent sequences are being processed at once
T = time = length of the running sequence
C = channel = 32 = size of the feature vector at each position = embedding dimension
** C will no longer be equal to vocab_size; it will be n_embd instead **
n_embd = 32 = number of embedding dimensions


In [179]:
class BigramLanguageModel(nn.Module):
    """
    Language model with token + position embeddings followed by a single
    self-attention head and a linear layer that produces the logits.

    Reads the notebook-level `n_embd`, `block_size` and `Head`.
    """
    def __init__(self, vocab_size):
        super().__init__()
        # Create "embedding" table.
        # - Maps each token in the vocabulary to an embedding of dimension `n_embd`
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)

        # New: add position embedding table
        # - Each position in the block gets its own embedding vector
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # New: add linear layer between embeddings (of dimension `n_embd`)
        # and the logits (dimension `vocab_size`)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # New: self-attention head (head_size == n_embd, so its output can
        # be fed straight into `lm_head`)
        self.sa_head = Head(n_embd)

    def forward(self, indices, targets=None):
        """
        Compute next-token logits and, when `targets` is given, the loss.

        `indices` and `targets` are (B,T) integer tensors with
        T <= block_size. Without targets the result is
        ((B,T,vocab_size) logits, None); with targets it is
        ((B*T,vocab_size) logits, cross-entropy loss).
        """
        B, T = indices.shape

        # For each idx in `indices`, we need to fetch its corresponding logits:

        # (1) New: for each idx in `indices`, we first fetch its embedding
        token_emb = self.token_embedding_table(indices) # (B,T,n_embd)

        # (2) New: create the position embedding for each position in the block.
        # The positions are created on the same device as `indices`, so the
        # lookup also works when the model and its inputs are not on the CPU.
        positions = torch.arange(T, device=indices.device)
        pos_emb = self.position_embedding_table(positions) # (T,n_embd)

        # (3) New: add the token embedding to position embedding
        # - this basically means we add the embedding for each idx in `indices` to
        #   the position embedding for its position in the block
        # - note the dimension broadcasting here: (B,T,n_embd) + (T,n_embd)
        x = token_emb + pos_emb  # (B,T,n_embd)

        # (4) New: apply one head of self-attention
        x = self.sa_head(x)      # (B,T,n_embd)

        # (5) New: we then fetch the logits using the lm_head layer
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # Here C is the size of the last dimension of `logits` (vocab_size)
            B, T, C = logits.shape

            # We want to flatten `logits` so that we have a total of B*T
            # feature vectors of length C.
            logits = logits.view(B*T, C)

            # Also flatten `targets` so that it contains B*T target outputs
            # for each of the feature vectors in `logits`.
            targets = targets.view(B*T)

            # Compute loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, indices, max_new_tokens):
        """
        Extend the (B,T) context `indices` by `max_new_tokens` sampled
        tokens and return the resulting (B,T+max_new_tokens) tensor.
        """
        for _ in range(max_new_tokens):
            # New: we need to crop the context to the last `block_size`
            # tokens; otherwise it won't fit into our position_embedding_table
            indices_cropped = indices[:, -block_size:]

            # Get predictions;
            # `logits` is (B,T,vocab_size)
            logits, loss = self(indices_cropped) # calls forward()

            # `logits` contains the logits for every index in `indices_cropped`,
            # but we actually only need the last time step in each batch
            logits = logits[:, -1, :] # becomes (B,vocab_size)

            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B,vocab_size)

            # Sample from the probability distribution
            next_idx = torch.multinomial(probs, num_samples=1) # (B,1)

            # Append sampled index to the context for each batch
            indices = torch.cat((indices, next_idx), dim=1) # (B,T+1)

        return indices

### Train the model

In [180]:
# Start from a freshly initialized model (now with one self-attention head)
m = BigramLanguageModel(vocab_size)

# AdamW optimizer over all model parameters
model_params = m.parameters()
optimizer = torch.optim.AdamW(model_params, lr=1e-3)

In [181]:
# Number of sequences per training batch
batch_size = 32

# Number of optimization steps, and how often the loss is reported
iterations = 50000
print_interval = 5000

for step in range(iterations):
    # Draw a fresh random batch from the training split
    xx, yy = get_batch('train', batch_size)

    # Forward pass, then clear old gradients, backpropagate and update
    logits, loss = m(xx, yy)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report on the first step, the last step and every `print_interval` steps
    is_first = step == 0
    is_last = step == iterations - 1
    at_interval = (step + 1) % print_interval == 0
    if is_first or is_last or at_interval:
        print(f"Loss at step {step+1}: {loss.item()}")

Loss at step 1: 4.578811168670654
Loss at step 5000: 2.384617567062378
Loss at step 10000: 2.3340530395507812
Loss at step 15000: 2.326909065246582
Loss at step 20000: 2.2604215145111084
Loss at step 25000: 2.27081561088562
Loss at step 30000: 2.4316458702087402
Loss at step 35000: 2.3086352348327637
Loss at step 40000: 2.2617619037628174
Loss at step 45000: 2.273874044418335
Loss at step 50000: 2.303931474685669


### Generate some text

In [182]:
# Sample 200 new tokens from the trained model, starting from token id 0
start_context = torch.zeros((1, 1), dtype=torch.long)
gen = m.generate(indices=start_context, max_new_tokens=200)
print("Generated:")
generated_ids = gen[0].tolist()
print(decode(generated_ids))

Generated:

“No, at tud,” and, was, as ur onok yon ta thearsoret ngt wanto was to bus wer, bellid wingth Jato ala sen mof
med laditoe beed. Weten. “Hairse
seak. 
buse marsurse
ssobyifitudm
t
uchas the serfir hill


## Adding multi-head self-attention

In [183]:
class MultiHeadAttention(nn.Module):
    """
    Several self-attention heads run side by side on the same input.

    Args:
        num_heads: how many `Head` modules to create.
        head_size: the `head_size` passed to every `Head`.
    """
    def __init__(self, num_heads, head_size):
        super().__init__()
        # Build the heads one at a time and register them as sub-modules
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(Head(head_size))
        self.heads = nn.ModuleList(head_modules)

    def forward(self, x):
        # Run every head on the same input ...
        head_outputs = [head(x) for head in self.heads]
        # ... and join their results along the last (channel) dimension
        return torch.cat(head_outputs, dim=-1)

In [187]:
class BigramLanguageModel(nn.Module):
    """
    Language model with token + position embeddings followed by multi-head
    self-attention and a linear layer that produces the logits.

    Reads the notebook-level `n_embd`, `block_size` and `MultiHeadAttention`.
    """
    def __init__(self, vocab_size):
        super().__init__()
        # Create "embedding" table.
        # - Maps each token in the vocabulary to an embedding of dimension `n_embd`
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)

        # Add position embedding table
        # - Each position in the block gets its own embedding vector
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # Add linear layer between embeddings (of dimension `n_embd`)
        # and the logits (dimension `vocab_size`)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # New: self-attention heads
        # i.e. 4 heads, each of size n_embd // 4 (8 when n_embd is 32), so
        # the concatenated output has n_embd channels again
        self.sa_heads = MultiHeadAttention(4, n_embd // 4)

    def forward(self, indices, targets=None):
        """
        Compute next-token logits and, when `targets` is given, the loss.

        `indices` and `targets` are (B,T) integer tensors with
        T <= block_size. Without targets the result is
        ((B,T,vocab_size) logits, None); with targets it is
        ((B*T,vocab_size) logits, cross-entropy loss).
        """
        B, T = indices.shape

        # For each idx in `indices`, we need to fetch its corresponding logits:

        # (1) For each idx in `indices`, we first fetch its embedding
        token_emb = self.token_embedding_table(indices) # (B,T,n_embd)

        # (2) Create the position embedding for each position in the block.
        # The positions are created on the same device as `indices`, so the
        # lookup also works when the model and its inputs are not on the CPU.
        positions = torch.arange(T, device=indices.device)
        pos_emb = self.position_embedding_table(positions) # (T,n_embd)

        # (3) Add the token embedding to position embedding
        # - this basically means we add the embedding for each idx in `indices` to
        #   the position embedding for its position in the block
        # - note the dimension broadcasting here: (B,T,n_embd) + (T,n_embd)
        x = token_emb + pos_emb  # (B,T,n_embd)

        # (4) New: apply multi-head self-attention
        x = self.sa_heads(x)      # (B,T,n_embd)

        # (5) We then fetch the logits using the lm_head layer
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # Here C is the size of the last dimension of `logits` (vocab_size)
            B, T, C = logits.shape

            # We want to flatten `logits` so that we have a total of B*T
            # feature vectors of length C.
            logits = logits.view(B*T, C)

            # Also flatten `targets` so that it contains B*T target outputs
            # for each of the feature vectors in `logits`.
            targets = targets.view(B*T)

            # Compute loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, indices, max_new_tokens):
        """
        Extend the (B,T) context `indices` by `max_new_tokens` sampled
        tokens and return the resulting (B,T+max_new_tokens) tensor.
        """
        for _ in range(max_new_tokens):
            # We need to crop the context to the last `block_size` tokens;
            # otherwise it won't fit into our position_embedding_table
            indices_cropped = indices[:, -block_size:]

            # Get predictions;
            # `logits` is (B,T,vocab_size)
            logits, loss = self(indices_cropped) # calls forward()

            # `logits` contains the logits for every index in `indices_cropped`,
            # but we actually only need the last time step in each batch
            logits = logits[:, -1, :] # becomes (B,vocab_size)

            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B,vocab_size)

            # Sample from the probability distribution
            next_idx = torch.multinomial(probs, num_samples=1) # (B,1)

            # Append sampled index to the context for each batch
            indices = torch.cat((indices, next_idx), dim=1) # (B,T+1)

        return indices

### Train the model

In [188]:
# Start from a freshly initialized model (now with multi-head self-attention)
m = BigramLanguageModel(vocab_size)

# AdamW optimizer over all model parameters
model_params = m.parameters()
optimizer = torch.optim.AdamW(model_params, lr=1e-3)

In [189]:
# Number of sequences per training batch
batch_size = 32

# Number of optimization steps, and how often the loss is reported
iterations = 50000
print_interval = 5000

for step in range(iterations):
    # Draw a fresh random batch from the training split
    xx, yy = get_batch('train', batch_size)

    # Forward pass, then clear old gradients, backpropagate and update
    logits, loss = m(xx, yy)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report on the first step, the last step and every `print_interval` steps
    is_first = step == 0
    is_last = step == iterations - 1
    at_interval = (step + 1) % print_interval == 0
    if is_first or is_last or at_interval:
        print(f"Loss at step {step+1}: {loss.item()}")

Loss at step 1: 4.431545257568359
Loss at step 5000: 2.4055771827697754
Loss at step 10000: 2.2029716968536377
Loss at step 15000: 2.145434617996216
Loss at step 20000: 2.032653570175171
Loss at step 25000: 2.0246944427490234
Loss at step 30000: 1.9365017414093018
Loss at step 35000: 2.381767511367798
Loss at step 40000: 2.0720951557159424
Loss at step 45000: 1.8479046821594238
Loss at step 50000: 1.978032112121582


### Generate some text

In [190]:
# Sample 200 new tokens from the trained model, starting from token id 0
start_context = torch.zeros((1, 1), dtype=torch.long)
gen = m.generate(indices=start_context, max_new_tokens=200)
print("Generated:")
generated_ids = gen[0].tolist()
print(decode(generated_ids))

Generated:

whathery the
soutles htime. Emma. Emmake, thoust ewng
att to to reljeser ther being gively caratill amorselway
thoust hermith inf willy mus.”

“He reas had of an tokes the being. Henif ith anvaning, e
