# GPT From Scratch

Based of the video tutorial from Andrej Karpathy [Link](https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ)

In [3]:
# Load the dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O ../datasets/tinyshakespeare/input.txt

--2024-04-08 13:07:00--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘../datasets/tinyshakespeare/input.txt’


2024-04-08 13:07:00 (2.29 MB/s) - ‘../datasets/tinyshakespeare/input.txt’ saved [1115394/1115394]



In [10]:
# Install dependencies
!pip install torch torchvision

Collecting torch
  Using cached torch-2.2.2-cp312-none-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting torchvision
  Using cached torchvision-0.17.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting filelock (from torch)
  Using cached filelock-3.13.3-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch)
  Using cached sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting jinja2 (from torch)
  Using cached Jinja2-3.1.3-py3-none-any.whl.metadata (3.3 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Collecting numpy (from torchvision)
  Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Collecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Using cached pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.2 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Using cached MarkupSafe-

In [258]:
# Import dependencies

import torch
import torch.nn as nn
from torch.nn import functional as F


In [275]:
# Configuration

should_train_large_scale = False

if should_train_large_scale:
    batch_size = 64 # How many independent sequences to process in parallel
    block_size = 256 # The maximum context length for predictions
    num_heads = 6 # The number of attention heads
    max_epochs = 5000 # The maximum number of epochs to train for
    eval_interval = 500
    learning_rate = 3e-4
    eval_iters = 200
    num_embedding_dimensions = 384
    num_layers = 6
    dropout = 0.2
else:
    batch_size = 32 # How many independent sequences to process in parallel
    block_size = 8 # The maximum context length for predictions
    num_heads = 4 # The number of attention heads
    max_epochs = 5000 # The maximum number of epochs to train for
    eval_interval = 300
    learning_rate = 1e-3
    eval_iters = 200
    num_embedding_dimensions = 32
    num_layers = 3
    dropout = 0.1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on large scale: {should_train_large_scale}, Running on {device}")

torch.manual_seed(1337)

Training on large scale: False, Running on cpu


<torch._C.Generator at 0x120e23310>

In [276]:
with open('../datasets/tinyshakespeare/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print (f"Lenght of text: {len(text)}")

Lenght of text: 1115394


In [277]:
# Generate tokens (on a character level)
chars = sorted(list(set(text)))
vocab_size = len(chars)

print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [278]:
# Create mappings between characters and integer tokens
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s] # Convert a string to a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # Convert a list of integers to a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [279]:
data = torch.tensor(encode(text), dtype=torch.long)
print(f"data.shape: {data.shape}, data.dtype: {data.dtype}")
print(data[:1000])

# Split out data into training and validation
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

print(f"len(train_data): {len(train_data)}, len(val_data): {len(val_data)}")

assert len(train_data) == int(0.9*len(data))
assert len(val_data) == len(data) - len(train_data) 

data.shape: torch.Size([1115394]), data.dtype: torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  

In [280]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    bigram_lm.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = bigram_lm(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    bigram_lm.train()
    return out

In [281]:
# custom implementation for reference.
class LayerNorm1d:

    def __init__(self, dimensions, epsilon=1e-5):
        self.epsilon = epsilon
        self.gamma = torch.ones(dimensions)
        self.beta = torch.zeros(dimensions)

    def __call__(self, x):
        xmean = x.mean(1, keepdim=True) # batch mean
        xvar = x.var(1, keepdim=True) # batch variance
        xhat = (x - xmean) / torch.sqrt(xvar + self.epsilon) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta # scale and shift
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100)
x = module(x)
x.shape

x[:,0].mean(), x[:,0].std()

(tensor(0.1469), tensor(0.8803))

In [282]:
class Head(nn.Module):
    """ One head self attention """
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(num_embedding_dimensions, head_size, bias=False)
        self.query = nn.Linear(num_embedding_dimensions, head_size, bias=False)
        self.value = nn.Linear(num_embedding_dimensions, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # (B, T, C)
        q = self.query(x) # (B, T, C)

        # compute attention scores ("affinities")
        weigths = q @ k.transpose(-2, -1) * C ** -0.5 # (B, T, C) @ (B, C, T) = (B, T, T)
        weigths = weigths.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weigths = F.softmax(weigths, dim=-1) # (B, T, T)
        weigths = self.dropout(weigths)

        # Perform the weighted aggregation of the values
        v = self.value(x)
        out = weigths @ v # (B, T, T) @ (B, T, C) = (B, T, C)
        return out



In [283]:
# Multi-head attention model
class MultiHeadAttention(nn.Module):
    """ Multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(num_heads * head_size, num_embedding_dimensions)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.projection(out)
        out = self.dropout(out)
        return out

In [284]:
class FeedForward(nn.Module):
    """ A simple feed-forward network """

    def __init__(self, num_embeddings):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_embeddings, 4 * num_embeddings),
            nn.ReLU(),
            nn.Linear(4 * num_embeddings, num_embeddings),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [285]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, num_embeddings, num_heads) -> None:
        super().__init__()
        head_size = num_embeddings // num_heads
        self.self_attention = MultiHeadAttention(num_heads, head_size)
        self.feed_forward = FeedForward(num_embeddings)
        self.layer_norm_1 = nn.LayerNorm(num_embeddings)
        self.layer_norm_2 = nn.LayerNorm(num_embeddings)

    def forward(self, x):
        x = x + self.self_attention(self.layer_norm_1(x))
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x



In [286]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, num_embedding_dimensions)
        self.position_embedding_table = nn.Embedding(block_size, num_embedding_dimensions)
        self.blocks = nn.Sequential(*[Block(num_embedding_dimensions, num_heads) for _ in range(num_layers)])
        self.layer_norm_final = nn.LayerNorm(num_embedding_dimensions)
        self.lm_head = nn.Linear(num_embedding_dimensions, vocab_size)
        
    def forward(self, index, targets = None):
        B, T = index.shape

        # index and targets are both (Batch, Time) tensor of integers        
        token_embeddings = self.token_embedding_table(index) # (Batch, Time, Channel)
        position_embeddings = self.position_embedding_table(torch.arange(T, device=device)) # (Time, Channel)
        x = token_embeddings + position_embeddings # (Batch, Time, Channel)
        x = self.blocks(x) # (Batch, Time, Channel)
        x = self.layer_norm_final(x) # (Batch, Time, Channel)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            index_condition = index[:, -block_size:]
            # Get predictions
            logits, loss = self(index_condition)
            # Focus only on the last token (timestep)
            logits = logits[:, -1, :] # (Batch, Vocab)
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (Batch, Vocab)
            # Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (Batch, 1)
            # Append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (Batch, Time + 1)
        return index

bigram_lm = BigramLanguageModel()
m = bigram_lm.to(device)

In [287]:
optimizer = torch.optim.Adam(bigram_lm.parameters(), lr=learning_rate)

batch_size = 32
for iter in range(max_epochs):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"Epoch: {iter}, Train loss: {losses['train']:.4f}, Val loss: {losses['val']:.4f}")

    # sampel a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    token_embeddings, loss = bigram_lm(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Epoch: 0, Train loss: 4.3794, Val loss: 4.3774
Epoch: 300, Train loss: 2.5708, Val loss: 2.5521
Epoch: 600, Train loss: 2.4070, Val loss: 2.4186
Epoch: 900, Train loss: 2.3350, Val loss: 2.3406
Epoch: 1200, Train loss: 2.2836, Val loss: 2.2915
Epoch: 1500, Train loss: 2.2479, Val loss: 2.2611
Epoch: 1800, Train loss: 2.1998, Val loss: 2.2313
Epoch: 2100, Train loss: 2.1766, Val loss: 2.2058
Epoch: 2400, Train loss: 2.1659, Val loss: 2.1773
Epoch: 2700, Train loss: 2.1450, Val loss: 2.1787
Epoch: 3000, Train loss: 2.1312, Val loss: 2.1511
Epoch: 3300, Train loss: 2.1103, Val loss: 2.1438
Epoch: 3600, Train loss: 2.0991, Val loss: 2.1455
Epoch: 3900, Train loss: 2.0886, Val loss: 2.1307
Epoch: 4200, Train loss: 2.0853, Val loss: 2.1144
Epoch: 4500, Train loss: 2.0705, Val loss: 2.1106
Epoch: 4800, Train loss: 2.0542, Val loss: 2.1049


In [288]:
context = torch.zeros((1, 1), dtype=torch.long).to(device)
print(decode(bigram_lm.generate(context, max_new_tokens=500)[0].tolist()))


And the shade praysig
May am kingshat as dikes monk uld Rdowzecue mut well'd dele,

Gown, afted
As ren unseme thath knomy good for blided kneart nown od,
Thy hat kny,
Frims and bucdeminn untuanteend, suep sapas shizt all kno,
Forst;
Wher thau dett I fall Jasplifence,
Triruege of fling pack.

KIVN IVE- ba yor with non arnayascen
Friceant,
Yelears son young wofs,
What beain Gurman, shou, dace tom; and and ame!
I'llove amioce; fror:
Of dands wotrngtherick,
By beftackne
What have adraigh browy anibl


# The mathematical trick in self-attention

In [92]:
torch.manual_seed(1337) 
B,T,C  = 4, 8, 2 # Batch size, Time, Channels
x = torch.randn(B, T, C)
x.shape

# bow = bag of words
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1,:] # (t, C)
        xbow[b,t] = torch.mean(xprev, dim=0)


In [95]:
print(x[0])
print(xbow[0])

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [104]:
# version 2
weigths = torch.tril(torch.ones(T, T))
weigths = weigths / weigths.sum(1, keepdim=True)
weigths
xbow2 = weigths @ x # (B, T, T) @ (B, T, C) -> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [105]:
# version 3
tril = torch.tril(torch.ones(T, T))
weigths = torch.zeros((T,T))
weigths = weigths.masked_fill(tril == 0, float('-inf'))
weigths = F.softmax(weigths, dim=1)
xbow3 = weigths @ x
torch.allclose(xbow, xbow3)

True

In [139]:
# Version 4: seld-attention!
torch.manual_seed(1337)
B,T,C = 4, 8, 32 # Batch, Time, Channels
x = torch.randn(B, T, C)

# Let's see a single head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)

weigths = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# weigths = torch.zeros((T,T))
weigths = weigths.masked_fill(tril == 0, float('-inf'))
weigths = F.softmax(weigths, dim=-1)
out = weigths @ x

v = value(x)
out = weigths @ v
out.shape

torch.Size([4, 8, 16])

In [97]:
# Tril stands for triangle lower
torch.tril(torch.ones(3,3,))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [152]:
torch.manual_seed(42)
#a  = torch.ones(3,3)
#a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, dim=1, keepdim=True)
b = torch.randint(0,10, (3,2)).float()
c = a @ b
print(f"a= {a}")
print('--')
print(f"b= {b}")
print('--')
print(f"c= {c}")
print('--')

a= tensor([[0.1667, 0.3333, 0.5000],
        [0.2667, 0.3333, 0.4000]])
--
b= tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c= tensor([[5.3333, 5.0000],
        [4.9333, 5.2000]])
--


In [153]:
# Examples of why we need scaled attention
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5 

In [154]:
print(k.var())
print(q.var())
print(wei.var())

tensor(0.9331)
tensor(0.8879)
tensor(0.8150)


In [155]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim= -1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [156]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim= -1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

# Exploration of Broadcasting in pytorch

Example without broadcasting:

In [126]:
# Creating a tensor of shape (2, 3) - imagine this as two sets of 3-channel embeddings
a = torch.tensor([[1, 2, 3], 
                  [4, 5, 6]])

# Creating another tensor of shape (2, 3) - similar structure as 'a'
b = torch.tensor([[1, 1, 1], 
                  [2, 2, 2]])

# Direct addition, no broadcasting needed as shapes are identical
result = a + b
print(result)


tensor([[2, 3, 4],
        [6, 7, 8]])


Example with broadcasting:

In [127]:
# Creating a tensor of shape (2, 3) - imagine this as two sets of 3-channel embeddings
a = torch.tensor([[1, 2, 3], 
                  [4, 5, 6]])

# Creating a tensor of shape (3,) - a single 3-channel embedding
b = torch.tensor([1, 1, 1])

# Addition with broadcasting: 'b' is automatically expanded to match the shape of 'a'
result = a + b
print(result)


tensor([[2, 3, 4],
        [5, 6, 7]])
