<a href="https://colab.research.google.com/github/heerboi/AI-from-scratch/blob/main/gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Following Andrej's video: https://www.youtube.com/watch?v=kCc8FmEb1nY

In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt

--2025-07-30 10:33:05--  https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-07-30 10:33:05 (22.0 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()

In [3]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

print(encode("Hii"))
print(decode(encode("Hii")))

[20, 47, 47]
Hii


In [6]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [7]:
split = int(0.9*len(data))
train_data = data[:split]
val_data = data[split:]
print(len(train_data))
print(len(val_data))

1003854
111540


In [8]:
#context length

block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(context, target)

tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)


In [10]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):

    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])

    return x, y

x, y = get_batch("train")
print(x)
print(y)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        self.token_embedding_table = nn.Embedding(num_embeddings = vocab_size, embedding_dim = vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape

            logits = logits.view(B*T, C)
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            # last time step for each batch and include all embeddings
            logits = logits[:, -1, :]

            probabilities = F.softmax(logits, dim=1)
            # (B, 1)
            next_idx = torch.multinomial(probabilities, num_samples=1)
            # (B, T+1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx

m = BigramLanguageModel(vocab_size)
out, loss = m(x, y)
print(out.shape)
print(out)

print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor([[ 1.6347, -0.0518,  0.4996,  ...,  0.2432,  1.1519,  0.9950],
        [ 0.3418, -0.9276,  1.2381,  ...,  1.5018, -0.5266,  0.2354],
        [ 0.1479, -0.4333,  0.5203,  ...,  0.3302,  1.5454,  1.3778],
        ...,
        [-0.5693, -0.0735,  0.7743,  ..., -0.0815, -1.1445, -0.0623],
        [ 0.4658, -0.2573, -1.0673,  ...,  1.2439,  1.3471,  1.6910],
        [-0.4553,  0.0139,  0.9309,  ...,  0.0290, -0.7568,  0.8701]],
       grad_fn=<ViewBackward0>)

l-QYjt'CL?jLDuQcLzy'RIo;'KdhpV
vLixa,nswYZwLEPS'ptIZqOZJ$CA$zy-QTkeMk x.gQSFCLg!iW3fO!3DGXAqTsq3pdgq


In [12]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [13]:
batch_size = 32

for steps in range(5000):
    xb,yb = get_batch('train')

    logits, loss = m(xb,yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.5904765129089355


In [14]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))


AnzLLOjad ws amangano s nd TINTI zXave anlfedo cor be:ghere
D:
IO, n!qXDYd inks cerVis benes'sowr wnof pre, hairal.
Tir W:
Ano u
Bue;BOgfatho rend thich,
The tTorak,

D:
GHisok R.
Jeveauee! ce,
OMA::'
t.
HELEThRird maseestindovive wat boue nodgh flle, mOf 

SCarXto An twl;thind d me sesete gksow'DX'llaronte

HEThecarmoar, file to he bis wllagGr O g.oamacl off nrop'twindg,
WS trnen t mllel $Lupzy iBOFL$KENor-fo ed dONus s are th, al ent I s l:
E-HEThond
Tge henJe long bo ave ttheKar f : t
Y;-fe m blqxp;t;the.Ren illd b.'t Qght tthpe,
CEN:
FLOfaind Rl, An, WS:OUk,e m?

OLy iow yous
Youtiveas t fye tea;
Fgs kGPMllort'Y:
Bexan t ber is e tle?Ty osWDothou d a atororetheang nS:
ARCAses inowifowe
'thin ICA,
CCFlformymsas twhatl fo the y wey youe.
O:urmerzio, or m, stlly il, illled tthereSAnce cesed me n.
TfaRGLO:
Son tyJee garavensty d .
K:um&zXENTESers ip'le,Of s mave, pl as feARS:HENINC pes:
S:
GI tearor-aks de
FRO, woCIs Yeaitst h lldok h g br t 'thenaroled.Bo'-ce pt, ve!
Rak theto ty ond

In [15]:
eval_iters = 200
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb = get_batch(split)
            logits, loss = m(xb, yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out
estimate_loss()

{'train': tensor(2.5633), 'val': tensor(2.5757)}

## Mathematical trick in self-attention!

- have to average the logits in the time dim 0..t for logit t


In [16]:
B, T, C = 4, 8, 2
x = torch.randn(B,T,C)

In [None]:
div = torch.tril(torch.ones(T,T))
div /= div.sum(dim=1, keepdim=True)
xbow = div @ x

In [None]:
div

In [None]:
x[0], xbow[0]

### using softmax(infinity)

hint: e^-infinity = 0, and e^0 = 1

In [None]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei,dim=1)
xbow3 = wei @ x
wei

## A bit about attention

- Attention is just a mechanism that adds a set of values with a set of weights. The approach above takes the weights to be equally distributed for the node itself and the nodes before, and zero for all nodes after.

- But, the current node might find more of what it needs from some nodes rather than others; it won't necessarily be equally distributed.

- Paper proposes an attention function where each node (token) at time T emits a query vector that contains the information that the current node is looking for, and a key vector that contains the information that the current node has within itself.

- This query vector and key vector get multiplied together to get the "affinities" between what the nodes are looking for and what the nodes have (T, T dimension, so each combination)

- Instead of taking the average of each node, we perform softmax on this new matrix. Now, instead of multiplying the "original" values $x$, we multiply it with the "value" matrix, which is different for each attention "head"

- As each head has a different purpose, it will have a different value to emit in each head, a different value that it posesses that makes more sense for that particular head!

In [23]:
head_size = 16
Q = nn.Linear(C, head_size, bias=False)
K = nn.Linear(C, head_size, bias=False)
V = nn.Linear(C, head_size, bias=False)

queries = Q(x)
keys = K(x)

print(queries.shape)
print(keys.shape)

torch.Size([4, 8, 16])
torch.Size([4, 8, 16])


In [30]:
T

8

In [24]:
tril = torch.tril(torch.ones(T, T))
wei = keys @ queries.transpose(-2, -1) # (4, 8, 8)
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei,dim=1)

values = V(x)

xbow4 = wei @ values
print(wei.shape)
print(xbow4.shape)

torch.Size([4, 8, 8])
torch.Size([4, 8, 16])


In [25]:
wei[0]

tensor([[0.1540, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1207, 0.1454, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1950, 0.1820, 0.3972, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1833, 0.1776, 0.4644, 0.0035, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0721, 0.1140, 0.0164, 0.3823, 0.2233, 0.0000, 0.0000, 0.0000],
        [0.0670, 0.1103, 0.0140, 0.3956, 0.2275, 0.4462, 0.0000, 0.0000],
        [0.0950, 0.1298, 0.0380, 0.1529, 0.2542, 0.3053, 0.3855, 0.0000],
        [0.1130, 0.1409, 0.0699, 0.0657, 0.2950, 0.2485, 0.6145, 1.0000]],
       grad_fn=<SelectBackward0>)

there's a little problem tho

In [27]:
query = torch.randn((4, 8, 16))
key = torch.randn((4, 8, 16))

print(query.var())
print(key.var())

tensor(1.0283)
tensor(0.9676)


In [28]:
qk = key @ query.transpose(-2, -1)
print(qk.var())

tensor(14.4435)


HUGE difference in variance, and when variance is high, means the difference between the values is huge. Since we'll apply softmax on this, if the values are very imbalanced, there'll be a huge imbalance in the weight assigned to other nodes, esp when the network is still untrained.

The paper proposes dividing the multiplication by the square root of head size, let's try it.

In [29]:
qk = key @ query.transpose(-2, -1) * head_size**-0.5
print(qk.var())

tensor(0.9027)


looks good

In [31]:
head_size = 16
n_embd = 32

In [61]:
class SingleAttentionHead(nn.Module):

    def __init__(self, head_size):
        super().__init__()

        self.Q = nn.Linear(n_embd, head_size, bias=False)
        self.K = nn.Linear(n_embd, head_size, bias=False)
        self.V = nn.Linear(n_embd, head_size, bias=False)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        # (B, T, head_size)
        queries = self.Q(x)
        keys = self.K(x)
        values = self.V(x)

        wei = keys @ queries.transpose(-2, -1) * head_size ** -0.5
        wei = wei.masked_fill(self.tril == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        x = wei @ values
        return x

In [62]:
class Decoder(nn.Module):

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        self.lm_head = SingleAttentionHead(head_size)
        self.nn = nn.Linear(head_size, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        embeds = self.token_embedding_table(idx)
        pos_embd = self.position_embedding_table(torch.arange(T))

        x = embeds + pos_embd
        x = self.lm_head(x)

        logits = self.nn(x)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape

            logits = logits.view(B*T, C)
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # pick only last 8 tokens for next token
            idx_next = idx[:, -block_size:]
            logits, loss = self(idx_next)
            # last time step for each batch and include all embeddings
            logits = logits[:, -1, :]

            probabilities = F.softmax(logits, dim=1)
            # (B, 1)
            next_idx = torch.multinomial(probabilities, num_samples=1)
            # (B, T+1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx

In [63]:
xb, yb = get_batch('train')

In [64]:
m = Decoder()
out, loss = m(xb, yb)
print(out.shape)
print(out)

print(decode(m.generate(torch.zeros((1,8), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([4096, 65])
tensor([[ 0.0181, -0.1023,  0.8002,  ..., -0.2845, -0.4061,  0.2571],
        [-0.0325, -0.1428,  0.5561,  ..., -0.4184, -0.0719,  0.2323],
        [ 0.0309,  0.1056,  0.3253,  ..., -0.4498,  0.1294,  0.1672],
        ...,
        [-0.0928,  0.5668,  0.2997,  ..., -0.1537, -0.6549, -0.3078],
        [-0.2465,  0.2805,  0.2938,  ..., -0.1495, -0.5909, -0.4177],
        [-0.2500,  0.2215,  0.2390,  ..., -0.2054, -0.6350, -0.3636]],
       grad_fn=<ViewBackward0>)








MgfQ3ACMksXFk&mhDQWXBAr&uSNFmslpK.yCk$KJNoQ,l.h$i$COSQqJ,!b,TbIwkoHfm-FLP$ SIOpc$w?u'dClW?HJKc-fcO-a


In [65]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [67]:
batch_size = 512

for steps in range(5000):
    xb,yb = get_batch('train')

    logits, loss = m(xb,yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.2804243564605713


In [68]:
estimate_loss()

{'train': tensor(2.3247), 'val': tensor(2.3605)}

In [69]:
print(decode(m.generate(torch.zeros((1,8), dtype=torch.long), max_new_tokens=1000)[0].tolist()))









Bis: gturd yelou be arsse oud is cily fronokle for, theat, whar nend.

Th ithrs Proknou my, ire:
Pe,
Monotohargtht nod anken.

Wher swabpsed awe whe:
Thingeep, then lf bens th yover; Evemanif the, En scabins frovead ito burd;
We, quut I. I:
A I vyour wans thasin, ot
Herw awno to' cent, thed Lorce dece Cad beaivelyot Pudr hye ho hee der, ly.
An:
Sugr tut winat il.

LARESS:
O yof thelr durnontcet to se friat iffo hert--
WAUnd masius thaljotrousereyn, tan 'Wcienliscus do fof bithean Wlie'
d pom, adrdy allllatirt; dw eacedlos Rcexeeses omowrr fl thin, rd:
Norpe sa nd ach Per'

Thon owuls hy it peat ancod lisen mol oroon to hin ve,
Frirener, hith athingond praie inle hat icke st haverhorend Clol ntoonc'lld arben.

Yoth ato, the;
E:
Voen.

AN:
Beee ds tormin ise thond:
S omoul I icred th he os wans not os ong mwadu nowin-s I:
IOrd hou oflo oul bese we swsbey orinop.
I maghr ser piral-hannggen dr'e
Whe dwass.

Sor theun atllt anat Dithe it lfe whot lly thavy to vowau that mabe ame I:
