<a href="https://colab.research.google.com/github/heerboi/AI-from-scratch/blob/main/gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Following Andrej's video: https://www.youtube.com/watch?v=kCc8FmEb1nY

In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt

--2025-08-14 09:59:48--  https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-08-14 09:59:48 (22.6 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()

In [3]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

print(encode("Hii"))
print(decode(encode("Hii")))

[20, 47, 47]
Hii


In [6]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [7]:
split = int(0.9*len(data))
train_data = data[:split]
val_data = data[split:]
print(len(train_data))
print(len(val_data))

1003854
111540


In [8]:
#context length

block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(context, target)

tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)


In [10]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):

    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])

    return x, y

x, y = get_batch("train")
print(x)
print(y)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        self.token_embedding_table = nn.Embedding(num_embeddings = vocab_size, embedding_dim = vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape

            logits = logits.view(B*T, C)
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            # last time step for each batch and include all embeddings
            logits = logits[:, -1, :]

            probabilities = F.softmax(logits, dim=1)
            # (B, 1)
            next_idx = torch.multinomial(probabilities, num_samples=1)
            # (B, T+1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx

m = BigramLanguageModel(vocab_size)
out, loss = m(x, y)
print(out.shape)
print(out)

print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor([[ 1.6347, -0.0518,  0.4996,  ...,  0.2432,  1.1519,  0.9950],
        [ 0.3418, -0.9276,  1.2381,  ...,  1.5018, -0.5266,  0.2354],
        [ 0.1479, -0.4333,  0.5203,  ...,  0.3302,  1.5454,  1.3778],
        ...,
        [-0.5693, -0.0735,  0.7743,  ..., -0.0815, -1.1445, -0.0623],
        [ 0.4658, -0.2573, -1.0673,  ...,  1.2439,  1.3471,  1.6910],
        [-0.4553,  0.0139,  0.9309,  ...,  0.0290, -0.7568,  0.8701]],
       grad_fn=<ViewBackward0>)

l-QYjt'CL?jLDuQcLzy'RIo;'KdhpV
vLixa,nswYZwLEPS'ptIZqOZJ$CA$zy-QTkeMk x.gQSFCLg!iW3fO!3DGXAqTsq3pdgq


In [12]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [13]:
batch_size = 32

for steps in range(5000):
    xb,yb = get_batch('train')

    logits, loss = m(xb,yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.5904765129089355


In [14]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))


AnzLLOjad ws amangano s nd TINTI zXave anlfedo cor be:ghere
D:
IO, n!qXDYd inks cerVis benes'sowr wnof pre, hairal.
Tir W:
Ano u
Bue;BOgfatho rend thich,
The tTorak,

D:
GHisok R.
Jeveauee! ce,
OMA::'
t.
HELEThRird maseestindovive wat boue nodgh flle, mOf 

SCarXto An twl;thind d me sesete gksow'DX'llaronte

HEThecarmoar, file to he bis wllagGr O g.oamacl off nrop'twindg,
WS trnen t mllel $Lupzy iBOFL$KENor-fo ed dONus s are th, al ent I s l:
E-HEThond
Tge henJe long bo ave ttheKar f : t
Y;-fe m blqxp;t;the.Ren illd b.'t Qght tthpe,
CEN:
FLOfaind Rl, An, WS:OUk,e m?

OLy iow yous
Youtiveas t fye tea;
Fgs kGPMllort'Y:
Bexan t ber is e tle?Ty osWDothou d a atororetheang nS:
ARCAses inowifowe
'thin ICA,
CCFlformymsas twhatl fo the y wey youe.
O:urmerzio, or m, stlly il, illled tthereSAnce cesed me n.
TfaRGLO:
Son tyJee garavensty d .
K:um&zXENTESers ip'le,Of s mave, pl as feARS:HENINC pes:
S:
GI tearor-aks de
FRO, woCIs Yeaitst h lldok h g br t 'thenaroled.Bo'-ce pt, ve!
Rak theto ty ond

In [15]:
eval_iters = 200
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb = get_batch(split)
            logits, loss = m(xb, yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out
estimate_loss()

{'train': tensor(2.5633), 'val': tensor(2.5757)}

## Mathematical trick in self-attention!

- have to average the logits in the time dim 0..t for logit t


In [16]:
B, T, C = 4, 8, 2
x = torch.randn(B,T,C)

In [17]:
div = torch.tril(torch.ones(T,T))
div /= div.sum(dim=1, keepdim=True)
xbow = div @ x

In [18]:
div

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [19]:
x[0], xbow[0]

(tensor([[ 0.5438,  0.8303],
         [ 0.2363,  0.3804],
         [ 0.2687,  1.9010],
         [ 2.5304, -0.8176],
         [-0.8352, -0.0982],
         [-0.3931, -0.8298],
         [-0.8375,  0.8008],
         [-0.2700,  0.7297]]),
 tensor([[0.5438, 0.8303],
         [0.3901, 0.6053],
         [0.3496, 1.0372],
         [0.8948, 0.5735],
         [0.5488, 0.4392],
         [0.3918, 0.2277],
         [0.2162, 0.3096],
         [0.1554, 0.3621]]))

### using softmax(infinity)

hint: e^-infinity = 0, and e^0 = 1

In [20]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei,dim=1)
xbow3 = wei @ x
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

## A bit about attention

- Attention is just a mechanism that adds a set of values with a set of weights. The approach above takes the weights to be equally distributed for the node itself and the nodes before, and zero for all nodes after.

- But, the current node might find more of what it needs from some nodes rather than others; it won't necessarily be equally distributed.

- Paper proposes an attention function where each node (token) at time T emits a query vector that contains the information that the current node is looking for, and a key vector that contains the information that the current node has within itself.

- This query vector and key vector get multiplied together to get the "affinities" between what the nodes are looking for and what the nodes have (T, T dimension, so each combination)

- Instead of taking the average of each node, we perform softmax on this new matrix. Now, instead of multiplying the "original" values $x$, we multiply it with the "value" matrix, which is different for each attention "head"

- As each head has a different purpose, it will have a different value to emit in each head, a different value that it posesses that makes more sense for that particular head!

In [21]:
head_size = 16
Q = nn.Linear(C, head_size, bias=False)
K = nn.Linear(C, head_size, bias=False)
V = nn.Linear(C, head_size, bias=False)

queries = Q(x)
keys = K(x)

print(queries.shape)
print(keys.shape)

torch.Size([4, 8, 16])
torch.Size([4, 8, 16])


In [22]:
T

8

In [23]:
tril = torch.tril(torch.ones(T, T))
wei = keys @ queries.transpose(-2, -1) # (4, 8, 8)
# wei = torch.zeros((T, T))
wei1 = wei.masked_fill(tril==0, float('-inf'))
wei1 = F.softmax(wei1, dim=1)
wei = F.softmax(wei,dim=1)

values = V(x)

xbow4 = wei @ values
xbow5 = wei1 @ values
print(wei.shape)
print(xbow4.shape)

torch.Size([4, 8, 8])
torch.Size([4, 8, 16])


In [24]:
wei[0], xbow4[0]

(tensor([[1.5395e-01, 1.4421e-01, 1.1343e-01, 9.5902e-02, 4.4569e-02, 8.0656e-02,
          4.1523e-02, 9.1523e-02],
         [1.0330e-01, 1.2122e-01, 9.1733e-02, 1.5713e-02, 8.1273e-02, 1.0787e-01,
          7.5153e-02, 1.1028e-01],
         [3.4110e-01, 2.0520e-01, 2.8693e-01, 7.2422e-01, 2.0038e-02, 4.1794e-02,
          2.5723e-02, 9.2891e-02],
         [5.7472e-02, 9.2028e-02, 1.4103e-02, 1.4784e-01, 5.7326e-02, 2.1030e-01,
          2.0382e-02, 4.3891e-02],
         [6.0308e-02, 9.6383e-02, 1.0005e-01, 4.2413e-04, 2.4542e-01, 1.5076e-01,
          2.8161e-01, 1.8877e-01],
         [3.6409e-02, 7.6905e-02, 4.7700e-02, 1.8961e-04, 3.6141e-01, 2.3420e-01,
          3.0864e-01, 1.6644e-01],
         [1.2180e-01, 1.3140e-01, 1.9813e-01, 3.8900e-03, 1.0870e-01, 8.6129e-02,
          1.5240e-01, 1.7223e-01],
         [1.2566e-01, 1.3266e-01, 1.4792e-01, 1.1817e-02, 8.1271e-02, 8.8295e-02,
          9.4568e-02, 1.3398e-01]], grad_fn=<SelectBackward0>),
 tensor([[-1.2461e-01, -2.5957e-01,

In [25]:
wei1[0], xbow5[0]

(tensor([[0.1540, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1033, 0.1417, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3411, 0.2398, 0.3610, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0575, 0.1075, 0.0177, 0.9006, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0603, 0.1126, 0.1259, 0.0026, 0.3080, 0.0000, 0.0000, 0.0000],
         [0.0364, 0.0899, 0.0600, 0.0012, 0.4536, 0.5731, 0.0000, 0.0000],
         [0.1218, 0.1535, 0.2493, 0.0237, 0.1364, 0.2108, 0.6171, 0.0000],
         [0.1257, 0.1550, 0.1861, 0.0720, 0.1020, 0.2161, 0.3829, 1.0000]],
        grad_fn=<SelectBackward0>),
 tensor([[-4.8035e-02, -8.7536e-02,  1.0568e-01, -2.7354e-02,  8.6838e-03,
          -2.7992e-03,  7.8406e-02,  7.3124e-02,  4.4261e-02, -3.8028e-02,
           3.9561e-02,  3.8676e-02,  7.9456e-02,  5.8628e-02,  2.5569e-02,
          -1.5935e-02],
         [-5.2818e-02, -9.4532e-02,  1.1509e-01, -2.8652e-02,  9.7027e-03,
          -3.0309e-03,  8.5020e-02,  7.

there's a little problem tho

In [26]:
query = torch.randn((4, 8, 16))
key = torch.randn((4, 8, 16))

print(query.var())
print(key.var())

tensor(0.9244)
tensor(0.8984)


In [27]:
qk = key @ query.transpose(-2, -1)
print(qk.var())

tensor(12.8625)


HUGE difference in variance, and when variance is high, means the difference between the values is huge. Since we'll apply softmax on this, if the values are very imbalanced, there'll be a huge imbalance in the weight assigned to other nodes, esp when the network is still untrained.

The paper proposes dividing the multiplication by the square root of head size, let's try it.

In [28]:
qk = key @ query.transpose(-2, -1) * head_size**-0.5
print(qk.var())

tensor(0.8039)


looks good

In [94]:
# num of attn heads running in parallel
n_heads = 8
# embedding size
# all layer final outputs must match 256
n_embd = 256

# individual heads are concat at the end
head_size = n_embd // n_heads

# size of ffn hidden layer
hidden_size = 512

# total number of stacked transformer blocks
n_blocks = 4

In [95]:
class SingleAttentionHead(nn.Module):

    def __init__(self):
        super().__init__()
        self.Q = nn.Linear(n_embd, head_size, bias=False)
        self.K = nn.Linear(n_embd, head_size, bias=False)
        self.V = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        # (B, T, head_size)
        queries = self.Q(x)
        keys = self.K(x)
        values = self.V(x)

        wei = keys @ queries.transpose(-2, -1) * head_size ** -0.5
        wei = wei.masked_fill(self.tril == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        x = wei @ values
        return x

class FFN(nn.Module):

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.layers = nn.Sequential(
            # op: (B, T, hidden_size)
            nn.Linear(in_features, hidden_size, bias=bias),
            nn.ReLU(),
            # op: (B, T, n_embd)
            nn.Linear(hidden_size, out_features, bias=bias),
            nn.LayerNorm(out_features)
        )

    def forward(self, x):
        out = x + self.layers(x)
        return out

In [97]:
class MultiAttentionBlock(nn.Module):

    def __init__(self):
        super().__init__()
        self.heads = [SingleAttentionHead() for _ in range(n_heads)]

        self.linear = nn.Linear(n_embd, n_embd)

        self.layer_norm = nn.LayerNorm(n_embd)

    def forward(self, x):
        # each op: (B, T, head_size)
        act = [head(x) for head in self.heads]
        # op: (B, T, n_embd)
        out = x + self.layer_norm(self.linear(torch.concat(act, dim=-1)))

        return out

class DecoderBlock(nn.Module):
    def __init__(self):
        super().__init__()

        self.multi_attention_block = MultiAttentionBlock()
        self.ffn = FFN(n_embd, n_embd)

    def forward(self, x):
        x = self.ffn(self.multi_attention_block(x))

        return x

In [98]:
class Decoder(nn.Module):

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        # self.lm_head = SingleAttentionHead(head_size)
        # self.ffn = FFN(head_size, hidden_size)
        # self.attention_block = SingleAttentionBlock(head_size, hidden_size)

        # inp: (B, T, n_embd)
        # op:  (B, T, n_embd)
        # self.multi_head_attn = MultiAttentionBlock()

        # self.ffn = FFN(n_embd, n_embd)
        # pairs of multi head self attn blocks + ffn in sequence
        self.decoder_block = nn.Sequential(*[DecoderBlock() for _ in range(n_blocks)])

        self.nn = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embd = self.token_embedding_table(idx)
        pos_embd = self.position_embedding_table(torch.arange(T))

        x = tok_embd + pos_embd
        # x = self.lm_head(x)
        # x = self.ffn(x)
        # x = self.attention_block(x)
        # residual connections moved to their respective classes
        x = self.decoder_block(x)

        logits = self.nn(x)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape

            logits = logits.view(B*T, C)
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # pick only last 8 tokens for next token
            idx_next = idx[:, -block_size:]
            logits, loss = self(idx_next)
            # last time step for each batch and include all embeddings
            logits = logits[:, -1, :]

            probabilities = F.softmax(logits, dim=1)
            # (B, 1)
            next_idx = torch.multinomial(probabilities, num_samples=1)
            # (B, T+1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx

In [99]:
xb, yb = get_batch('train')

In [100]:
m = Decoder()
out, loss = m(xb, yb)
print(out.shape)
print(out)
print("Total parameters:")
print(sum([p.nelement() for p in m.parameters()]))

print(decode(m.generate(torch.zeros((1,8), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([2048, 65])
tensor([[-8.7318e-01, -1.9181e-01,  6.0401e-01,  ..., -1.0065e+00,
         -9.5612e-01,  1.2474e+00],
        [-1.4207e+00,  1.5129e+00, -1.2188e-01,  ..., -2.2735e-01,
         -8.6706e-02,  1.9157e+00],
        [-1.8719e+00,  8.7167e-01,  6.2719e-01,  ..., -6.3764e-01,
         -2.3367e-01,  1.9278e+00],
        ...,
        [-1.5966e+00, -2.4771e-01,  2.4216e+00,  ...,  7.9779e-01,
          2.2453e+00, -6.0235e-01],
        [-1.9299e+00,  1.1449e+00,  1.4775e+00,  ..., -1.7362e+00,
         -1.7333e+00,  2.9571e-01],
        [-1.2941e+00,  3.1306e+00,  2.6825e+00,  ...,  9.7182e-01,
         -1.1089e-03,  1.2258e+00]], grad_fn=<ViewBackward0>)
Total parameters:
1354305








Do::y?WSWodS:3,,KjjwapN ,yXj:3'o,RlDo3h,&oXphzA cjMroFuoaASy!ovyiUqcXAUWT!aEKMq;;AYzKDAAFjPykS:ooi3S


In [101]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [102]:
batch_size = 512

for steps in range(2000):
    xb,yb = get_batch('train')

    logits, loss = m(xb,yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 100 == 0:
        print(f"Loss at {steps}: {loss.item()}")

Loss at 0: 5.695481777191162
Loss at 100: 2.362318992614746
Loss at 200: 2.1631181240081787
Loss at 300: 2.0405404567718506
Loss at 400: 1.9610567092895508
Loss at 500: 1.9178212881088257
Loss at 600: 1.8719313144683838
Loss at 700: 1.8670819997787476
Loss at 800: 1.8232440948486328
Loss at 900: 1.831138253211975
Loss at 1000: 1.774194359779358
Loss at 1100: 1.7129796743392944
Loss at 1200: 1.7640798091888428
Loss at 1300: 1.747159719467163
Loss at 1400: 1.7421679496765137
Loss at 1500: 1.7068963050842285
Loss at 1600: 1.7112239599227905
Loss at 1700: 1.7106486558914185
Loss at 1800: 1.6543344259262085
Loss at 1900: 1.6784592866897583


training a bit longer bec loss still decreasing

In [104]:
batch_size = 512

for steps in range(500):
    xb,yb = get_batch('train')

    logits, loss = m(xb,yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 100 == 0:
        print(f"Loss at {steps}: {loss.item()}")

Loss at 0: 1.6687389612197876
Loss at 100: 1.7017985582351685
Loss at 200: 1.6481417417526245
Loss at 300: 1.6599197387695312
Loss at 400: 1.682623267173767


In [105]:
estimate_loss()

{'train': tensor(1.6589), 'val': tensor(1.8294)}

* transformer with single attention block, no layer norm, and no ffn: train 2.3442 val: 2.3719
* transformer with single attn, layer norm and ffn, no residual connection: train 2.2628 val 2.3023

* transformer with multi head attn + linear, layer norm, ffn and residual connection in after multihead attn: train 1.7584 val 1.9072

* transformer with multiple stacked attention-ffn blocks!: train 1.65 val 1.82

In [107]:
print(decode(m.generate(torch.zeros((1,8), dtype=torch.long), max_new_tokens=1000)[0].tolist())[8:])

LEOMENIUS:
No, gracrow death;
But not,
How the
Vollant.

DUKE VINCENTIO:
The consumber: to thou laws?

LUCIO:
And in make they day,
When entreal of neck by they life her foothes. Do his slaughter;' much as never jumb
And be in famish accup, a buried
Fouth, his camforrows.

ANGELO:
When then I had to thy tomb.
Your love
A graves late, you are the Edwarah? Comen, and he been be:
My sheer!

CAMILLIUS:
Intent on and pass
By intell not me of
Leave in but the raides stolood this buddsmines
Behear my darest chill-wer child with him strain warply ears me faults
Thy lord cord.

QUEEN ELIZABET:
Thee work offirst
Senate, and Hermone everdience of this
daughterous and hath time
Therefore, and witngrace,
I was saido.

LUCIO:
Why, must three: all they lies shallow--ground then disiners that
Why, thousand there admy father
that blusk Fherce of the would manners for his I talkest of shall perhap, grief love
My pragme,
To do? I had, I am of them I was, the kindly. Less the faults, hath news.
For you ou