### OK, we've run out of space in names

let's start reading books

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
%run bookreader.py

In [5]:
alice = BookReader("alice.txt")
vocab_size = alice.vocab_size
vocab_size

40

### Get the batch with both x and y unseparated

In [4]:
def get_batch(data, batch_length=5, batch_size=5):
    # generate a small batch of data of inputs x and targets y
    ix = torch.randint(len(data) - batch_length, (batch_size,))
    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

In [12]:
train = torch.tensor(names.data[0])

### Create an attention head

In [6]:
class Head(nn.Module):

    def __init__(self, c, head_size, content_length):
        super().__init__()
        self.key = nn.Linear(c, head_size, bias=False)
        self.query = nn.Linear(c, head_size, bias=False)
        self.value = nn.Linear(c, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(content_length, content_length)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   
        q = self.query(x)
        
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        
        v = self.value(x) # (B,T,hs)
        out = wei @ v
        return out

In [43]:
from collections import OrderedDict

class FeedForward(nn.Module):
    def __init__(self, fan_in, multiplier = 4):
        super().__init__()

        layers = OrderedDict([
            ("l_in", nn.Linear(fan_in, multiplier * fan_in)),
            ("relu", nn.ReLU()),
            ("l_out", nn.Linear(multiplier * fan_in, fan_in)),
        ])
        self.net = nn.Sequential(
            layers
        )

        initial = layers['l_in']
        nn.init.kaiming_normal_(initial.weight, nonlinearity="relu")
        layers['l_in'].weight.data = initial.weight.data * 3/5
        if initial.bias is not None:
            nn.init.constant_(initial.bias, 0)

        final = layers['l_out']
        layers['l_out'].weight.data = final.weight.data * .2
        if final.bias is not None:
            nn.init.constant_(final.bias, 0)

    def forward(self, x):
        return self.net(x)

In [60]:


class FFAttention(nn.Module):

    def __init__(self, embed_size, head_size, content_length):
        super().__init__()
        
        self.vocab_embed = nn.Embedding(vocab_size, embed_size)
        self.positional_embed = nn.Embedding(content_length, embed_size)
        self.attention = Head(embed_size, head_size, content_length)
        self.ff = FeedForward(head_size)
        self.decode = nn.Linear(head_size, vocab_size)
        self.content_length = content_length

    def forward(self, idx, targets=None):
        #idx B,T
        B, T = idx.shape

        idx_e = self.vocab_embed(idx)
        # note tr is always the same - so the learning here is information passed back to the positional_embed from loss
        tr = torch.arange(T)
        pos_e = self.positional_embed(tr)

        x = idx_e + pos_e
        x = self.attention(x)
        
        x = self.ff(x)
        
        logits = self.decode(x)

        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits.view(B*T, -1), targets.resize(B*T)) # loss function

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.content_length:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [67]:
import torch.optim as optim

In [65]:
epochs = 40
training_runs = 400
batch_size = 96
context_length = 4
learning_rate = .2
head_size = 8
embedding_dimensions = 16

model = FFAttention(embedding_dimensions, head_size, context_length)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # , weight_decay=args.weight_decay, betas=(0.9, 0.99), eps=1e-8)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(epoch_loss/training_runs)
    

tensor(2.6897, grad_fn=<DivBackward0>)
tensor(2.4406, grad_fn=<DivBackward0>)
tensor(2.3813, grad_fn=<DivBackward0>)
tensor(2.3484, grad_fn=<DivBackward0>)
tensor(2.3330, grad_fn=<DivBackward0>)
tensor(2.3239, grad_fn=<DivBackward0>)
tensor(2.3134, grad_fn=<DivBackward0>)
tensor(2.2990, grad_fn=<DivBackward0>)
tensor(2.2913, grad_fn=<DivBackward0>)
tensor(2.2834, grad_fn=<DivBackward0>)
tensor(2.2776, grad_fn=<DivBackward0>)
tensor(2.2745, grad_fn=<DivBackward0>)
tensor(2.2713, grad_fn=<DivBackward0>)
tensor(2.2636, grad_fn=<DivBackward0>)
tensor(2.2631, grad_fn=<DivBackward0>)
tensor(2.2551, grad_fn=<DivBackward0>)
tensor(2.2523, grad_fn=<DivBackward0>)
tensor(2.2554, grad_fn=<DivBackward0>)
tensor(2.2477, grad_fn=<DivBackward0>)
tensor(2.2441, grad_fn=<DivBackward0>)
tensor(2.2467, grad_fn=<DivBackward0>)
tensor(2.2452, grad_fn=<DivBackward0>)
tensor(2.2434, grad_fn=<DivBackward0>)
tensor(2.2398, grad_fn=<DivBackward0>)
tensor(2.2427, grad_fn=<DivBackward0>)
tensor(2.2358, grad_fn=<D

In [66]:
idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 100).data[0].tolist()
    print(names.decode(o))


pect abkl i theten ftly abeu gh  wae quurk rald   clurg pine vend bbit whor praa theed gchis all
ome


In [70]:
class MultiHead(nn.Module):

    def __init__(self, num_heads, head_size, embed_size, content_length):
        super().__init__()
        self.heads = nn.ModuleList([Head(embed_size, head_size, content_length) for _ in range(num_heads)])

    def forward(self, x):
        out = torch.cat( [head(x) for head in self.heads], dim = -1 )
        return out
    

In [71]:
class FFMultiHeadAttention(nn.Module):

    def __init__(self, embed_size, content_length, num_heads, head_size, multiplier=4):
        super().__init__()
        
        self.vocab_embed = nn.Embedding(vocab_size, embed_size)
        self.positional_embed = nn.Embedding(content_length, embed_size)
        self.mutli_attention = MultiHead(num_heads, head_size, embed_size, content_length)
        self.lna = nn.LayerNorm(embed_size)
        self.ff = FeedForward(embed_size, multiplier)
        self.lnff = nn.LayerNorm(embed_size)
        self.decode = nn.Linear(embed_size, vocab_size)
        self.content_length = content_length

    def forward(self, idx, targets=None):
        #idx B,T
        B, T = idx.shape

        idx_e = self.vocab_embed(idx)
        # note tr is always the same - so the learning here is information passed back to the positional_embed from loss
        tr = torch.arange(T)
        pos_e = self.positional_embed(tr)

        x = idx_e + pos_e
        x = self.mutli_attention(x)
        # print("multi ball out", x.shape)
        x = self.lna(x)
        x = self.ff(x)
        x = self.lnff(x)
        # print("feed forward out", x.shape)
        logits = self.decode(x)
        # print("decode out", x.shape)
        # print("targets", targets.shape)
        # return None, None

        if targets is None:
            loss = None
        else:
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits.view(B*T, -1), targets) #.resize(B*T)) # loss function

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.content_length:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [72]:
epochs = 40
training_runs = 400
batch_size = 96
context_length = 4
learning_rate = .1
embedding_dimensions = 16
num_heads = 2
head_size = embedding_dimensions // num_heads

print(head_size)

model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

lmbda = lambda epoch: 0.95

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    print(ep, epoch_loss/training_runs, m_scheduler.get_last_lr())
    

8
0.004344 M parameters
0 tensor(2.4528, grad_fn=<DivBackward0>) [0.095]
1 tensor(2.2396, grad_fn=<DivBackward0>) [0.09025]
2 tensor(2.1978, grad_fn=<DivBackward0>) [0.0857375]
3 tensor(2.1731, grad_fn=<DivBackward0>) [0.08145062499999998]
4 tensor(2.1542, grad_fn=<DivBackward0>) [0.07737809374999999]
5 tensor(2.1246, grad_fn=<DivBackward0>) [0.07350918906249998]
6 tensor(2.1113, grad_fn=<DivBackward0>) [0.06983372960937498]
7 tensor(2.0980, grad_fn=<DivBackward0>) [0.06634204312890622]
8 tensor(2.0898, grad_fn=<DivBackward0>) [0.0630249409724609]
9 tensor(2.0836, grad_fn=<DivBackward0>) [0.05987369392383786]
10 tensor(2.0741, grad_fn=<DivBackward0>) [0.05688000922764597]
11 tensor(2.0757, grad_fn=<DivBackward0>) [0.05403600876626367]
12 tensor(2.0600, grad_fn=<DivBackward0>) [0.05133420832795048]
13 tensor(2.0632, grad_fn=<DivBackward0>) [0.04876749791155295]
14 tensor(2.0522, grad_fn=<DivBackward0>) [0.046329123015975304]
15 tensor(2.0541, grad_fn=<DivBackward0>) [0.04401266686517654

In [73]:
idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 100).data[0].tolist()
    print(names.decode(o))


 said  

at othe never exane yokle as lou mout ? 
 her appp aged 


trabs and at  che you of the a c


In [74]:
def get_val_batch(data, batch_length=5, batch_size=5, i=0):
    # generate a small batch of data of inputs x and targets y
    if i == 0:
        ix = torch.randint(len(data) - batch_length, (batch_size,))
    else:
        ix = torch.arange(1, 5) + 1 + i

    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

print(get_val_batch(train, 5, 4, 5))
print(get_val_batch(train, 5, 4, 9))

get_batch(train, 5, 4)

tensor([[31,  1, 22,  1,  0],
        [ 1, 22,  1,  0, 17],
        [22,  1,  0, 17, 28],
        [ 1,  0, 17, 28, 36]])
tensor([[ 0, 17, 28, 36, 27],
        [17, 28, 36, 27,  1],
        [28, 36, 27,  1, 33],
        [36, 27,  1, 33, 21]])


tensor([[ 1, 14,  1, 32, 33],
        [22, 32,  1, 26, 28],
        [ 1, 15, 18,  0, 20],
        [21, 18, 14, 17,  1]])

In [75]:
import math
@torch.no_grad()
def split_loss(split):
    split_len = len(split)
    total_loss = 0
    batch_size = 50
    num_batches = math.floor(split_len / batch_size)
    print("num_batches", split_len, num_batches)
    
    model.eval()

    for i in range(num_batches):

        t_b = get_val_batch(split, context_length+1, batch_size, i*batch_size)
        
        x = t_b[:, 0: context_length]
        y = t_b[:, context_length: context_length+1]
        
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, batch_loss = model(x, y)
        
        total_loss = total_loss + batch_loss
    
    print("total loss", total_loss, total_loss / num_batches)

dev = torch.tensor(names.data[1])
print(split_loss(dev), len(dev))

num_batches 11075 221
total loss tensor(440.8175) tensor(1.9946)
None 11075


In [77]:
epochs = 40
training_runs = 400
batch_size = 96
context_length = 6
learning_rate = .1
embedding_dimensions = 16
num_heads = 4
head_size = embedding_dimensions // num_heads

print(head_size)

model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters()), 'M parameters')

lmbda = lambda epoch: 0.98

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr())

4
4376 M parameters
ep 0 tensor(2.4780, grad_fn=<DivBackward0>) [0.095]
ep 1 tensor(2.1900, grad_fn=<DivBackward0>) [0.09025]
ep 2 tensor(2.1232, grad_fn=<DivBackward0>) [0.0857375]
ep 3 tensor(2.0793, grad_fn=<DivBackward0>) [0.08145062499999998]
ep 4 tensor(2.0554, grad_fn=<DivBackward0>) [0.07737809374999999]
ep 5 tensor(2.0423, grad_fn=<DivBackward0>) [0.07350918906249998]
ep 6 tensor(2.0290, grad_fn=<DivBackward0>) [0.06983372960937498]
ep 7 tensor(2.0244, grad_fn=<DivBackward0>) [0.06634204312890622]
ep 8 tensor(2.0121, grad_fn=<DivBackward0>) [0.0630249409724609]
ep 9 tensor(2.0048, grad_fn=<DivBackward0>) [0.05987369392383786]
ep 10 tensor(1.9992, grad_fn=<DivBackward0>) [0.05688000922764597]
ep 11 tensor(1.9920, grad_fn=<DivBackward0>) [0.05403600876626367]
ep 12 tensor(1.9844, grad_fn=<DivBackward0>) [0.05133420832795048]
ep 13 tensor(1.9816, grad_fn=<DivBackward0>) [0.04876749791155295]
ep 14 tensor(1.9787, grad_fn=<DivBackward0>) [0.046329123015975304]
ep 15 tensor(1.9780, 

In [78]:
print(split_loss(dev), len(dev))

idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 100).data[0].tolist()
    print(names.decode(o))

num_batches 11075 221
total loss tensor(426.6404) tensor(1.9305)
None 11075

so
was alice bot by ran to lact the saout be he the don the knen  wen sail 


 you  alice and she cr


In [79]:
epochs = 60
training_runs = 400
batch_size = 96
context_length = 8
learning_rate = .1
embedding_dimensions = 32
num_heads = 4
head_size = embedding_dimensions // num_heads

print(head_size)

model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters()), ' parameters')

lmbda = lambda epoch: 0.98

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()

    if ep % 10 == 0:
        print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr()) 

8
14408  parameters
ep 0 tensor(2.3368, grad_fn=<DivBackward0>) [0.098]
ep 1 tensor(2.0300, grad_fn=<DivBackward0>) [0.09604]
ep 2 tensor(1.9516, grad_fn=<DivBackward0>) [0.0941192]
ep 3 tensor(1.9068, grad_fn=<DivBackward0>) [0.092236816]
ep 4 tensor(1.8827, grad_fn=<DivBackward0>) [0.09039207968]
ep 5 tensor(1.8580, grad_fn=<DivBackward0>) [0.0885842380864]
ep 6 tensor(1.8434, grad_fn=<DivBackward0>) [0.086812553324672]
ep 7 tensor(1.8243, grad_fn=<DivBackward0>) [0.08507630225817855]
ep 8 tensor(1.8145, grad_fn=<DivBackward0>) [0.08337477621301498]
ep 9 tensor(1.8152, grad_fn=<DivBackward0>) [0.08170728068875467]
ep 10 tensor(1.8017, grad_fn=<DivBackward0>) [0.08007313507497958]
ep 11 tensor(1.7947, grad_fn=<DivBackward0>) [0.07847167237347999]
ep 12 tensor(1.7863, grad_fn=<DivBackward0>) [0.07690223892601039]
ep 13 tensor(1.7802, grad_fn=<DivBackward0>) [0.07536419414749018]
ep 14 tensor(1.7716, grad_fn=<DivBackward0>) [0.07385691026454037]
ep 15 tensor(1.7748, grad_fn=<DivBackward

In [80]:
print(split_loss(dev), len(dev))

idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 100).data[0].tolist()
    print(names.decode(o))

num_batches 11075 221
total loss tensor(371.4741) tensor(1.6809)
None 11075

you
nice    and there much you d i bles of eyes  the
pig   belcome    nother look       
set  if you


In [16]:
# allow mor loops without restarting
e_epochs = 10

In [None]:
epochs = 60
training_runs = 800
batch_size = 96
context_length = 12
learning_rate = .1
embedding_dimensions = 32
num_heads = 4
head_size = embedding_dimensions // num_heads

print(head_size)
# our embedding_dimensions are still 'small' so we mutliply the size our our feed forward network to make up
multiplier = 8
model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size, multiplier)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters()), ' parameters')

lmbda = lambda epoch: 0.98

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    
    if ep % 10 == 0:
        print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr())

8
22856  parameters
ep 0 tensor(2.2369, grad_fn=<DivBackward0>) [0.098]
ep 10 tensor(1.6109, grad_fn=<DivBackward0>) [0.08007313507497958]
ep 20 tensor(1.5600, grad_fn=<DivBackward0>) [0.06542558123199924]
ep 30 tensor(1.5326, grad_fn=<DivBackward0>) [0.053457463299478813]
ep 40 tensor(1.5139, grad_fn=<DivBackward0>) [0.04367863958719317]
ep 50 tensor(1.4989, grad_fn=<DivBackward0>) [0.03568862864853744]


In [None]:
print(split_loss(dev), len(dev))

idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 100).data[0].tolist()
    print(names.decode(o))