### Increase the corpus, read the paper

let's start reading more books

some stuff from reading the paper:

Was wondering what difference multi-head attention made to just having a bigger head at attention:

*Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this*

so apparently the softmax function is the problem and that's why the multihead attention is used

looking at optimizers - have been using straight SGD up to now - which seems good for small models, will look at using Adam

*5.3 Optimizer
 Weused the Adam optimizer [20] with β1 = 0.9, β2 = 0.98 and ϵ = 10−9. We varied the learning
 rate over the course of training, according to the formula:
 lrate = d−0.5
 model · min(step_num−0.5,step_num · warmup_steps−1.5)
 (3)
 This corresponds to increasing the learning rate linearly for the first warmup_steps training steps,
 and decreasing it thereafter proportionally to the inverse square root of the step number. We used
 warmup_steps = 4000.*


 look at adding Label Smoothing as per the paper

 *LabelSmoothing Duringtraining,weemployedlabelsmoothingofvalueϵls=0.1[36].This
 hurtsperplexity,asthemodellearnstobemoreunsure,butimprovesaccuracyandBLEUscore.*

 torch softmax has optional parameter: label_smoothing

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
%run bookreader.py

In [3]:
br = BookReader(False, r'[^\d+a-zA-Z \n?!:,]')
br.read("Middlemarch.txt", "The lifted veil.txt", "mill on the floss.txt", "brother jacob.txt")
vocab_size = br.vocab_size
vocab_size

68

In [4]:
# br = BookReader(False, r'[^\d+a-zA-Z \n?!:,]')
# br.read("Middlemarch.txt", "The lifted veil.txt", "mill on the floss.txt", "tiny_shakespeare.txt", "alice.txt")
# vocab_size = br.vocab_size
# vocab_size

### Get the batch with both x and y unseparated

In [5]:
def get_batch(data, batch_length=5, batch_size=5):
    # generate a small batch of data of inputs x and targets y
    ix = torch.randint(len(data) - batch_length, (batch_size,))
    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

In [6]:
train = torch.tensor(br.data[0])

### Create an attention head

In [7]:
class Head(nn.Module):

    def __init__(self, c, head_size, content_length):
        super().__init__()
        self.key = nn.Linear(c, head_size, bias=False)
        self.query = nn.Linear(c, head_size, bias=False)
        self.value = nn.Linear(c, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(content_length, content_length)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   
        q = self.query(x)
        
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        
        v = self.value(x) # (B,T,hs)
        out = wei @ v
        return out

In [8]:
from collections import OrderedDict

class FeedForward(nn.Module):
    def __init__(self, fan_in, multiplier = 4):
        super().__init__()

        layers = OrderedDict([
            ("l_in", nn.Linear(fan_in, multiplier * fan_in)),
            ("relu", nn.ReLU()),
            ("l_out", nn.Linear(multiplier * fan_in, fan_in)),
        ])
        self.net = nn.Sequential(
            layers
        )

        initial = layers['l_in']
        nn.init.kaiming_normal_(initial.weight, nonlinearity="relu")
        layers['l_in'].weight.data = initial.weight.data * 3/5
        if initial.bias is not None:
            nn.init.constant_(initial.bias, 0)

        final = layers['l_out']
        layers['l_out'].weight.data = final.weight.data * .2
        if final.bias is not None:
            nn.init.constant_(final.bias, 0)

    def forward(self, x):
        return self.net(x)

In [9]:
import torch.optim as optim

In [10]:
class MultiHead(nn.Module):
    def __init__(self, num_heads, head_size, embed_size, content_length):
        super().__init__()
        self.heads = nn.ModuleList([Head(embed_size, head_size, content_length) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, embed_size)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out

In [11]:
class AttentionBlock(nn.Module):
    def __init__(self, num_heads, head_size, embed_size, content_length, ff_mul):
        super().__init__()
        self.multihead = MultiHead(num_heads, head_size, embed_size, content_length)
        self.n1 = nn.LayerNorm(embed_size)
        self.ff = FeedForward(embed_size, ff_mul)
        self.n1 = nn.LayerNorm(embed_size)

    def forward(self, x):
        x_a = x.detach()
        x = self.multihead(x)
        x = x + x_a
        x = self.n1(x)
        x_b = x.detach()
        x = self.ff(x)
        x = x + x_b
        x = self.n1(x)

        return x

In [12]:
class FFMultiHeadAttention(nn.Module):

    def __init__(self, embed_size, content_length, num_heads, head_size, multiplier=4):
        super().__init__()
        
        self.vocab_embed = nn.Embedding(vocab_size, embed_size)
        self.positional_embed = nn.Embedding(content_length, embed_size)
        # self.mutli_attention = MultiHead(num_heads, head_size, embed_size, content_length)
        # self.lna = nn.LayerNorm(embed_size)
        # self.ff = FeedForward(embed_size, multiplier)
        # self.lnff = nn.LayerNorm(embed_size)
        self.atta = AttentionBlock(num_heads, head_size, embed_size, content_length, multiplier)
        self.attb = AttentionBlock(num_heads, head_size, embed_size, content_length, multiplier)
        self.decode = nn.Linear(embed_size, vocab_size)
        self.content_length = content_length

    def forward(self, idx, targets=None):
        #idx B,T
        B, T = idx.shape

        idx_e = self.vocab_embed(idx)
        # note tr is always the same - so the learning here is information passed back to the positional_embed from loss
        tr = torch.arange(T)
        pos_e = self.positional_embed(tr)

        x = idx_e + pos_e
        
        x = self.atta(x)
        x = self.attb(x)
        logits = self.decode(x)
        # print("decode out", x.shape)
        # print("targets", targets.shape)
        # return None, None

        if targets is None:
            loss = None
        else:
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits.view(B*T, -1), targets) #.resize(B*T)) # loss function

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.content_length:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [13]:
def get_val_batch(data, batch_length=5, batch_size=5, i=0):
    # generate a small batch of data of inputs x and targets y
    if i == 0:
        ix = torch.randint(len(data) - batch_length, (batch_size,))
    else:
        ix = torch.arange(1, 5) + 1 + i

    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

In [14]:
import math
@torch.no_grad()
def split_loss(split):
    split_len = len(split)
    total_loss = 0
    batch_size = 50
    num_batches = math.floor(split_len / batch_size)
    print("num_batches", split_len, num_batches)
    
    model.eval()

    for i in range(num_batches):

        t_b = get_val_batch(split, context_length+1, batch_size, i*batch_size)
        
        x = t_b[:, 0: context_length]
        y = t_b[:, context_length: context_length+1]
        
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, batch_loss = model(x, y)
        
        total_loss = total_loss + batch_loss
    
    print("total loss", total_loss, total_loss / num_batches)


Increased content_length fixed skip connnection bug in Block

In [15]:
epochs = 20
training_runs = 800
batch_size = 96
context_length = 24
learning_rate = .1
embedding_dimensions = 32
num_heads = 4
head_size = embedding_dimensions // num_heads

print(head_size)
# our embedding_dimensions are still 'small' so we mutliply the size our our feed forward network to make up
multiplier = 4
model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size, multiplier)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters()), ' parameters')

lmbda = lambda epoch: 0.98

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    
    if ep % 2 == 0:
        print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr())

8
30276  parameters
ep 0 tensor(2.5036, grad_fn=<DivBackward0>) [0.098]
ep 2 tensor(2.0582, grad_fn=<DivBackward0>) [0.0941192]
ep 4 tensor(1.9447, grad_fn=<DivBackward0>) [0.09039207968]
ep 6 tensor(1.8907, grad_fn=<DivBackward0>) [0.086812553324672]
ep 8 tensor(1.8606, grad_fn=<DivBackward0>) [0.08337477621301498]
ep 10 tensor(1.8366, grad_fn=<DivBackward0>) [0.08007313507497958]
ep 12 tensor(1.8189, grad_fn=<DivBackward0>) [0.07690223892601039]
ep 14 tensor(1.8033, grad_fn=<DivBackward0>) [0.07385691026454037]
ep 16 tensor(1.7879, grad_fn=<DivBackward0>) [0.07093217661806457]
ep 18 tensor(1.7777, grad_fn=<DivBackward0>) [0.06812326242398921]


Before fix performance

ep 18 tensor(1.8988, grad_fn=<DivBackward0>) [0.06812326242398921]

In [16]:
dev = torch.tensor(br.data[1])

In [17]:
print(split_loss(dev), len(dev))

idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 100).data[0].tolist()
    print(br.decode(o))

num_batches 296550 5931
total loss tensor(10516.9697) tensor(1.7732)
None 296550

I affer he uneliars no if at are,  on this me will s in a divo own  
 Deesal uddent of dight matings


In [18]:
idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 1000).data[0].tolist()
    print(br.decode(o))


 Then a esire by selvusionstirinable aggo, in kins presence of was  elsegsing of oppocience morre has boching eavide   

 Einr his felloth  soely me oust an any are, dulad worging an to whe to been that yge he stnase well aling of could goodicatine a und so her lught
pronrite obsher, and go minunt and telt
re
gere utsudderfor other got
ill had no uletty  seemalty?
sa
creave 
snd the not know he Fin well to and
on hinds the
favity, as pure him in eyo  a repaly playing a s from has but sheg will by vinfs if theade very aguety unhpusical at I challe of have ords at her he Comcureed thing coulded
yoir her nor been and abor
auid beyod with nentors of
that lagrent chream more only go wut me in the gaved,iece dhe takigness in accort day gin be going to the
the pay resirant ilefty band re
oung irlss of
coor, much with decluness when
I he
had Mr  F Lydgate, that Daston that shopple going nessieng be to that he would mope, to that posen a
builk them fres ruving that eviescame as soming would:  

In [19]:
e_epochs = 60

In [20]:
for ep in range(e_epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    
    if ep % 2 == 0:
        print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr())

ep 0 tensor(1.7695, grad_fn=<DivBackward0>) [0.06542558123199924]
ep 2 tensor(1.7617, grad_fn=<DivBackward0>) [0.06283472821521206]
ep 4 tensor(1.7555, grad_fn=<DivBackward0>) [0.06034647297788966]
ep 6 tensor(1.7487, grad_fn=<DivBackward0>) [0.05795675264796523]
ep 8 tensor(1.7412, grad_fn=<DivBackward0>) [0.055661665243105805]
ep 10 tensor(1.7372, grad_fn=<DivBackward0>) [0.053457463299478813]
ep 12 tensor(1.7308, grad_fn=<DivBackward0>) [0.05134054775281945]
ep 14 tensor(1.7243, grad_fn=<DivBackward0>) [0.0493074620618078]
ep 16 tensor(1.7203, grad_fn=<DivBackward0>) [0.04735488656416021]
ep 18 tensor(1.7161, grad_fn=<DivBackward0>) [0.04547963305621946]
ep 20 tensor(1.7135, grad_fn=<DivBackward0>) [0.04367863958719317]
ep 22 tensor(1.7067, grad_fn=<DivBackward0>) [0.04194896545954032]
ep 24 tensor(1.7054, grad_fn=<DivBackward0>) [0.04028778642734252]
ep 26 tensor(1.7014, grad_fn=<DivBackward0>) [0.038692390084819756]
ep 28 tensor(1.6983, grad_fn=<DivBackward0>) [0.03716017143746089

In [23]:
print(split_loss(dev), len(dev))

idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 300).data[0].tolist()
    print(br.decode(o))

num_batches 296550 5931
total loss tensor(9868.3203) tensor(1.6639)
None 296550

expect
condation  Ramould poncepty 
think once the mind he again!  said
My couldn aloned not a voin
in a
sightt  Ah, add Mr  Brob had
depoce for him at cress  I have this never the gettenous was give if Alvident, but
in away  and me is again, and inwally spicely besistainly time of leggep  expersenc


In [24]:
for ep in range(e_epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    
    if ep % 2 == 0:
        print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr())

ep 0 tensor(1.6629, grad_fn=<DivBackward0>) [0.019467587308039984]
ep 2 tensor(1.6606, grad_fn=<DivBackward0>) [0.0186966708506416]
ep 4 tensor(1.6591, grad_fn=<DivBackward0>) [0.017956282684956193]
ep 6 tensor(1.6594, grad_fn=<DivBackward0>) [0.017245213890631928]
ep 8 tensor(1.6570, grad_fn=<DivBackward0>) [0.016562303420562904]
ep 10 tensor(1.6560, grad_fn=<DivBackward0>) [0.01590643620510861]
ep 12 tensor(1.6558, grad_fn=<DivBackward0>) [0.015276541331386308]
ep 14 tensor(1.6522, grad_fn=<DivBackward0>) [0.01467159029466341]
ep 16 tensor(1.6525, grad_fn=<DivBackward0>) [0.014090595318994738]
ep 18 tensor(1.6531, grad_fn=<DivBackward0>) [0.013532607744362546]
ep 20 tensor(1.6511, grad_fn=<DivBackward0>) [0.012996716477685789]
ep 22 tensor(1.6490, grad_fn=<DivBackward0>) [0.01248204650516943]
ep 24 tensor(1.6510, grad_fn=<DivBackward0>) [0.011987757463564721]
ep 26 tensor(1.6473, grad_fn=<DivBackward0>) [0.011513042268007558]
ep 28 tensor(1.6470, grad_fn=<DivBackward0>) [0.0110571257

In [27]:
print(split_loss(dev), len(dev))

idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(2):
    o = model.generate(idx, 300).data[0].tolist()
    print(br.decode(o))

num_batches 296550 5931
total loss tensor(9643.2451) tensor(1.6259)
None 296550

line eady s hament herows  This own look mill to halve only asay same in Mrs Glegging aunting that men stagreons are would might in 

It s
Mr  Philip with libless,
thought with new
that  Nob: experses the morride a made dide  thole such her a seeat, and  My lench, even a remate converium
and that at

Dave any
knowinced and refuse because of perhenself Mr Tullibably annisgriven with heash, to the dorse of you
will boid Found
idracted ove sort feegalf entflarce a so lost at the bat seed to the Cerfaction for subjust her    Co chare caused
other preprest you contry obscious, cases,  other man, but,


In [32]:
print(split_loss(dev), len(dev))

idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(2):
    o = model.generate(idx, 300).data[0].tolist()
    print(br.decode(o))

num_batches 339325 6786
total loss tensor(11422.1123) tensor(1.6832)
None 339325

Mr  Brooke a wore I know, now wness sot could be before and a ade a rear was sir like, you as seefually
soxpeactage   any undering was solition bhcs, I go  and Lydgate, wish, a suppition, Perfuiet in did if found him? 
 Yes,
it use looke,  when Maggie posted, Bne with
hard of expose to Pippe of ext 

vising that she had do,  said Cadpositates, as a teat he forkness away other that solk havet,  and sportne of obly into you like their
 mall, uncorners if prepabacument at you at he saw Pianted ought, those found 
Maggie, defivyd looking held awaind main foliced a man? 
  and hard of idiofe  But an 


In [25]:
torch.save(model, "elliot_checkpoint")

In [26]:
for ep in range(e_epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    
    if ep % 2 == 0:
        print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr())

ep 0 tensor(1.6380, grad_fn=<DivBackward0>) [0.005792641784140534]
ep 2 tensor(1.6350, grad_fn=<DivBackward0>) [0.005563253169488569]
ep 4 tensor(1.6360, grad_fn=<DivBackward0>) [0.005342948343976821]
ep 6 tensor(1.6346, grad_fn=<DivBackward0>) [0.005131367589555339]
ep 8 tensor(1.6355, grad_fn=<DivBackward0>) [0.004928165433008947]
ep 10 tensor(1.6322, grad_fn=<DivBackward0>) [0.004733010081861793]
ep 12 tensor(1.6329, grad_fn=<DivBackward0>) [0.004545582882620065]
ep 14 tensor(1.6322, grad_fn=<DivBackward0>) [0.00436557780046831]
ep 16 tensor(1.6327, grad_fn=<DivBackward0>) [0.004192700919569765]
ep 18 tensor(1.6321, grad_fn=<DivBackward0>) [0.004026669963154802]
ep 20 tensor(1.6311, grad_fn=<DivBackward0>) [0.003867213832613871]
ep 22 tensor(1.6311, grad_fn=<DivBackward0>) [0.0037140721648423617]
ep 24 tensor(1.6306, grad_fn=<DivBackward0>) [0.003566994907114604]
ep 26 tensor(1.6293, grad_fn=<DivBackward0>) [0.0034257419087928656]
ep 28 tensor(1.6286, grad_fn=<DivBackward0>) [0.0032

In [28]:
print(split_loss(dev), len(dev))

idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(2):
    o = model.generate(idx, 300).data[0].tolist()
    print(br.decode(o))

num_batches 296550 5931
total loss tensor(9646.5020) tensor(1.6265)
None 296550

reparent  But twe Middle, and think and
Charth saterful, with who
have ammemoa vincy get, and tricise  There like hurtain iddred with
 Stip  

Hasy than might about in the think and with had but to lious
obliging, difficultic and
as knelices: a come to on the highint  get yet by decisardly  He
thoug

The sure all gaver
poy help angitiong, an
opeve fellow, been to tell childing suck  I m to to other laing bit no
had triet I
corlong taken the seemed and Maggie for hards by seady worth, retainess, busins lafe his welt
arding
and    said might 
shem colles strangus you seen can in that ispose sight,
