### Add an attention head to our sequence

with what we've learnt

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
%run bookreader.py

In [3]:
names = BookReader("names.txt")
vocab_size = names.vocab_size
vocab_size

27

### Get the batch with both x and y unseparated

In [5]:
def get_batch(data, batch_length=5, batch_size=5):
    # generate a small batch of data of inputs x and targets y
    ix = torch.randint(len(data) - batch_length, (batch_size,))
    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

### Create an attention head

In [7]:
head_size = 7
channels = 3
context_length = 8

tril = torch.tril(torch.ones(context_length, context_length))

key = nn.Linear(channels, head_size, bias=False)
query = nn.Linear(channels, head_size, bias=False)
## add in 
value = nn.Linear(channels, head_size, bias=False)

In [11]:
batch_size = 4
x = torch.randn(batch_size, context_length, channels)

k = key(x)
# k = B, T, head_size  - 4, 8, 7
q = query(x)
# k = B, T, head_size

#the dot product tells us how much our query matches our key
kq_match = k @ q.transpose(-2,-1) * head_size**-0.5

In [14]:
masked_affin = kq_match.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(masked_affin, 1)

print(wei[0])

wei[0]

v = value(x)
out = wei @ v
out[0], out.shape

tensor([[0.1143, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0508, 0.1228, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1237, 0.1388, 0.1592, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1940, 0.1616, 0.1235, 0.1688, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1224, 0.1298, 0.1263, 0.2108, 0.2603, 0.0000, 0.0000, 0.0000],
        [0.0857, 0.1490, 0.2276, 0.2128, 0.2024, 0.3855, 0.0000, 0.0000],
        [0.0624, 0.1271, 0.2544, 0.2526, 0.1743, 0.4549, 0.7471, 0.0000],
        [0.2466, 0.1708, 0.1090, 0.1550, 0.3630, 0.1596, 0.2529, 1.0000]],
       grad_fn=<SelectBackward0>)


(tensor([[-0.0620, -0.1640,  0.0303,  0.1762, -0.0439, -0.0426, -0.1908],
         [-0.1111, -0.1128,  0.1580,  0.0957, -0.1416, -0.0184, -0.0864],
         [-0.1647, -0.1870,  0.2002,  0.2375, -0.2416, -0.0529, -0.2055],
         [-0.1197, -0.2294,  0.0875,  0.2996, -0.1501, -0.0720, -0.2907],
         [-0.0813, -0.2819, -0.0177,  0.3719, -0.0662, -0.0935, -0.3861],
         [-0.0538,  0.1168,  0.1120,  0.0348, -0.2339, -0.0070,  0.0639],
         [-0.3949,  0.4814,  0.8636, -0.0927, -1.1537,  0.0452,  0.4958],
         [ 0.4675,  0.1928, -0.9063,  0.1883,  0.5611, -0.0801, -0.2070]],
        grad_fn=<SelectBackward0>),
 torch.Size([4, 8, 7]))

In [4]:
class Head(nn.Module):

    def __init__(self, c, head_size, content_length):
        super().__init__()
        self.key = nn.Linear(c, head_size, bias=False)
        self.query = nn.Linear(c, head_size, bias=False)
        self.value = nn.Linear(c, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(content_length, content_length)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   
        q = self.query(x)
        
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        
        v = self.value(x) # (B,T,hs)
        out = wei @ v
        return out

### For a start we're going to feed a sequential model

ignore the positional encoding for a start so we can see what impact it has

In [5]:
train = torch.tensor(names.data[0])

def get_batch(data, batch_length=5, batch_size=5):
    # generate a small batch of data of inputs x and targets y
    ix = torch.randint(len(data) - batch_length, (batch_size,))
    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

In [62]:
from collections import OrderedDict

nonlin = 'relu'

def create_model(embed_size, content_length):

    seq = OrderedDict([
        ('embed', nn.Embedding(vocab_size, embed_size)),
        ('attention', Head(embed_size, head_size)),
        ('decode', nn.Linear(head_size, vocab_size)),
    ])

    md = nn.Sequential(seq)
    
    return md

### First run through sequence

just like we had it before

In [63]:
epochs = 20
training_runs = 1000
batch_size = 24
context_length = 4
learning_rate = .1
embedding_dimensions = 9

model = create_model(embedding_dimensions, context_length)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # , weight_decay=args.weight_decay, betas=(0.9, 0.99), eps=1e-8)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits = model.forward(x)
        
        Y = y.reshape(-1)
        loss = F.cross_entropy(logits.view(-1, vocab_size), Y) # loss function

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(epoch_loss/training_runs)
    

tensor(2.7617, grad_fn=<DivBackward0>)
tensor(2.5831, grad_fn=<DivBackward0>)
tensor(2.5330, grad_fn=<DivBackward0>)
tensor(2.5134, grad_fn=<DivBackward0>)
tensor(2.5002, grad_fn=<DivBackward0>)
tensor(2.4917, grad_fn=<DivBackward0>)
tensor(2.4869, grad_fn=<DivBackward0>)
tensor(2.4772, grad_fn=<DivBackward0>)
tensor(2.4701, grad_fn=<DivBackward0>)
tensor(2.4711, grad_fn=<DivBackward0>)
tensor(2.4674, grad_fn=<DivBackward0>)
tensor(2.4644, grad_fn=<DivBackward0>)
tensor(2.4624, grad_fn=<DivBackward0>)
tensor(2.4635, grad_fn=<DivBackward0>)
tensor(2.4667, grad_fn=<DivBackward0>)
tensor(2.4567, grad_fn=<DivBackward0>)
tensor(2.4563, grad_fn=<DivBackward0>)
tensor(2.4610, grad_fn=<DivBackward0>)
tensor(2.4535, grad_fn=<DivBackward0>)
tensor(2.4544, grad_fn=<DivBackward0>)


## OK it's doing something, what?

now we've got loss that looks similar to our bigram model 

and a thing to note is that the model at the moment is linear - just a big ole linear model

we're going to add in the position embedding now but the model will still be a linear one

In [6]:
class Attention(nn.Module):

    def __init__(self, embed_size, content_length):
        super().__init__()
        
        self.vocab_embed = nn.Embedding(vocab_size, embed_size)
        self.positional_embed = nn.Embedding(content_length, embed_size)
        self.attention = Head(embed_size, head_size, content_length)
        self.decode = nn.Linear(head_size, vocab_size)

    def forward(self, idx, targets):
        #idx B,T
        B, T = idx.shape

        idx_e = self.vocab_embed(idx)
        # note tr is always the same - so the learning here is information passed back to the positional_embed from loss
        tr = torch.arange(T)
        pos_e = self.positional_embed(tr)

        x = idx_e + pos_e
        x = self.attention(x)
        logits = self.decode(x)

        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits.view(B*T, -1), targets.resize(B*T)) # loss function

        return logits, loss

In [73]:
epochs = 20
training_runs = 1000
batch_size = 24
context_length = 4
learning_rate = .1
embedding_dimensions = 9

model = Attention(embedding_dimensions, context_length)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # , weight_decay=args.weight_decay, betas=(0.9, 0.99), eps=1e-8)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(epoch_loss/training_runs)
    

tensor(2.7402, grad_fn=<DivBackward0>)
tensor(2.5816, grad_fn=<DivBackward0>)
tensor(2.5323, grad_fn=<DivBackward0>)
tensor(2.5098, grad_fn=<DivBackward0>)
tensor(2.5020, grad_fn=<DivBackward0>)
tensor(2.4873, grad_fn=<DivBackward0>)
tensor(2.4857, grad_fn=<DivBackward0>)
tensor(2.4797, grad_fn=<DivBackward0>)
tensor(2.4760, grad_fn=<DivBackward0>)
tensor(2.4748, grad_fn=<DivBackward0>)
tensor(2.4680, grad_fn=<DivBackward0>)
tensor(2.4647, grad_fn=<DivBackward0>)
tensor(2.4637, grad_fn=<DivBackward0>)
tensor(2.4602, grad_fn=<DivBackward0>)
tensor(2.4598, grad_fn=<DivBackward0>)
tensor(2.4551, grad_fn=<DivBackward0>)
tensor(2.4603, grad_fn=<DivBackward0>)
tensor(2.4542, grad_fn=<DivBackward0>)
tensor(2.4511, grad_fn=<DivBackward0>)
tensor(2.4558, grad_fn=<DivBackward0>)


### Hum we'll need something more...

we're still just learning a linear approximation of names - and the positional_encoding hasn't helped

attention has created a kind of lookup table but that's all

(lets come back and see if we can analyse *what* it's looking up later)

for a start lets add some non-linearity to all this

In [28]:
from collections import OrderedDict

class FeedForward(nn.Module):
    def __init__(self, fan_in, multiplier = 4):
        super().__init__()

        layers = OrderedDict([
            ("l_in", nn.Linear(fan_in, multiplier * fan_in)),
            ("relu", nn.ReLU()),
            ("l_out", nn.Linear(multiplier * fan_in, fan_in)),
        ])
        self.net = nn.Sequential(
            layers
        )

        initial = layers['l_in']
        nn.init.kaiming_normal_(initial.weight, nonlinearity="relu")
        layers['l_in'].weight.data = initial.weight.data * 3/5
        if initial.bias is not None:
            nn.init.constant_(initial.bias, 0)

        final = layers['l_out']
        layers['l_out'].weight.data = final.weight.data * .2
        if final.bias is not None:
            nn.init.constant_(final.bias, 0)

    def forward(self, x):
        return self.net(x)

In [239]:
epochs = 1
training_runs = 1000
batch_size = 48
context_length = 4
learning_rate = .2
head_size = 8
embedding_dimensions = 9

model = FFAttention(embedding_dimensions, head_size, context_length)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # , weight_decay=args.weight_decay, betas=(0.9, 0.99), eps=1e-8)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(epoch_loss/training_runs)
    

attention out torch.Size([48, 4, 8])
feed forward out torch.Size([48, 4, 8])
logits out torch.Size([48, 4, 8])
targets torch.Size([48, 4])


TypeError: cannot unpack non-iterable NoneType object

In [31]:
class FFAttention(nn.Module):

    def __init__(self, embed_size, head_size, content_length):
        super().__init__()
        
        self.vocab_embed = nn.Embedding(vocab_size, embed_size)
        self.positional_embed = nn.Embedding(content_length, embed_size)
        self.attention = Head(embed_size, head_size, content_length)
        self.ff = FeedForward(head_size)
        self.decode = nn.Linear(head_size, vocab_size)
        self.content_length = content_length

    def forward(self, idx, targets=None):
        #idx B,T
        B, T = idx.shape

        idx_e = self.vocab_embed(idx)
        # note tr is always the same - so the learning here is information passed back to the positional_embed from loss
        tr = torch.arange(T)
        pos_e = self.positional_embed(tr)

        x = idx_e + pos_e
        x = self.attention(x)
        x = self.ff(x)
        logits = self.decode(x)

        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits.view(B*T, -1), targets.resize(B*T)) # loss function

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.content_length:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [221]:
epochs = 50
training_runs = 1000
batch_size = 48
context_length = 4
learning_rate = .2
embedding_dimensions = 9

model = FFAttention(embedding_dimensions, head_size, context_length)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # , weight_decay=args.weight_decay, betas=(0.9, 0.99), eps=1e-8)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(epoch_loss/training_runs)
    

tensor(2.6730, grad_fn=<DivBackward0>)
tensor(2.4969, grad_fn=<DivBackward0>)
tensor(2.4687, grad_fn=<DivBackward0>)
tensor(2.4532, grad_fn=<DivBackward0>)
tensor(2.4413, grad_fn=<DivBackward0>)
tensor(2.4375, grad_fn=<DivBackward0>)
tensor(2.4308, grad_fn=<DivBackward0>)
tensor(2.4313, grad_fn=<DivBackward0>)
tensor(2.4242, grad_fn=<DivBackward0>)
tensor(2.4196, grad_fn=<DivBackward0>)
tensor(2.4154, grad_fn=<DivBackward0>)
tensor(2.4107, grad_fn=<DivBackward0>)
tensor(2.4114, grad_fn=<DivBackward0>)
tensor(2.4085, grad_fn=<DivBackward0>)
tensor(2.4060, grad_fn=<DivBackward0>)
tensor(2.4025, grad_fn=<DivBackward0>)
tensor(2.4023, grad_fn=<DivBackward0>)
tensor(2.3965, grad_fn=<DivBackward0>)
tensor(2.3984, grad_fn=<DivBackward0>)
tensor(2.3961, grad_fn=<DivBackward0>)
tensor(2.3971, grad_fn=<DivBackward0>)
tensor(2.3921, grad_fn=<DivBackward0>)
tensor(2.3880, grad_fn=<DivBackward0>)
tensor(2.3906, grad_fn=<DivBackward0>)
tensor(2.3877, grad_fn=<DivBackward0>)
tensor(2.3841, grad_fn=<D

In [222]:
idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 100).data[0].tolist()
    print(names.decode(o))


maizzylin
kari
kendowero
denli
drala
kelyn
ali
lajyde
adce
alyarieztyna
trash
trabrannge
lakidyanty



## OK we're back to 'learning'

rather than remembering

lets add more heads and look at things

In [304]:
b, t, hs = 5, 4, 8
num_heads = 2
h1 = torch.zeros(b, t, hs)
h2 = torch.ones(b, t, hs)
cat_out = torch.cat([h1, h2], dim=-1)
print(cat_out.shape)
print(cat_out[0])
prog = torch.randn(16, 8)
prog_out = cat_out @ prog
prog_out.shape

torch.Size([5, 4, 16])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.]])


torch.Size([5, 4, 8])

In [32]:
class MultiHead(nn.Module):

    def __init__(self, num_heads, head_size, embed_size, content_length):
        super().__init__()
        self.heads = nn.ModuleList([Head(embed_size, head_size, content_length) for _ in range(num_heads)])

    def forward(self, x):
        out = torch.cat( [head(x) for head in self.heads], dim = -1 )
        return out
    

In [35]:
class FFMultiHeadAttention(nn.Module):

    def __init__(self, embed_size, content_length, num_heads, head_size, multiplier=4):
        super().__init__()
        
        self.vocab_embed = nn.Embedding(vocab_size, embed_size)
        self.positional_embed = nn.Embedding(content_length, embed_size)
        self.mutli_attention = MultiHead(num_heads, head_size, embed_size, content_length)
        self.lna = nn.LayerNorm(embed_size)
        self.ff = FeedForward(embed_size, multiplier)
        self.lnff = nn.LayerNorm(embed_size)
        self.decode = nn.Linear(embed_size, vocab_size)
        self.content_length = content_length

    def forward(self, idx, targets=None):
        #idx B,T
        B, T = idx.shape

        idx_e = self.vocab_embed(idx)
        # note tr is always the same - so the learning here is information passed back to the positional_embed from loss
        tr = torch.arange(T)
        pos_e = self.positional_embed(tr)

        x = idx_e + pos_e
        x = self.mutli_attention(x)
        # print("multi ball out", x.shape)
        x = self.lna(x)
        x = self.ff(x)
        x = self.lnff(x)
        # print("feed forward out", x.shape)
        logits = self.decode(x)
        # print("decode out", x.shape)
        # print("targets", targets.shape)
        # return None, None

        if targets is None:
            loss = None
        else:
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits.view(B*T, -1), targets) #.resize(B*T)) # loss function

        return logits, loss

    # we're restorspectively adding generate because we're getting genuinely better results here
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.content_length:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [10]:
import torch.optim as optim

In [303]:
epochs = 50
training_runs = 1000
batch_size = 200
context_length = 4
learning_rate = .1
embedding_dimensions = 16
num_heads = 2
head_size = embedding_dimensions // num_heads

print(head_size)

model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

lmbda = lambda epoch: 0.95

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    print(ep, epoch_loss/training_runs, m_scheduler.get_last_lr())
    

8
tensor(2.4375, grad_fn=<DivBackward0>)
tensor(2.3440, grad_fn=<DivBackward0>)
tensor(2.3268, grad_fn=<DivBackward0>)
tensor(2.3157, grad_fn=<DivBackward0>)
tensor(2.3104, grad_fn=<DivBackward0>)
tensor(2.3035, grad_fn=<DivBackward0>)
tensor(2.2976, grad_fn=<DivBackward0>)
tensor(2.2959, grad_fn=<DivBackward0>)
tensor(2.2940, grad_fn=<DivBackward0>)
tensor(2.2900, grad_fn=<DivBackward0>)
tensor(2.2881, grad_fn=<DivBackward0>)
tensor(2.2883, grad_fn=<DivBackward0>)
tensor(2.2869, grad_fn=<DivBackward0>)
tensor(2.2829, grad_fn=<DivBackward0>)
tensor(2.2812, grad_fn=<DivBackward0>)
tensor(2.2808, grad_fn=<DivBackward0>)
tensor(2.2790, grad_fn=<DivBackward0>)
tensor(2.2778, grad_fn=<DivBackward0>)
tensor(2.2763, grad_fn=<DivBackward0>)
tensor(2.2764, grad_fn=<DivBackward0>)
tensor(2.2753, grad_fn=<DivBackward0>)
tensor(2.2764, grad_fn=<DivBackward0>)
tensor(2.2728, grad_fn=<DivBackward0>)
tensor(2.2717, grad_fn=<DivBackward0>)
tensor(2.2738, grad_fn=<DivBackward0>)
tensor(2.2703, grad_fn=

In [11]:
def get_val_batch(data, batch_length=5, batch_size=5, i=0):
    # generate a small batch of data of inputs x and targets y
    if i == 0:
        ix = torch.randint(len(data) - batch_length, (batch_size,))
    else:
        ix = torch.arange(1, 5) + 1 + i

    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

print(get_val_batch(train, 5, 4, 5))
print(get_val_batch(train, 5, 4, 9))

get_batch(train, 5, 4)

tensor([[ 9, 22,  9,  1,  0],
        [22,  9,  1,  0,  1],
        [ 9,  1,  0,  1, 22],
        [ 1,  0,  1, 22,  1]])
tensor([[ 0,  1, 22,  1,  0],
        [ 1, 22,  1,  0,  9],
        [22,  1,  0,  9, 19],
        [ 1,  0,  9, 19,  1]])


tensor([[ 5, 25,  1,  0, 20],
        [ 5, 12,  5, 14,  0],
        [12, 12,  1,  0,  1],
        [ 3,  1,  9,  1,  8]])

In [48]:
import math
@torch.no_grad()
def split_loss(split):
    split_len = len(split)
    total_loss = 0
    batch_size = 50
    num_batches = math.floor(split_len / batch_size)
    print("num_batches", split_len, num_batches)
    
    model.eval()

    for i in range(num_batches):

        t_b = get_val_batch(split, context_length+1, batch_size, i*batch_size)
        
        x = t_b[:, 0: context_length]
        y = t_b[:, context_length: context_length+1]
        
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, batch_loss = model(x, y)
        
        total_loss = total_loss + batch_loss
    
    print("total loss", total_loss, total_loss / num_batches)

dev = torch.tensor(names.data[1])
print(split_loss(dev), len(dev))

num_batches 22225 444
total loss tensor(915.9062) tensor(2.0629)
None 22225


In [45]:
idx = torch.zeros((1, 1), dtype=torch.int)
for i in range(1):
    o = model.generate(idx, 100).data[0].tolist()
    print(names.decode(o))


lilano
sah
taciel
kyzariah
keidy
kimphiabellar
aaoluwey
aliah
nivaena
sutuel
reyer
cordence
jazelene


In [None]:
epochs = 0
training_runs = 1000
batch_size = 200
context_length = 6
learning_rate = .1
embedding_dimensions = 8
num_heads = 4
head_size = embedding_dimensions // num_heads

print(head_size)

model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters()), 'M parameters')

lmbda = lambda epoch: 0.95

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr())

2
0.001267 M parameters
tensor(2.5082, grad_fn=<DivBackward0>)
tensor(2.3846, grad_fn=<DivBackward0>)
tensor(2.3578, grad_fn=<DivBackward0>)
tensor(2.3411, grad_fn=<DivBackward0>)
tensor(2.3345, grad_fn=<DivBackward0>)
tensor(2.3305, grad_fn=<DivBackward0>)
tensor(2.3254, grad_fn=<DivBackward0>)
tensor(2.3223, grad_fn=<DivBackward0>)
tensor(2.3193, grad_fn=<DivBackward0>)
tensor(2.3150, grad_fn=<DivBackward0>)
tensor(2.3152, grad_fn=<DivBackward0>)
tensor(2.3125, grad_fn=<DivBackward0>)
tensor(2.3139, grad_fn=<DivBackward0>)
tensor(2.3111, grad_fn=<DivBackward0>)
tensor(2.3095, grad_fn=<DivBackward0>)
tensor(2.3090, grad_fn=<DivBackward0>)
tensor(2.3089, grad_fn=<DivBackward0>)
tensor(2.3073, grad_fn=<DivBackward0>)
tensor(2.3068, grad_fn=<DivBackward0>)
tensor(2.3050, grad_fn=<DivBackward0>)
tensor(2.3051, grad_fn=<DivBackward0>)
tensor(2.3022, grad_fn=<DivBackward0>)
tensor(2.3015, grad_fn=<DivBackward0>)
tensor(2.3031, grad_fn=<DivBackward0>)
tensor(2.3016, grad_fn=<DivBackward0>)
t

In [13]:
epochs = 10
training_runs = 2000
batch_size = 200
context_length = 6
learning_rate = .1
embedding_dimensions = 16
num_heads = 4
head_size = embedding_dimensions // num_heads

print(head_size)

model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters()), ' parameters')

lmbda = lambda epoch: 0.95

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr()) 

4
3947 M parameters




ep 0 tensor(2.3664, grad_fn=<DivBackward0>) [0.095]
ep 1 tensor(2.2730, grad_fn=<DivBackward0>) [0.09025]
ep 2 tensor(2.2543, grad_fn=<DivBackward0>) [0.0857375]
ep 3 tensor(2.2441, grad_fn=<DivBackward0>) [0.08145062499999998]
ep 4 tensor(2.2383, grad_fn=<DivBackward0>) [0.07737809374999999]
ep 5 tensor(2.2351, grad_fn=<DivBackward0>) [0.07350918906249998]
ep 6 tensor(2.2315, grad_fn=<DivBackward0>) [0.06983372960937498]
ep 7 tensor(2.2290, grad_fn=<DivBackward0>) [0.06634204312890622]
ep 8 tensor(2.2263, grad_fn=<DivBackward0>) [0.0630249409724609]
ep 9 tensor(2.2232, grad_fn=<DivBackward0>) [0.05987369392383786]


In [16]:
# allow mor loops without restarting
e_epochs = 10

In [37]:
epochs = 100
training_runs = 1800
batch_size = 80
context_length = 6
learning_rate = .1
embedding_dimensions = 16
num_heads = 4
head_size = embedding_dimensions // num_heads

print(head_size)
# our embedding_dimensions are still 'small' so we mutliply the size our our feed forward network to make up
multiplier = 8
model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size, multiplier)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters()), ' parameters')

lmbda = lambda epoch: 0.98

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr()) 

4
6059  parameters
ep 0 tensor(2.4389, grad_fn=<DivBackward0>) [0.098]
ep 1 tensor(2.3100, grad_fn=<DivBackward0>) [0.09604]
ep 2 tensor(2.2768, grad_fn=<DivBackward0>) [0.0941192]
ep 3 tensor(2.2598, grad_fn=<DivBackward0>) [0.092236816]
ep 4 tensor(2.2490, grad_fn=<DivBackward0>) [0.09039207968]
ep 5 tensor(2.2433, grad_fn=<DivBackward0>) [0.0885842380864]
ep 6 tensor(2.2362, grad_fn=<DivBackward0>) [0.086812553324672]
ep 7 tensor(2.2306, grad_fn=<DivBackward0>) [0.08507630225817855]
ep 8 tensor(2.2280, grad_fn=<DivBackward0>) [0.08337477621301498]
ep 9 tensor(2.2274, grad_fn=<DivBackward0>) [0.08170728068875467]
ep 10 tensor(2.2233, grad_fn=<DivBackward0>) [0.08007313507497958]
ep 11 tensor(2.2201, grad_fn=<DivBackward0>) [0.07847167237347999]
ep 12 tensor(2.2157, grad_fn=<DivBackward0>) [0.07690223892601039]
ep 13 tensor(2.2145, grad_fn=<DivBackward0>) [0.07536419414749018]
ep 14 tensor(2.2133, grad_fn=<DivBackward0>) [0.07385691026454037]
ep 15 tensor(2.2122, grad_fn=<DivBackward0

In [42]:
epochs = 180
training_runs = 1800
batch_size = 80
context_length = 6
learning_rate = .1
embedding_dimensions = 32
num_heads = 4
head_size = embedding_dimensions // num_heads

print(head_size)
# our embedding_dimensions are still 'small' so we mutliply the size our our feed forward network to make up
multiplier = 8
model = FFMultiHeadAttention(embedding_dimensions, context_length, num_heads, head_size, multiplier)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

print(sum(p.numel() for p in model.parameters()), ' parameters')

lmbda = lambda epoch: 0.98

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, 1:context_length+1]

        logits, loss = model(x, y)

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    print("ep", ep, epoch_loss/training_runs, m_scheduler.get_last_lr()) 

8
21819  parameters
ep 0 tensor(2.3593, grad_fn=<DivBackward0>) [0.098]
ep 1 tensor(2.2435, grad_fn=<DivBackward0>) [0.09604]
ep 2 tensor(2.2128, grad_fn=<DivBackward0>) [0.0941192]
ep 3 tensor(2.1968, grad_fn=<DivBackward0>) [0.092236816]
ep 4 tensor(2.1837, grad_fn=<DivBackward0>) [0.09039207968]
ep 5 tensor(2.1722, grad_fn=<DivBackward0>) [0.0885842380864]
ep 6 tensor(2.1667, grad_fn=<DivBackward0>) [0.086812553324672]
ep 7 tensor(2.1582, grad_fn=<DivBackward0>) [0.08507630225817855]
ep 8 tensor(2.1544, grad_fn=<DivBackward0>) [0.08337477621301498]
ep 9 tensor(2.1489, grad_fn=<DivBackward0>) [0.08170728068875467]
ep 10 tensor(2.1454, grad_fn=<DivBackward0>) [0.08007313507497958]
ep 11 tensor(2.1416, grad_fn=<DivBackward0>) [0.07847167237347999]
ep 12 tensor(2.1395, grad_fn=<DivBackward0>) [0.07690223892601039]
ep 13 tensor(2.1354, grad_fn=<DivBackward0>) [0.07536419414749018]
ep 14 tensor(2.1346, grad_fn=<DivBackward0>) [0.07385691026454037]
ep 15 tensor(2.1309, grad_fn=<DivBackward