### Go back though embedding and onto sequence

with what we've learnt

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
%run bookreader.py

In [3]:
names = BookReader("names.txt")
vocab_size = names.vocab_size
vocab_size

27

### Get the batch with both x and y unseparated

In [4]:
def get_batch(data, batch_length=5, batch_size=5):
    # generate a small batch of data of inputs x and targets y
    ix = torch.randint(len(data) - batch_length, (batch_size,))
    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

### OK so bring back embeddings

now we'll have full batch BTC or however you want to name the dimensions

and then we're going to add another dimension because eventually we'll be creating multiple examples from each example

In [5]:
embedding_dimensions = 2
em = nn.Embedding(vocab_size, embedding_dimensions)

train = torch.tensor(names.data[0])

context_length = 4
y_l = 1
batch_size = 3
t_b = get_batch(train, context_length+y_l, batch_size)

x = t_b[:, 0:context_length]
y = t_b[:, 1:context_length+1]

x = x.repeat(1, context_length)
tr = torch.tril(torch.ones((batch_size, context_length, context_length), dtype=torch.int))

multi = x.view(batch_size, context_length, context_length) * tr

embedded_batch = em(multi)
print(embedded_batch.shape, embedded_batch[0])
y[0]

torch.Size([3, 4, 4, 2]) tensor([[[ 1.2590, -0.4949],
         [ 1.2590, -0.4949],
         [ 1.2590, -0.4949],
         [ 1.2590, -0.4949]],

        [[ 1.2590, -0.4949],
         [ 2.4729,  0.1094],
         [ 1.2590, -0.4949],
         [ 1.2590, -0.4949]],

        [[ 1.2590, -0.4949],
         [ 2.4729,  0.1094],
         [ 1.0872,  0.9255],
         [ 1.2590, -0.4949]],

        [[ 1.2590, -0.4949],
         [ 2.4729,  0.1094],
         [ 1.0872,  0.9255],
         [-0.2380, -0.4213]]], grad_fn=<SelectBackward0>)


tensor([11,  8,  1, 12])

## our added dimension
its really just increasing the batch size, so the batch is now batch_size * context_length

so our B, T, C is right now - and B = batch_size * context_length which match our Y which is also = batch_size * context_length

In [7]:
large_batch = embedded_batch.view(batch_size * context_length, context_length, embedding_dimensions)
large_batch.shape

torch.Size([12, 4, 2])

### and we're going to feed a sequential model

we want this to be available in our model, or a layer, but for now:

In [8]:
def triling(xs, batch_size, context_length):
    tril = torch.tril(torch.ones((batch_size, context_length, context_length), dtype=torch.int))
    x = xs.repeat(1, 1, context_length)
    print("x repeat", x.shape)
    x = x.view(-1, context_length, context_length)
    x = x * tril
    print("x to multiple", x.shape)
    return x.view(-1, context_length)

### lets run through what we're doing first

we're going to embed our batch - but this messes up our dimensions on output

we need to add a flatten layer to the model to deal with this

In [12]:
batch_size = 3
context_length = 4
embedding_dimensions = 2

es = nn.Embedding(vocab_size, embedding_dimensions)
fl = nn.Flatten(1)
t_b = get_batch(train, context_length+1, batch_size)

l1 = nn.Linear(embedding_dimensions * context_length, 50, bias=True)

x = t_b[:, 0:context_length]
x = triling(x, batch_size, context_length)
print("examples from triling", x.shape)

embedded = es(x)
## this is what we did in the sequence layer
print(embedded.view(-1, embedding_dimensions * context_length).shape)
# flatten will do the same in our model
fl(embedded).shape

x repeat torch.Size([1, 3, 16])
x to multiple torch.Size([3, 4, 4])
examples from triling torch.Size([12, 4])
torch.Size([12, 8])


torch.Size([12, 8])

In [15]:
from collections import OrderedDict

nonlin = 'relu'

def create_model(context_length, hidden_size, embed_size):

    seq = OrderedDict([
        ('embed', nn.Embedding(vocab_size, embed_size)),
        ('flatten', nn.Flatten(1)),
        ('initial', nn.Linear(embed_size * context_length, hidden_size, bias=True)),
        ('relu1', nn.ReLU()),
        ('final', nn.Linear(hidden_size, vocab_size, bias=True)),
    ])

    initial = seq['initial']
    nn.init.kaiming_normal_(initial.weight, nonlinearity=nonlin)
    seq['initial'].weight.data = initial.weight.data * 3/5
    if initial.bias is not None:
        nn.init.constant_(initial.bias, 0)

    final = seq['final']
    seq['final'].weight.data = final.weight.data * 0.2
    md = nn.Sequential(seq)
    
    return md

### First run through sequence

just like we had it before without the extra batch stuff

In [16]:
epochs = 20
training_runs = 1000
batch_size = 820
context_length = 4
learning_rate = .1
embedding_dimensions = 2

model = create_model(4, 60, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # , weight_decay=args.weight_decay, betas=(0.9, 0.99), eps=1e-8)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]

        logits = model.forward(x)
        # cross_entropy wants to know the number of classes - in our case vocab_size
        logits = logits.view(-1, vocab_size)

        y = t_b[:, context_length:context_length+1]
        Y = y.reshape(-1)

        loss = F.cross_entropy(logits, Y) # loss function

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(epoch_loss/training_runs)
    

tensor(2.6684, grad_fn=<DivBackward0>)
tensor(2.4743, grad_fn=<DivBackward0>)
tensor(2.4072, grad_fn=<DivBackward0>)
tensor(2.3702, grad_fn=<DivBackward0>)
tensor(2.3481, grad_fn=<DivBackward0>)
tensor(2.3291, grad_fn=<DivBackward0>)
tensor(2.3186, grad_fn=<DivBackward0>)
tensor(2.3046, grad_fn=<DivBackward0>)
tensor(2.2977, grad_fn=<DivBackward0>)
tensor(2.2921, grad_fn=<DivBackward0>)
tensor(2.2856, grad_fn=<DivBackward0>)
tensor(2.2808, grad_fn=<DivBackward0>)
tensor(2.2759, grad_fn=<DivBackward0>)
tensor(2.2713, grad_fn=<DivBackward0>)
tensor(2.2717, grad_fn=<DivBackward0>)
tensor(2.2666, grad_fn=<DivBackward0>)
tensor(2.2641, grad_fn=<DivBackward0>)
tensor(2.2621, grad_fn=<DivBackward0>)
tensor(2.2613, grad_fn=<DivBackward0>)
tensor(2.2560, grad_fn=<DivBackward0>)


OK that looks reasonable

now lets look at our tril stuff again

from sequence we had

get_xys(names[:2])
([[0, 0, 0, 0, 0],
  [0, 0, 0, 0, 1],
  [0, 0, 0, 1, 13],
  [0, 0, 1, 13, 25],
  [0, 1, 13, 25, 1],
  [1, 13, 25, 1, 8],
  [0, 0, 0, 0, 0],
  [0, 0, 0, 0, 26],
  [0, 0, 0, 26, 1],
  [0, 0, 26, 1, 13],
  [0, 26, 1, 13, 15],
  [26, 1, 13, 15, 18],
  [1, 13, 15, 18, 1]],
 [1, 13, 25, 1, 8, 0, 26, 1, 13, 15, 18, 1, 0])

is that what our tril does exactly

In [37]:
t_b = get_batch(train, context_length+1, 2)

x = t_b[:, 0:context_length]
y = t_b[:, 1:context_length+1]

print("the batch and 'extended' labels")
print(x)
print(y)

x = x.repeat(1, 1, context_length)
print("x repeat", x.shape)
x = x.view(-1, context_length, context_length)
print("x reshape", x.shape)
print(x)

the batch and 'extended' labels
tensor([[14,  7,  5, 12],
        [25,  1,  0, 12]])
tensor([[ 7,  5, 12,  9],
        [ 1,  0, 12,  5]])
x repeat torch.Size([1, 2, 16])
x reshape torch.Size([2, 4, 4])
tensor([[[14,  7,  5, 12],
         [14,  7,  5, 12],
         [14,  7,  5, 12],
         [14,  7,  5, 12]],

        [[25,  1,  0, 12],
         [25,  1,  0, 12],
         [25,  1,  0, 12],
         [25,  1,  0, 12]]])


we've got copies of both our batch examples

In [39]:
tril = torch.tril(torch.ones((2, context_length, context_length), dtype=torch.int))
nb = x * tril
nb = nb.view(-1, context_length)
print(nb.shape)
nb, y

torch.Size([8, 4])


(tensor([[14,  0,  0,  0],
         [14,  7,  0,  0],
         [14,  7,  5,  0],
         [14,  7,  5, 12],
         [25,  0,  0,  0],
         [25,  1,  0,  0],
         [25,  1,  0,  0],
         [25,  1,  0, 12]]),
 tensor([[ 7,  5, 12,  9],
         [ 1,  0, 12,  5]]))

so those are our new examples and they all have labels

we'd need to shift rows right by their number of zeros to match what we had but does that matter?

the characters don't talk to each other - we just present sequences and a label

In [50]:
import torch.optim as optim

epochs = 180
training_runs = 800
batch_size = 220
context_length = 5
learning_rate = .1
embedding_dimensions = 7
hidden_size = 200

tril = torch.tril(torch.ones((batch_size, context_length, context_length), dtype=torch.int))
tril_batch_size = batch_size * context_length

model = create_model(context_length, hidden_size, embedding_dimensions)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

lmbda = lambda epoch: 0.95

m_scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 160], gamma=0.1)

for ep in range(epochs):
    epoch_loss = 0
    for tr in range(training_runs):
        t_b = get_batch(train, context_length+1, batch_size)

        x = t_b[:, 0:context_length]
        y = t_b[:, context_length:context_length+1]
        # print("initial", x.shape, y.shape)
        
        X = x.repeat(1, 1, context_length).view(-1, context_length, context_length)
        ex = (X * tril).view(-1, context_length)
        
        y = y.reshape(-1)
        
        logits = model.forward(x)
        # cross_entropy wants to know the number of classes - in our case vocab_size
        logits = logits.view(-1, vocab_size)
        
        Y = y.reshape(-1)
        # print("cross entropy inputs", logits.shape, Y.shape)
        loss = F.cross_entropy(logits, Y) # loss function

        epoch_loss += loss
        
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    m_scheduler.step()
    scheduler.step()
    print(epoch_loss/training_runs, m_scheduler.get_last_lr())
    

tensor(2.2957, grad_fn=<DivBackward0>) [0.095]
tensor(2.1584, grad_fn=<DivBackward0>) [0.09025]
tensor(2.1238, grad_fn=<DivBackward0>) [0.0857375]
tensor(2.0929, grad_fn=<DivBackward0>) [0.08145062499999998]
tensor(2.0815, grad_fn=<DivBackward0>) [0.07737809374999999]
tensor(2.0658, grad_fn=<DivBackward0>) [0.07350918906249998]
tensor(2.0575, grad_fn=<DivBackward0>) [0.06983372960937498]
tensor(2.0510, grad_fn=<DivBackward0>) [0.06634204312890622]
tensor(2.0432, grad_fn=<DivBackward0>) [0.0630249409724609]
tensor(2.0334, grad_fn=<DivBackward0>) [0.05987369392383786]
tensor(2.0337, grad_fn=<DivBackward0>) [0.05688000922764597]
tensor(2.0204, grad_fn=<DivBackward0>) [0.05403600876626367]
tensor(2.0210, grad_fn=<DivBackward0>) [0.05133420832795048]
tensor(2.0182, grad_fn=<DivBackward0>) [0.04876749791155295]
tensor(2.0184, grad_fn=<DivBackward0>) [0.046329123015975304]
tensor(2.0168, grad_fn=<DivBackward0>) [0.04401266686517654]
tensor(2.0016, grad_fn=<DivBackward0>) [0.04181203352191771]

In [51]:
print(epoch_loss/training_runs)

tensor(1.9498, grad_fn=<DivBackward0>)


In [47]:
print(epoch_loss/training_runs)

tensor(2.2267, grad_fn=<DivBackward0>)


In [120]:
def get_val_batch(data, batch_length=5, batch_size=5, i=0):
    # generate a small batch of data of inputs x and targets y
    if i == 0:
        ix = torch.randint(len(data) - batch_length, (batch_size,))
    else:
        ix = torch.arange(1, 5) + 1 + i

    b = torch.stack([data[i:i+batch_length] for i in ix])
    return b

print(get_val_batch(train, 5, 4, 5))
print(get_val_batch(train, 5, 4, 9))

get_batch(train, 5, 4)

tensor([[ 3,  8,  1, 18, 12],
        [ 8,  1, 18, 12, 15],
        [ 1, 18, 12, 15, 20],
        [18, 12, 15, 20, 20]])
tensor([[12, 15, 20, 20,  5],
        [15, 20, 20,  5,  0],
        [20, 20,  5,  0, 13],
        [20,  5,  0, 13,  9]])


tensor([[ 7,  9,  5, 13,  1],
        [ 1, 22,  9,  5, 18],
        [12, 15, 14,  1,  0],
        [ 5, 25,  0, 20,  1]])

In [134]:
import math
@torch.no_grad()
def split_loss(split):
    split_len = len(split)
    total_loss = 0
    batch_size = 50
    num_batches = math.floor(split_len / batch_size)
    print("num_batches", split_len, num_batches)

    for i in range(num_batches):

        t_b = get_val_batch(split, context_length+1, batch_size, i*batch_size)
        
        x = t_b[:, 0: context_length]
        y = t_b[:, context_length: context_length+1]
        
        Y = y.reshape(-1)
        
        logits = model(x)
        
        batch_loss = F.cross_entropy(logits, Y) # loss function
        
        if batch_loss < 100:
            total_loss = total_loss + batch_loss
    
    print("total loss", total_loss, total_loss / num_batches)

dev = torch.tensor(names.data[1])
print(split_loss(dev), len(dev))

test = torch.tensor(names.data[2])
split_loss(test)

num_batches 22875 457
total loss tensor(940.1379) tensor(2.0572)
None 22875
num_batches 16600 332
total loss tensor(702.2509) tensor(2.1152)


In [None]:
def generate_names(num_names):
    for i in range(num_names):
        out = []
        ix = [0, 0, 0, 0, 0]
        for nl in range(10):
            xenc = es[ix]

            outputL1 = torch.relu(xenc.view(-1, dims * context_length) @ W1 + b1)
            logits = outputL1 @ W2 + b2
            
            p = F.softmax(logits, dim=1)
    
            # ## torch.multinomial pulls out an index in p (num_samples=1) by sampling from the elements in p according to their probabilities 
            # ## (p is normalized in the softmax above)
            prediction = torch.multinomial(p, num_samples=1).item()
            
            ix = [ix[1], ix[2], ix[3], ix[4], prediction]

            if prediction == 0:
                break
            out.append(itos[prediction])
            
        print("".join(out))

generate_names(1)