## Part 1 - Building the Base Model

We will build a base bigram model that makes simple next-token predictions.

### Get the Dataset to train on

In [23]:
# First, let's get the tiny shakespeare dataset - uncomment the line below to download it again!
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [24]:
# Load the whole corpus into memory as a single string so we can inspect it
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [25]:
# Report how many characters the corpus contains
num_chars = len(text)
print("length of dataset in chars: ", num_chars)

length of dataset in chars:  1115394


In [26]:
# Preview the first 1000 characters of the corpus
preview = text[:1000]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [27]:
# Collect every distinct character in the corpus, in sorted order: this is the vocabulary
unique_chars = set(text)
chars = sorted(unique_chars)
vocabulary_size = len(chars)
print(''.join(chars))
print(vocabulary_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


#### Tokenizer

Create a Simple Tokenizer

In [28]:
# A minimal character-level tokenizer: every distinct character in `chars` gets its own integer id.
# Other tokenizers are worth a look too: Google uses SentencePiece, OpenAI uses tiktoken.
str_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_str = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Return the list of integer token ids for the characters of string ``s``.

    Raises KeyError if ``s`` contains a character that is not in ``chars``.
    """
    return [str_to_int[ch] for ch in s]


def decode(x):
    """Return the string built by mapping each integer id in ``x`` back to its character.

    Raises KeyError if ``x`` contains an id that is not in ``int_to_str``.
    """
    return ''.join(int_to_str[i] for i in x)


In [29]:
# Round-trip a sample string through the tokenizer: the ids first, then the decoded text
sample = "helloooo there"
sample_ids = encode(sample)
print(sample_ids)
print(decode(sample_ids))

[46, 43, 50, 50, 53, 53, 53, 53, 1, 58, 46, 43, 56, 43]
helloooo there


#### Encode the Dataset

Use the tokenizer to encode the dataset

In [30]:
# Run the tokenizer over the full corpus and hold the resulting ids in a 1-D torch.Tensor
import torch
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # the ids of the same first 1000 characters we printed earlier

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [31]:
# Hold out the tail of the dataset for validation.
# The first 90% of the tokens are used for training and the remaining 10% for validation
# (change the 0.9 below for a different ratio).
n = int(0.9 * len(data))
train_data, validation_data = data[:n], data[n:]

In [32]:
# Set the maximum context length (block size).
# The slice below takes block_size+1 tokens: block_size inputs plus one extra token,
# so that the last input position also has a "next token" to predict.
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [33]:
# Inputs to the transformer: the first block_size tokens
x = train_data[:block_size]

# Targets: the same window shifted one position to the right
y = train_data[1:block_size+1]

# Every prefix of x is one training example; its target is the token that follows the prefix
for end in range(1, block_size + 1):
    context = x[:end]     # all the inputs up to and including position end-1
    target = y[end - 1]   # the token that comes right after that context
    print(f"When the input is {context} the target is {target}")



When the input is tensor([18]) the target is 47
When the input is tensor([18, 47]) the target is 56
When the input is tensor([18, 47, 56]) the target is 57
When the input is tensor([18, 47, 56, 57]) the target is 58
When the input is tensor([18, 47, 56, 57, 58]) the target is 1
When the input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
When the input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
When the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [34]:
torch.manual_seed(1337)  # fix the RNG seed so the randomly sampled batches are reproducible
batch_size = 4 # How many independent sequences to train on in parallel
block_size = 8 # The maximum context length for predictions

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Reads ``train_data`` when ``split == 'train'`` and ``validation_data`` for any
    other value. Returns ``(x, y)``, each stacked from ``batch_size`` windows of
    ``block_size`` tokens; ``y`` is the same window shifted one position to the right.
    """
    source = validation_data if split != 'train' else train_data

    # one random starting offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))

    # stack the windows (inputs) and the same windows shifted by one token (targets)
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])

    return inputs, targets


# Draw one training batch and show the raw input and target tensors
xb, yb = get_batch('train')
print("inputs:")
print(xb.shape)
print(xb)
print('targets')
print(yb.shape)
print(yb)
print('----------')

# Spell out every (context, target) training pair contained in the batch
for row in range(batch_size):
    for step in range(block_size):
        context = xb[row, :step + 1]
        target = yb[row, step]
        print(f"When the input is {context} the target is {target}")



inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------
When the input is tensor([24]) the target is 43
When the input is tensor([24, 43]) the target is 58
When the input is tensor([24, 43, 58]) the target is 5
When the input is tensor([24, 43, 58,  5]) the target is 57
When the input is tensor([24, 43, 58,  5, 57]) the target is 1
When the input is tensor([24, 43, 58,  5, 57,  1]) the target is 46
When the input is tensor([24, 43, 58,  5, 57,  1, 46]) the target is 43
When the input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) the target is 39
When the input is tensor([44]) the target is 53
When the input is tensor([44, 53]) the target is 56
Whe

### Bigram Language Model
Simplest possible neural network

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the logits over the next token given that the current token is ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()

        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            A ``(logits, loss)`` pair. Without targets, ``logits`` is (B, T, C) and
            ``loss`` is None. With targets, ``logits`` is returned flattened to
            (B*T, C) and ``loss`` is a scalar tensor.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C) with C == vocab_size

        if targets is None:
            return logits, None

        # F.cross_entropy wants (N, C) logits and (N,) targets, so fold the batch
        # and time dimensions together before evaluating the loss.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)

        # evaluate the loss function (quality of predictions)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens, one at a time.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            A (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):

            # Get the predictions (no targets, so the loss slot is None and is ignored)
            logits = self(idx)[0]

            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)

            # apply softmax over the vocabulary dimension to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx
        
    
m = BigramLanguageModel(vocabulary_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sanity check: a uniform guess over the 65-character vocabulary would give a loss of
# -ln(1/65), which is about 4.17. The untrained model starts somewhat above that,
# presumably because its randomly initialised logits are not uniform.


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [36]:
# Start generation from a 1x1 tensor holding token 0 (remember 0 = new line)
idx = torch.zeros((1, 1), dtype=torch.long)

# Ask the model for 100 new tokens, then turn the first (only) sequence into a
# plain Python list so it can be fed to decode
generated = m.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


Note that the output is garbage because the model is untrained. Let's run the next few steps and try this again after training.

#### Create an Optimizer


In [37]:

# Create a PyTorch AdamW optimizer over all of the model's parameters.
# On every step it takes the gradients of the loss and updates the parameters with them.
learning_rate = 1e-3  # i.e. 0.001
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

This is a typical training loop

In [51]:
batch_size = 32
num_steps = 100
for step in range(num_steps):
    # draw a fresh batch of training data
    xb, yb = get_batch('train')

    # forward pass: compute the loss on this batch
    logits, loss = m(xb, yb)

    # backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)  # clear the gradients left over from the previous step
    loss.backward()                        # compute the gradients for all the parameters
    optimizer.step()                       # use the gradients to update the parameters

    # report the loss of every single step
    print(loss.item())

3.513597011566162
3.4920475482940674
3.6158809661865234
3.4778759479522705
3.4746806621551514
3.476297378540039
3.5685088634490967
3.582345962524414
3.624885082244873
3.5399792194366455
3.455087661743164
3.505511999130249
3.452834129333496
3.4823296070098877
3.44356107711792
3.4874770641326904
3.500288724899292
3.4307103157043457
3.5101258754730225
3.5510165691375732
3.4752910137176514
3.451035976409912
3.476080894470215
3.503736734390259
3.5053741931915283
3.442706346511841
3.4427266120910645
3.5845072269439697
3.4452524185180664
3.5391461849212646
3.4656264781951904
3.45918869972229
3.4729905128479004
3.5081980228424072
3.4671852588653564
3.435931444168091
3.5770742893218994
3.4848225116729736
3.5730502605438232
3.528606653213501
3.5636637210845947
3.4763176441192627
3.4208364486694336
3.5142157077789307
3.4705793857574463
3.4796066284179688
3.5235705375671387
3.498183250427246
3.4660837650299072
3.501591205596924
3.4976627826690674
3.4792208671569824
3.450889825820923
3.381554365158

Note that the more times we run the training loop, the lower the loss becomes.

Let's increase the number of training steps and print the loss only once, at the end. (The cell below is shown with 10,000 steps.)

Note: this is pretty janky as far as optimization goes, but we're learning so....

Note 2: Keep running maybe with 10000 to continue reducing the loss

In [62]:
batch_size = 32
num_steps = 10000
for step in range(num_steps):
    # draw a fresh batch of training data
    xb, yb = get_batch('train')

    # forward pass: compute the loss on this batch
    logits, loss = m(xb, yb)

    # backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)  # clear the gradients left over from the previous step
    loss.backward()                        # compute the gradients for all the parameters
    optimizer.step()                       # use the gradients to update the parameters

# only the loss of the final step is reported
print(loss.item())

2.453230619430542


Let's verify the post-training results (with more tokens for a longer response.)

What a big difference!!  Not quite Shakespeare yet, but maybe as good as a million monkeys on keyboards?!?  :)



In [64]:
idx = torch.zeros((1,1), dtype=torch.long) # creating a 1x1 tensor of zeros (remember 0 = new line)

# ask for 100 new tokens, generate, convert to list to feed into decode
print(decode(m.generate(idx, max_new_tokens=300)[0].tolist()))


A:
As ol I me!
Mos yast wered bad ale, a-d at entce bliselardosot dif, at candseeveathan h,
Bys t s!'The ny me thelie?
S:
US:

DY:

EO, l, os urtat anolit arietou bala ned IZAl w set ththigr me' werdele the ADI bst morcer hias? d, matoroue hmaso inge myonisstonen, be,
The we d, me beed weron
Fiurwat


Progress!!  But recall that this is a very simple model.  The tokens aren't talking to each other: no matter how much context has been generated so far, we only look at the single most recent token to predict what comes next.

We will take the output of this notebook, and move this to the bigram.py script.

### The Mathematical trick in self-attention

Pay close attention because this trick is at the heart of efficient self-attention

Below, the 8 tokens are currently not talking to each other.  We want to couple them, but in a very specific way.  
- We don't want a token talking to the tokens that come after (e.g. the 5th token talking to the 6th, 7th, or 8th) as those are future tokens that we are trying to predict.  
- We want the tokens to talk to the previous tokens for "conversational context".

The simplest way to "communicate" is to do an average of the previous tokens.  If I'm the 5th token, I take my channel plus the channels from the 4th, 3rd, etc tokens, and average them up.  This becomes a feature vector that summarizes me in the context of my history.  This is extremely lossy, but it's good enough for now.


In [66]:
# A small toy tensor to experiment with
torch.manual_seed(1337)
B, T, C = (4, 8, 2)  # B = batch size, T = time, C = channels
x = torch.randn((B, T, C))  # random input
x.shape

torch.Size([4, 8, 2])

torch.Size([4, 8, 2])

Version 1

bag of words = a term used when we're averaging things

In [67]:
# Goal: x_bag_of_words[b, t] = mean of x[b, i] over all i <= t
x_bag_of_words = torch.zeros((B, T, C))
for batch_index in range(B):  # explicit loops: not efficient, but we'll improve on this later
    for time_index in range(T):
        history = x[batch_index, :time_index + 1]  # (t+1, C) - this token and all the previous ones
        x_bag_of_words[batch_index, time_index] = history.mean(dim=0)  # (C,) - mean over time


In [68]:
# The raw values of the first batch element: one row per time step, one column per channel
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [70]:
# Row t is the average of rows 0..t of x[0], i.e. the current token and all the previous ones
x_bag_of_words[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

This is inefficient.  The trick is to use matrix multiplication.

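Each entry of C is the dot product of a row of A with a column of B (C = A @ B).

If A is all ones, every row of C is identical: each one is the column-wise sum of B. Keeping only the lower triangle of A turns row i of C into a running sum of the first i+1 rows of B, and dividing each row of A by its row sum turns that running sum into a running average.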

In [75]:
torch.manual_seed(42)
# Stages of this demo (swap the definition of `a` to see each effect):
#   a = torch.ones(3, 3)              -> every row of c is the column-wise sum of b
#   a = torch.tril(torch.ones(3, 3))  -> row i of c is the running sum of b[0..i]
# `a` must be defined inside this cell; otherwise the cell only works when a stale `a`
# happens to be left over in the kernel.
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)  # divide by the row sums so every row adds up to 1
b = torch.randint(0,10,(3,2)).float()
c = a @ b  # row i of c is now the average of rows 0..i of b
print('----')
print('a=')
print(a)
print('----')
print('b=')
print(b)
print('----')
print('c=')
print(c)
print('----')

----
a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
----
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
----
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
----


Version 2 - Matrix Multiply

In [77]:
# Version 2: the same causal average written as a single matrix multiply
weights = torch.tril(torch.ones(T, T))                       # lower-triangular ones: row t covers positions 0..t
weights = weights / torch.sum(weights, dim=1, keepdim=True)  # normalise every row so that it sums to 1
x_bag_of_words2 = weights @ x  # (T, T) is broadcast over the batch: (B, T, T) @ (B, T, C) -> (B, T, C)

# x_bag_of_words2 should match the loop-based x_bag_of_words
torch.allclose(x_bag_of_words, x_bag_of_words2)


True

In [None]:
# Bare reference to torch.tril, the function used to build the lower-triangular `weights` above
torch.tril

Version 3 - Softmax