# GPT from Scratch

From Karpathy https://www.youtube.com/watch?v=kCc8FmEb1nY
The aim is to build a character level nano-GPT model (not word level) since its simpler

In [3]:
## Download the tiny shakespeare dataset from this URL
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [6]:
input_path = '../../../data/tinyshakespeare/input.txt'

In [7]:
import os
os.path.exists(input_path)

True

In [9]:
with open(input_path, 'r', encoding='utf-8') as fp:
    text = fp.read()
    

In [10]:
print(f'The text has {len(text)} characters')

The text has 1115394 characters


In [14]:
## Let's get the unique characters and vocab. Treats upper case and lower case differently.
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [16]:
## Tokenization : Since its a character level language model, tokenization is at character level
## Convert vocab to set of indices
stoi = { ch:i for i,ch in enumerate(chars) } ## st to index
itos = {i:ch for i,ch in enumerate(chars)}

In [20]:
## Define encode (takes a string , gives a list of integers) and decode (takes list of integers, gives a string) lambda functions
encode = lambda s : [stoi[x] for x in s]
decode = lambda l : "".join([itos[x] for x in l])


In [21]:
print(encode("hi there"))

[46, 47, 1, 58, 46, 43, 56, 43]


In [22]:
print(decode([46, 47, 1, 58, 46, 43, 56, 43]))

hi there


## Encode the whole corpus, and convert to tensor

In [23]:
import torch
data = torch.tensor(encode(text), dtype=torch.long) ## torch.long is 64 bit integer

In [25]:
data.dtype

torch.int64

In [26]:
data[0:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

## Train validation split

In [27]:
n = int(0.9*len(data)) ## First 90% train, 10% validation. Since its a language model we have to keep continuous chunks

In [28]:
n

1003854

In [29]:
train_data = data[0:n]
val_data = data[n:]

In [31]:
block_size = 8 ## length of each chunk fed to the NN, not too large, is it ? (context window)

In [32]:
x = train_data[:block_size] ## Initial input
y = train_data[1:block_size + 1] ## target (offset by 1 from input)

for t in range(block_size):
    context = x[:t+1] ## Why t+1 and not t ? If you work out the details it will come out
    target = y[t]
    print(f"When input is {context}, target is {target}")

When input is tensor([18]), target is 47
When input is tensor([18, 47]), target is 56
When input is tensor([18, 47, 56]), target is 57
When input is tensor([18, 47, 56, 57]), target is 58
When input is tensor([18, 47, 56, 57, 58]), target is 1
When input is tensor([18, 47, 56, 57, 58,  1]), target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [34]:
## So we get 8 training examples from the first block of 9 characters  - 8 inputs + 1 target

## Now we want to do batching

In [35]:
torch.manual_seed(1337)
batch_size = 4

In [46]:
def get_batch(split):
    data =  train_data if split=='train' else val_data
    ix = torch.randint(0,len(data)-block_size, (batch_size,)) ## picks batch_size no random integers between low and high
    x = torch.stack([data[i:i+block_size] for i in ix], dim=0) ## row concatenation
    y = torch.stack([data[i+1:i+block_size+1] for i in ix], dim=0) ## row concatenation
    
    return x,y
    
    

In [49]:
xb, yb = get_batch('train')
print("inputs: ")
print(xb.shape)
print(xb)
print("targets: ")
print(yb.shape)
print(yb)

inputs: 
torch.Size([4, 8])
tensor([[52, 57,  8,  0, 21,  5, 50, 50],
        [47, 58, 46,  1, 63, 53, 59,  6],
        [ 1, 53, 40, 43, 63, 12,  0, 26],
        [50, 42, 57, 58,  6,  0, 32, 46]])
targets: 
torch.Size([4, 8])
tensor([[57,  8,  0, 21,  5, 50, 50,  1],
        [58, 46,  1, 63, 53, 59,  6,  1],
        [53, 40, 43, 63, 12,  0, 26, 39],
        [42, 57, 58,  6,  0, 32, 46, 43]])


In [None]:
## This 4*8 array contains 32 examples which are independent for transformer

In [51]:
for batch in range(batch_size):
    for time in range(block_size):
        context = xb[batch,:time+1]
        target = yb[batch,time]
        print(f"When input is {context}, target is {target}")
        
        

When input is tensor([52]), target is 57
When input is tensor([52, 57]), target is 8
When input is tensor([52, 57,  8]), target is 0
When input is tensor([52, 57,  8,  0]), target is 21
When input is tensor([52, 57,  8,  0, 21]), target is 5
When input is tensor([52, 57,  8,  0, 21,  5]), target is 50
When input is tensor([52, 57,  8,  0, 21,  5, 50]), target is 50
When input is tensor([52, 57,  8,  0, 21,  5, 50, 50]), target is 1
When input is tensor([47]), target is 58
When input is tensor([47, 58]), target is 46
When input is tensor([47, 58, 46]), target is 1
When input is tensor([47, 58, 46,  1]), target is 63
When input is tensor([47, 58, 46,  1, 63]), target is 53
When input is tensor([47, 58, 46,  1, 63, 53]), target is 59
When input is tensor([47, 58, 46,  1, 63, 53, 59]), target is 6
When input is tensor([47, 58, 46,  1, 63, 53, 59,  6]), target is 1
When input is tensor([1]), target is 53
When input is tensor([ 1, 53]), target is 40
When input is tensor([ 1, 53, 40]), target

## Bigram language model

In [53]:
import torch.nn as nn
from torch.nn import functional as F

In [81]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off logits of next token using a lookup
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) ## interestingly, the embedding dimension is set to the vocab size itself ?
        
    def forward(self, idx, targets=None):
        
        ## idx and targets are both B, T tensor of integers (batch size, window size)
        logits = self.token_embedding_table(idx) ## dimension B,T,C -> C is no of channels, which is vocab size here
        ## Logits are just the predictions ! We want to compute cross entropy loss between predictions and targets using F.cross_entropy
        ## But hold on : F.cross_entropy expects in form B*T, C
        ## Therefore reshape !!
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
        
        
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
        
        
    def generate(self, idx, max_new_tokens ):
        # idx is B,T array
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(idx)
            # focus only on last time step
            logits = logits[:,-1,:] ## Becomes B,C
            
            ## softmax to get probabilities
            probs = F.softmax(logits, dim=1) ## Dimension applied across C,B*T*C ->  B*C dimension
            
            ## sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1) ## B*1 dimension
            
            ## append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim=1 ) # B*T+1
        return idx
            
            
            
            
        
    
    

In [82]:
m = BigramLanguageModel(vocab_size)

In [83]:
logits, loss = m(xb, yb)

In [84]:
loss

tensor(4.8720, grad_fn=<NllLossBackward0>)

In [80]:
logits.shape

torch.Size([32, 65])

In [85]:
xb.shape

torch.Size([4, 8])

In [88]:
## let's now generate using this
start_input = torch.zeros([1,1], dtype = torch.long) ## like a batch of 1 sample, of 1 index. Why 0 ? because if you look above, the index 0 represents /n which is a good starting point


In [96]:
decode(list(m.generate(idx = start_input, max_new_tokens= 100 )[0].tolist()))

'\n&rK?yNeJDnb\nQvwVQKR:YQgca.sp,xagkkT,mw:gNoKnRlEza!Quwkgd&RZYHVoXYQ&uLbMJBrlyUdrYXb y -cPe.JfxOVKtdoA'

In [None]:
## Rubbish as expected because the model is not trained yet

## Training the bigram model

In [98]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [105]:
batch_size = 32

for steps in range(10000):
    
    xb, yb = get_batch("train") ## sample a batch of data
    
    ## evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    
    loss.backward()
    optimizer.step()
print(loss.item())
    
    
    

2.4409520626068115


In [106]:
## after training, lets just check how well the bigram model does
decode(list(m.generate(idx = start_input, max_new_tokens= 100 )[0].tolist()))

'\nIIZEcon yofy orirl I imondareyo yoovethoves four it burengonond\nWere ante bencusamy wink houge serst'

slightly better !! But since it looks only at one token to predict the next one , it can only do so much

## self attention - mathematical trick

In [107]:
torch.manual_seed(1337)
B,T,C = 4,8,2 ## batch size, sequence size, channel/embedding size
x = torch.randn(B,T,C)

In [109]:
x.shape

torch.Size([4, 8, 2])

We want to couple each of the 8 positions with all other current and past positions using self-attention, not future

How do we do this ?

Let's take a simple coupling mechanism - every token communicates with all current and past tokens by taking an average of the embeddings of all current and past embeddings



Implement by 2 for loops - this is not an efficient implementation

In [111]:
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] ## of dimension (t,C)
        xbow[b,t] = torch.mean(xprev, dim=0) ## C*

        

In [112]:
Implement using matrix multiplication - efficient multiplication

torch.Size([8, 2])

Given a matrix



In [120]:
torch.manual_seed(42)
b = torch.randint(0,10, (3,2)).float()

In [121]:
b

tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])

Lets say I want to create a matrix where every row is the average of all rows until that point

ie - row 1  is [2,7] as is, row 2 is average of [2,7] and [6,4] and row 3 is average of [2,7], [6,4] and [6,5]

So get a matrix with same dimension as b, such that each row is the average of all rows till that point
How do you implement this using matrix mulplication ?



Create a matrix a of dimensiton 3,3 (ie with dimension such that a @ b will have the dimension of b which is what we want )

In [122]:
a = torch.ones(3,3)

In [123]:
a

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [125]:
c = a@b

c has each row as the sum across rows of corresponding column in b , as expected

This means each row in b is attending to all other rows in b, not only rows in current and past positions 

In [None]:
Instead of matrix of ones for a, create a lower triangular matrix

In [135]:
a = torch.tril(torch.ones(3,3))

In [136]:
a

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [130]:
c = a @ b

In [131]:
print(b)
print(c)

tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [132]:
## Now you see that every row in c contains sums of all rows in b till that row, so it only attends to current and past positions !!

Now we just need to convert the sum to an average
to do this, we normalize a, the triangular matrix such that sum of each row is 1 (sum across columns)

In [143]:
a = a / a.sum(dim=1, keepdim=True)## note that keepdim=True is important

In [144]:
a

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [145]:
c = a@b

In [146]:
c

tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])

In [147]:
b

tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])

Voila !! c now has averages of rows in b till that index by matrix multiplication

Going back to our x, and doing the same thing

In [148]:
wei = torch.tril(torch.ones(T,T))

In [150]:
wei = wei/wei.sum(axis=1, keepdim=True)

In [151]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [152]:
## This is the weighting tensor to ensure that each position attends to only current and past positions, with the weight 

In [154]:
xbow2 = wei @ x  ## wei - T*T, x - B*T*C. Doesn't exactly match, pytorch will create an additional batch element in wei on its own
                 ## so finally will be B*T*C

In [156]:
torch.allclose(xbow, xbow2)

False

Here's a 3rd equivalent implementation using softmax

In [160]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei = wei.masked_fill(tril==0, float("-inf") )

In [161]:
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [162]:
## create a matrix of 0's, with -inf for future elements . In the real implementation of self-attention
## the futures will be -inf, but instead of 0's, you will will get floats of affinities from the query, key

In [164]:
wei = torch.softmax(wei, dim=1) ## This is equivalent to what we did above with torch.mean in implementation 2 !!
print(wei)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [165]:
xbow3 = wei @ x

## Let's now implement regular self-attention as a way for a token to get information from all other tokens  (as described in the paper attention is all you need)


In [170]:
B,T,C = 4,8,32 ## Now we're saying the channel or embedding size is 32
x = torch.randn(B,T,C)

## Define a single head, with head size = 16
head_size = 16

## Now, given input x, you get key and query and value from x (1 per token for every batch)
key = nn.Linear(C, head_size, bias=False) ## Input is B*T*C, output is B*T*head_size
query = nn.Linear(C, head_size, bias=False) ## Input is B*T*C, output is B*T*head_size
value = nn.Linear(C, head_size, bias=False) ## Input is B*T*C, output is B*T*head_size

k = key(x) ## size is  B*T*head_size
q = query(x) ## size is B*T*head_size
v = value(x) ## size is B*T*head_size

## Note that this is self-attention (keys, queries and values from from same source x)
## For cross-attention ( machine translation problem where its not self-attention), the query comes from decoder 
## and key and value come from encoder
## Given key and query for every token, we compute a T*T attention matrix for each batch (communication between tokens)
wei = q @ k.transpose(-2,-1)  ## q is B*T*C, you want k to be transposed such that dimension becomes B*C*T to get B*T*T, so transpose second and third dimension of k for every batch ie last 2 dimensions
wei = wei * (head_size)**(-0.5) ## Normalizing by sq root of head size
## Now wei is B*T*T
## For decoder architecture, we have to apply masking just like before

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril==0, float("-inf") ) ## For a encoder architecture, this step will not be there

## Now we apply softmax row wise (across columns). Now we have the normalized attention weights for every batch
wei = F.softmax(wei, dim=1) ## note that wei is of course not symmetric because of multiple reasons - query - key generated differently, masking future tokens, softmax across columns. So A->B influence is not the same as B->A influence 

## Now we multiple value with the attention weights
out = wei @ v ## B*T*T * B*T*head_size = B*T*head_size


In [204]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

1) Attention is a communication mechanism (directed) between tokens
2) In a decoder architecture, the first node just communicates with itself, second node with itself and first node, and so on
3) There is no inherent notion of space, so we need positional encodings added to character/word embeddings to get x
4) No interaction across different samples of a given batch, only within each sample
5) Encoder architectures won't have future masking - So just delete one line above (the masked_fill)
6) This is self-attention where queries, keys and values come from same source. For cross-attention, query comes from one source, keys and values from another (for example machine translation)
7) In the Attention is all you need paper, the dot product between query and key is divided by sq root of head size . Why ?

In [172]:
## Assume key and query are unit normal
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
w1 = k @ q.transpose(-2,-1) ## Without normalizing by sq root of head
w2 = k @ q.transpose(-2,-1) * (head_size**(-0.5)) ## With normalizing by sq root of head

In [173]:
k.var()

tensor(0.9279)

In [174]:
q.var()

tensor(1.0447)

In [175]:
w1.var()

tensor(15.9491)

In [176]:
w2.var()

tensor(0.9968)

Note above, that if w1 has variance much higher than k and q, whereas w2 is comparable

Why is this important ? If the variance is higher, taking a softmax skews/sharpens the output towards a single value, restricting information flow

In [203]:
print(torch.softmax(w1[0][0].reshape(1,8), dim=1)) ## sharpens element 2
print(torch.softmax(w2[0][0].reshape(1,8), dim=1)) ## less sharper

tensor([[0.0548, 0.5621, 0.2015, 0.0012, 0.1149, 0.0012, 0.0151, 0.0491]])
tensor([[0.1274, 0.2281, 0.1765, 0.0490, 0.1534, 0.0493, 0.0924, 0.1240]])
