In [3]:
# Read the whole Tiny Shakespeare corpus into memory as one string.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(
    './datasets/tiny-shakespeare/input.txt',
    mode='r',
    encoding='utf-8',
) as corpus_file:
    text = corpus_file.read()

In [4]:
# Report the corpus size: `text` is the full dataset string, so this counts characters.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [5]:
# Peek at the first 1000 characters to see what the raw text looks like.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
# Build the character vocabulary: every distinct character that occurs in the
# corpus, sorted so that the character <-> integer mapping is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


Character-level language model: each individual character is encoded as an integer.

In [11]:
# Character-level tokenizer: two lookup tables built from the sorted vocabulary.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, return the corresponding string."""
    return ''.join(itos[i] for i in l)


# Round-trip check: encoding and then decoding must give back the original string.
print(encode("Hii there"))
print(decode(encode("Hii there")))

[20, 47, 47, 1, 58, 46, 43, 56, 43]
Hii there


In [12]:
import torch
# Encode the entire corpus and store it as a tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Shape and dtype of the encoded corpus, then the first 1000 token ids
# (the encoded form of the first 1000 characters of `text`).
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [13]:
# Train on the first 90% of the tokens and hold out the remaining 10% for validation.
n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

We do not train the transformer on the whole dataset at once, but on random chunks of the data. These chunks have a maximum length, called the block size (`block_size`) or context length.


In [14]:
# Length of one training chunk (the context length).
block_size = 8

train_data[:block_size +1]

# The first block_size + 1 = 9 tokens of the training set. This single chunk has multiple
# examples packed into it, because all of these tokens follow each other.

# When we feed this chunk to a transformer, we train it simultaneously at every one of these
# positions: a chunk of 9 tokens contains 8 individual (context, target) examples. This also
# helps the transformer learn to predict from contexts as short as 1 token, all the way up to
# block_size tokens.

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [16]:
# Spell out the 8 (context, target) examples contained in one chunk of 9 tokens.

x = train_data[:block_size]       # inputs to the transformer
y = train_data[1:block_size + 1]  # the same window offset by one: the next token at each position

for t in range(block_size):
    # everything up to and including position t is the context for target y[t]
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target is \"{target}\"")


when input is tensor([18]) the target is "47"
when input is tensor([18, 47]) the target is "56"
when input is tensor([18, 47, 56]) the target is "57"
when input is tensor([18, 47, 56, 57]) the target is "58"
when input is tensor([18, 47, 56, 57, 58]) the target is "1"
when input is tensor([18, 47, 56, 57, 58,  1]) the target is "15"
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is "47"
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is "58"


In [20]:
# Seed the RNG so the randomly sampled batch offsets below are reproducible.
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random mini-batch of input windows and their next-token targets.

    ``split`` selects the source tensor: ``'train'`` reads from ``train_data``,
    any other value reads from ``val_data``.

    Returns ``(x, y)``, each of shape ``(batch_size, block_size)``; ``y`` holds the
    same windows as ``x`` offset by one position, i.e. the next token at every step.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # batch_size random start offsets in [0, len(source) - block_size), so the target
    # window, which starts one position later, still fits inside the tensor
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in starts:
        inputs.append(source[i:i + block_size])
        targets.append(source[i + 1:i + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the input and target tensors side by side.
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# Every (row, position) pair in the batch is an independent training example.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

# Start with the simplest language model: bigram model

In [33]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Simplest possible language model: the next token depends only on the current one.

    The whole model is one ``(vocab_size, vocab_size)`` embedding table. Row ``i`` holds
    the logits (unnormalised scores) over the next token given that the current token is ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the training loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            A ``(logits, loss)`` tuple. Without targets (generation), ``logits`` has
            shape (B, T, C) with C = vocab_size and ``loss`` is ``None``. With targets,
            ``logits`` is returned flattened to (B*T, C) and ``loss`` is the scalar
            cross-entropy (negative log-likelihood) over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        # Identity check: `targets == None` on a tensor only works through the
        # comparison fallback; `is None` states the intent directly.
        if targets is None:  # in the case of generating
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)    # cross_entropy expects (N, C) ...
        targets = targets.view(B * T)     # ... and targets of shape (N,)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend every sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: (B, T) tensor of token ids in the current context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the input context followed by the samples.
        """
        # Sampling never needs gradients, so skip building an autograd graph per step.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, _loss = self(idx)  # (B, T, C); loss is None without targets
                # focus on the last time step only (because it's a bigram model)
                logits = logits[:, -1, :]  # (B, C)
                probs = F.softmax(logits, dim=-1)  # probabilities over the vocabulary
                # sample the next token from that distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append the sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    

# Instantiate the model and run one forward pass on the batch sampled earlier.
# For reference, a uniform guess over vocab_size characters would give a loss of ln(vocab_size).
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

# Kick off generation from a single token with id 0 (the \n character) in a batch of one.
idx = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [34]:
# AdamW over all model parameters with learning rate 1e-3.
# NOTE(review): the variable is spelled `optimzer` (sic); the training loop refers to it
# by this exact name, so any rename has to be done in both cells together.
optimzer = torch.optim.AdamW(m.parameters(),lr =1e-3)

In [42]:
# Train the bigram model; get_batch reads the global batch_size, so raise it here first.
batch_size = 32

for steps in range(10000):
    # clear stale gradients, then forward / backward / update on a fresh random batch
    optimzer.zero_grad(set_to_none=True)
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    loss.backward()
    optimzer.step()

# loss of the final mini-batch only (a noisy single-batch value, not an average)
print(loss.item())

2.576992988586426


In [46]:
# Sample 500 new tokens from the model after training, starting from the same `idx` context.
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist()))



PSh here anthe t prosesthanha m well,
F ticleasuolur d
T:
pesomo owst pugino d

ARif w k ithore th, Roue s ped tha okifok, de
Ar.

HESerer ama hatar&Plyour vet hr with is aras Tht; thap
YORENGind hathintheth miouro oulof I and s l SS:
Iffof; hat t-asoresere ashath fover;

AUMENGHave ie? nds to wisus, athal
Fiotha her owa
Fouidif toury aris dor yoress ane hit in,
O:
LETAUns.
Isat t ust far thas s sthasers t Bokerdace
My e
TENIris,
G oue, hon buime.
adive momo, warawoofe, M: atre deseeshen tar me


# The mathematical trick for self-attention

We want the tokens to communicate in a very specific way. The token in the 5th location should not communicate 
with the token in the 6th 7th and 8th location because those are future tokens. 

Information only flows from the previous context to the current timestep. One simple way to capture this information is to average, channel by channel, over all previous tokens together with the current one, i.e. to take the mean over the time dimension up to and including the current timestep.

In [48]:
# Toy example: a small random tensor to experiment with.
# Re-seeding makes the random values below reproducible.
torch.manual_seed(1337)
# B, T, C = batch, time, channels
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [49]:
# Version 1: explicit Python loops (very slow)

# we want xbow[b,t] = mean_{i<=t} x[b,i]

xbow = torch.zeros((B, T, C))  # "bag of words": running averages of x
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # (t+1, C): every token up to and including position t
        xbow[b, t] = xprev.mean(dim=0)  # average over the time dimension

In [53]:
# Show the raw inputs next to their running averages for the first batch element.
print(f"X at 0th batch:\n {x[0]}\n Xbow at 0th batch \n{xbow[0]}")

X at 0th batch:
 tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
 Xbow at 0th batch 
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


We observe that the first rows of the two are equal: the running average of a single token is that token itself. The second row of `xbow` is the average of the first two rows of `x`; for the first channel, (0.1808 + (-0.3596)) / 2 = -0.0894. The third row is the average of the first three tokens, and so on. The last row is the average of all eight rows (a vertical, per-column average).

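`torch.tril` keeps only the lower triangle of a matrix (the diagonal and everything below it); applied to a matrix of ones, it gives ones in the lower triangle and zeros above. Because each row of this matrix (`a` in the small example, `wei` in the real computation) is divided by its row sum before the matrix multiplication, multiplying by it computes an average over the current and all previous tokens rather than a sum. Thus we get the same result as before.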

In [62]:
# Version 2: the same running average, computed efficiently with a matrix multiplication

# Small standalone illustration of the trick (kept for reference, not executed):
# torch.manual_seed(42)
# a = torch.tril(torch.ones(3,3))
# a = a/ torch.sum(a,1,keepdim = True)
# b = torch.randint(0,10,(3,2)).float()
#
# c = a@b
# print(f"a: \n{a}\n b: \n{b}\n c: \n{c}")

wei = torch.tril(torch.ones(T,T)) # weights: ones on and below the diagonal, zeros above
wei = wei/ wei.sum(1, keepdim = True) # each row sums to 1, so row t averages tokens 0..t
# Print only once `wei` exists: printing it before the assignment raises NameError
# on a fresh kernel (Restart & Run All).
print(wei)
xbow2 = wei @ x # (T,T) @ (B,T,C) ---> PyTorch broadcasts wei over the batch dimension,
# so it becomes (B,T,T) @ (B,T,C) --> (B,T,C)

# we get the same result as before, but much faster
print(torch.allclose(xbow,xbow2))

print(xbow[0],"\n", xbow2[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
True
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]]) 
 tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
       

In [65]:
# Version 3: the same weights, obtained with a masked softmax
tril = torch.tril(torch.ones(T, T))
# Start from all-zero affinities and overwrite every position where tril == 0
# (the future tokens) with -inf.
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
# softmax maps the -inf entries to 0 and normalises each row to sum to 1
wei = F.softmax(wei, dim=-1)

print(wei)

xbow3 = wei @ x
torch.allclose(xbow, xbow3)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

The weights are an interaction strength, or affinity: they tell us how much of each past token we want to aggregate into the average. Setting the entries for future positions to -inf (which softmax turns into a weight of exactly 0) clamps them so that they cannot communicate, as we don't want to get information from future tokens.

The affinities are all set to 0 in this example (so the softmax produces a uniform average), but in a transformer they are going to be data dependent, and the tokens will start looking at each other: some tokens will find other tokens more or less interesting.

Long story short: we can compute weighted aggregations of past elements with a single matrix multiplication by a lower-triangular weight matrix.