In [2]:
import torch
import random
import torch.nn.functional as F
import torch.nn as nn

In [2]:
# Read the whole training corpus from input.txt into memory as one string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [9]:
# Sanity check: corpus length in characters, then the first 400 characters.
print(f'len of input text: {len(text)}')
print(text[:400])

len of input text: 1115393
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 


In [3]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# Show the vocabulary on one line and its size on the next.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [4]:
# Character-level tokenizer: a character's token id is its index in `chars`.
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = {idx: ch for ch, idx in stoi.items()}


def encode(s):
    """Encoder: take a string and return the list of integer token ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer token ids and return the string."""
    return ''.join(itos[i] for i in l)


# Quick check: encode a word, then round-trip another one.
print(encode("hello"))
print(decode(encode('hi')))

[46, 43, 50, 50, 53]
hi


In [5]:
# Encode the entire text dataset into a 1-D tensor of int64 token ids
# (one id per character), then show its dtype, shape and the first 100 ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.dtype, data.shape)
print(data[:100])

torch.int64 torch.Size([1115393])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [6]:
# Train/validation split: the first 90% of the tokens train the model,
# the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
print(len(train_data), len(val_data))

1003853 111540


In [None]:
# block_size = 8  # context length (chunk size): the transformer receives at most block_size characters
#                 # to predict the next character; inputs longer than block_size must be split into chunks
# x = train_data[:block_size]
# y = train_data[1:block_size+1]
# for t in range(block_size):
#     context = x[:t+1]
#     target = y[t]
#     print(f'when input is {context}, output is {target}')


when input is tensor([18]), output is 47
when input is tensor([18, 47]), output is 56
when input is tensor([18, 47, 56]), output is 57
when input is tensor([18, 47, 56, 57]), output is 58
when input is tensor([18, 47, 56, 57, 58]), output is 1
when input is tensor([18, 47, 56, 57, 58,  1]), output is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), output is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), output is 58


In [7]:
torch.manual_seed(1337)
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length used for a prediction


def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    split: 'train' draws from train_data; any other value draws from val_data.

    Returns (x, y), both of shape (batch_size, block_size). y is x shifted one
    position to the right, so y[b, t] is the token that follows x[b, :t+1].
    """
    source = val_data if split != 'train' else train_data
    # Random window starts; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

# Draw one training batch and show the input/target tensors and their shapes.
xb, yb = get_batch('train')
print('inputs:', xb)
print(xb.shape)
print('outputs', yb)
print(yb.shape)
print('----')

# Every (batch row, time step) pair is its own training example:
# the context is the row up to and including position t, the target is yb[b, t].
for b in range(batch_size):   # batch dimension
    for t in range(block_size) : # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'input is {context}, output is {target}')


inputs: tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
torch.Size([4, 8])
outputs tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])
torch.Size([4, 8])
----
input is tensor([53]), output is 59
input is tensor([53, 59]), output is 6
input is tensor([53, 59,  6]), output is 1
input is tensor([53, 59,  6,  1]), output is 58
input is tensor([53, 59,  6,  1, 58]), output is 56
input is tensor([53, 59,  6,  1, 58, 56]), output is 47
input is tensor([53, 59,  6,  1, 58, 56, 47]), output is 40
input is tensor([53, 59,  6,  1, 58, 56, 47, 40]), output is 59
input is tensor([49]), output is 43
input is tensor([49, 43]), output is 43
input is tensor([49, 43, 43]), output is 54
input is tensor([49, 43, 43, 54]), output is 1
input is tensor([49, 43, 43, 54,  1]),

In [10]:
print(xb)   # input to the transformer: a (batch_size, block_size) tensor of token ids

tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])


**bigram language model**

In [33]:
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; row i is
    read out directly as the logits for the token that follows token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        idx and targets are both (B, T) tensors of integer token ids.

        Returns (logits, loss):
          - targets is None: logits is (B, T, C) and loss is None.
          - otherwise: logits is flattened to (B*T, C) and loss is the scalar
            cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C): batch, time, channel (C == vocab size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy takes (N, C) logits with (N,) class indices, so fold
            # the batch and time dimensions together.
            # https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_tokens):
        """Extend each sequence in idx by max_tokens sampled tokens.

        idx is a (B, T) tensor of token ids holding the current context.
        Returns a (B, T + max_tokens) tensor: the input followed by the samples.
        """
        # Sampling never needs gradients; disabling autograd avoids building a
        # graph for every forward pass of the loop.
        with torch.no_grad():
            for _ in range(max_tokens):
                # get the predictions
                logits, _loss = self(idx)
                # focus only on the last time step
                logits = logits[:, -1, :]  # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append sampled idx to running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Instantiate the untrained model and evaluate it on the current batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)

# Sample 100 tokens from the untrained model, starting from a single token with id 0.
idx = torch.zeros((1,1), dtype=torch.long)
out = decode(m.generate(idx, max_tokens=100)[0].tolist())  # m.generate output shape here is 1,101
print(out)
# expected loss for a uniform prediction over 65 characters is -ln(1/65) = ln(65) = 4.17

torch.Size([256, 65])
tensor(4.6288, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [36]:
# create a pytorch optimizer: AdamW over all parameters of the bigram model, learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(),lr = 1e-3)

In [37]:
# NOTE: this rebinds the global batch_size that get_batch reads, so every batch
# sampled from here on (in this or any later cell) holds 32 sequences.
batch_size = 32
# Run 10000 optimization steps, each on a freshly sampled training batch.
for i in range(10000):

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss  = m(xb,yb)
    # clear the gradients of the previous step, backpropagate, then update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the last training batch only (not an average, and not a validation loss)
print(loss.item())

2.440803050994873


In [38]:
# Sample 400 new tokens from the trained model, starting from a single token with id 0, and decode them.
print(decode(m.generate(idx= torch.zeros((1,1), dtype=torch.long),max_tokens=400)[0].tolist()))


NICOndu t w ity merwo al LOLo bebte loolld worinero ya l aknge ond thal ttry b's mo ge ck.

gh, cheetilllin trewnutud t arsu y;
Desthap's Zimponcrdistherdrtes saure ' erpoperrposthel?
Handis of hef thep: ct
Ywit harfoul'st, ar izlor t ct.
Fo, sther:
I d tre th,-ben.

HBltothedlucartee t the t,
STEMANGENTIV:
WDUKI HANENEThe d ndean-bros g qpl mout fok yolaime do myo asto,
Mok h$ay t nch sle fionhou


*mathematical trick in self attention*

In [2]:
# toy example: a random (B, T, C) tensor used to illustrate the averaging trick
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)
print(x.shape)

torch.Size([4, 8, 2])


In [12]:
# first batch element: T rows (time steps) of C channels each
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [None]:
# Hand-computed mean of channel 0 over the first two time steps of x[0].
(0.1808-0.3596)/2

# We want the T (here 8) tokens within each batch element to communicate, i.e. we want to couple them.
# E.g. if I am the 5th token and want to communicate with the past, the simplest way is to average over
# the preceding elements: take the channels (C) of my vector and of all preceding vectors and average
# them, giving a new feature vector that summarizes me in the context of my history. This is an
# extremely weak form of communication and it loses a lot of information.
# We want xbow[b,t] = mean_{i<=t} x[b,i]

-0.0894

version 1: for loop

In [None]:
# xbow[b, t] holds the average of x[b, 0..t] (a running "bag of words" over the past).
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t+1, C): positions 0..t inclusive
        xbow[b,t] = torch.mean(xprev,0)  # average over the time axis -> (C,)
print(xbow[0])

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


version 2: using mat mul

In [19]:
torch.manual_seed(1337)
# Lower-triangular ones with every row divided by its own sum:
# row i of `a` then averages rows 0..i of whatever it multiplies.
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)
# Print each label followed by its matrix, one item per line.
print('a = ', a, 'b = ', b, 'c = ', c, sep='\n')

a = 
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b = 
tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])
c = 
tensor([[5.0000, 7.0000],
        [3.5000, 3.5000],
        [4.0000, 3.3333]])


In [33]:
# let's check it on the toy example
wei = torch.tril(torch.ones((T, T)))
wei = wei/wei.sum(1, keepdim=True)  # normalize each row to sum to 1
# wei is (T, T) and x is (B, T, C): matmul broadcasts wei over the batch dimension,
# so this behaves like (B, T, T) @ (B, T, C) ----> (B, T, C), one multiplication per batch element
xbow2 = wei @ x
torch.allclose(xbow, xbow2, atol=1e-6)

True

version 3: using softmax

In [35]:
# lower-triangular matrix of ones: row t has ones at positions 0..t
tril = torch.tril(torch.ones((T, T)))
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [36]:
# start from all-zero affinities between every pair of positions
wei = torch.zeros((T,T))
wei

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [37]:
# set every entry above the diagonal (where tril is 0) to -inf
wei = wei.masked_fill(tril==0, float('-inf'))
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
# Same running average as before, now built from a masked softmax.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T)) # initialize the affinities between all the tokens
wei = wei.masked_fill(tril == 0, float('-inf'))  # to only use the past context and avoid the future char in the context
wei = F.softmax(wei, dim=-1)  # normalizes each row; the -inf entries become 0 and the zeros become equal weights
xbow3 = wei @ x
print(torch.allclose(xbow, xbow3,atol=1e-6))
print(xbow3[0])


True
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


*Self-Attention*

In [43]:
# the uniform averaging weights left over from the masked-softmax construction
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

We don't want the values in wei to be uniform; they should be data dependent, i.e. different tokens will find different other tokens more or less interesting. This is the problem self-attention solves.
How does it solve it?
--> Every token at each position emits three vectors: a query (what am I looking for), a key (what do I contain), and a value (what I will communicate).
affinities = dot product between the queries and the keys

In [10]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# one head of self-attention
head_size = 16

# Bias-free projections, created in this order so the seeded initialisation is the same.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x)  # each (B, T, head_size)

# Data-dependent affinities: every query dotted with every key.
wei = torch.matmul(q, k.transpose(-2, -1))  # (B, T, head_size) @ (B, head_size, T) ---> (B, T, T)

# Causal mask: a position may only attend to itself and to earlier positions.
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)  # each row becomes a probability distribution

# Aggregate the values v (not the raw x) with the attention weights.
out = torch.matmul(wei, v)  # (B, T, T) @ (B, T, head_size) ---> (B, T, head_size)
print(wei[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)


Notes:

- Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Examples across the batch dimension are of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with tril, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additionally multiplies wei by 1/sqrt(head_size) (i.e. divides it by sqrt(head_size)). This makes it so that when the inputs Q, K are unit variance, wei will be unit variance too, and softmax will stay diffuse and not saturate too much. Illustration below

In [None]:
# Unscaled dot products of unit-variance keys and queries.
k = torch.randn(B, T, head_size)  # unit gaussian
q = torch.randn(B, T, head_size)  # unit gaussian
wei = q @ k.transpose(-2, -1)     # variance of wei is on the order of head_size

print(k.var())
print(q.var())
print(wei.var())

tensor(1.0449)
tensor(1.0700)
tensor(17.4690)


In [None]:
# Same experiment with the dot products multiplied by 1/sqrt(head_size).
k = torch.randn(B, T, head_size)  # unit gaussian
q = torch.randn(B, T, head_size)  # unit gaussian
wei = q @ k.transpose(-2, -1) * head_size**-0.5     # scaled so this has unit variance too 

print(k.var())
print(q.var())
print(wei.var())

tensor(1.0966)
tensor(0.9416)
tensor(1.0065)


why is this important?

- wei feeds into softmax, so it's important that, especially at initialization, wei is fairly diffuse
- If wei takes on very large positive or negative values, softmax will converge towards one-hot vectors
- illustrated below

In [7]:
# small logits: the softmax output stays diffuse
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [None]:
# the same logits as in the previous cell, multiplied by 8
torch.softmax(torch.tensor([ 0.8000, -1.6000,  2.4000, -1.6000,  4.0000]), dim=-1)
# softmax sharpens towards whichever logit is highest (here 4.0), so we don't want these values to be too
# extreme, especially at init; otherwise the softmax will be way too peaky and each position would
# basically be aggregating info from one single node

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

layernorm


In [None]:
class LayerNorm1d: # (used to be BatchNorm1d) (layernorm normalizes each row over its features, not each feature over the batch)
  """Minimal layer normalization over the last (feature) dimension.

  dim: size of the feature dimension; gamma and beta have shape (dim,).
  eps: added to the variance before the square root to avoid division by zero.
  momentum: accepted so the signature stays the same, but never used.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)   # scale, starts as the identity
    self.beta = torch.zeros(dim)   # shift, starts at zero

  def __call__(self, x):
    """Normalize x over its last dimension, then scale by gamma and shift by beta.

    For (N, dim) input this is the same as reducing over axis 1; reducing over
    the last axis also normalizes (B, T, dim) input over features, not over time.
    The result is stored on self.out and returned.
    """
    xmean = x.mean(-1, keepdim=True) # per-row mean over the features
    xvar = x.var(-1, keepdim=True) # per-row variance over the features (torch's default estimator)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    """Return the parameters as a list: [gamma, beta]."""
    return [self.gamma, self.beta]

# Apply the layer norm to a random batch and confirm the shape is unchanged.
torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)  # x now holds the normalized activations
x.shape

torch.Size([32, 100])

In [4]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs (layernorm does not normalize along this axis)

(tensor(0.1469), tensor(0.8803))

In [5]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features (the axis layernorm normalizes)

(tensor(-9.5367e-09), tensor(1.0000))