In [4]:
# tiny shakespear dataset
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt --output input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  5 1089k    5 65536    0     0   165k      0  0:00:06 --:--:--  0:00:06  165k
100 1089k  100 1089k    0     0  2092k      0 --:--:-- --:--:-- --:--:-- 2090k


In [1]:
# reading
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
#length of text
print(len(text))
chars = sorted(list(set(text)))
print(chars)
vocab_size = len(chars)

1115394
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda i: ''.join([itos[d] for d in i])

print(encode("huzain"))

[46, 59, 64, 39, 47, 52]


In [7]:
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [13]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
block_size = 8
train_data = data[:n]
val_data = data[n:]

In [14]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [16]:
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [17]:
xb, yb = get_batch('train')

In [31]:
xb, yb

(tensor([[53, 56,  1, 46, 39, 57,  1, 52],
         [50,  1, 49, 43, 43, 54,  1, 39],
         [53, 56, 49,  8,  0, 37, 53, 52],
         [57,  1, 58, 46, 43,  1, 41, 46]]),
 tensor([[56,  1, 46, 39, 57,  1, 52, 53],
         [ 1, 49, 43, 43, 54,  1, 39, 58],
         [56, 49,  8,  0, 37, 53, 52, 42],
         [ 1, 58, 46, 43,  1, 41, 46, 39]]))

In [41]:
import torch 
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(786)

class BigramLM(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        # Batch, Time (context length), Channels(vocabsize)
        logits = self.token_embedding_table(idx) #BTC 4x8x65

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits,loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1) #(B,1)
            idx = torch.cat((idx, idx_next), dim=1) #(B,T+1)
        return idx

m = BigramLM(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

start = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(start, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.6336, grad_fn=<NllLossBackward0>)

ycE&Yr
 .!mMFVwHu saZIMay-kBB$?EPnxpWJO'lhtm bVQOq-omVLYrPRAYXJyznSvI!.RVyMiLzG;GRdfwbXQyUk-w.J-KDFM


In [42]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [48]:
batch_size = 32
for stpes in range(10000):
    xb, yb = get_batch('train')

    #get loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.4078171253204346


In [50]:
print(decode(m.generate(start, max_new_tokens=400)[0].tolist()))


Cldro br fraven, ftlosstr ot ftornothardilot allm mefrs, u, char han sth nganext thinickicocenistetheatashert thm'd ba ghe istare he;
Bur m beeto, ou ccho Thenot eeaprn s Sid o!

I thel,
Tin d ur IO, HEayey prearow Vzcoure brede f owe the tharke,
Roug an my bathicanthtou
He hilas'tome d ppither taryal:
Hon homan l.
CERUFilly's,
Mayag r hisoraroudlft, te wimain whe sou ba, se tlighusird T:
S! d h t


## notes

```
Batch    
|     1 0 1 0 1 
|     0 0 0 1 0   
V  Time - - - >
```

value (1,0) is probability it is the current word out of the 65 words



### math trick for self attention

In [8]:
B,T,C = 4,8,2 #batch x time x channel
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

bag of words is averaging

In [9]:
#x[b,t] = mean(i<=t)

xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]
        xbow[b,t] = torch.mean(xprev, 0)

In [12]:
x[0]

tensor([[-0.8013,  0.1877],
        [-2.7987,  1.8751],
        [-1.9090, -0.3588],
        [-0.8589, -0.1021],
        [-2.0260,  0.0910],
        [ 0.4628,  0.9515],
        [ 0.2417,  2.2452],
        [-0.2536,  2.2391]])

In [14]:
xbow.shape

torch.Size([4, 8, 2])

In [16]:
#tril is amazing
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1,keepdim=True)
xbow2 = wei @ x # (B,T,T) @ (B,T,C)


In [17]:
torch.allclose(xbow,xbow2)

True

In [4]:
tril = torch.tril(torch.ones(T,T))

In [24]:
import torch.nn.functional as F

wei= torch.zeros((T,T))
wei= wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

## self attention!

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

B,T,C = 4,8,32 #batch x time x channel
x = torch.randn(B,T,C)

# head
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k  = key(x)
q  = query(x)
wei = q @ k.transpose(-2,-1) * head_size * -0.5 # B T T

tril = torch.tril(torch.ones(T,T))
# wei= torch.zeros((T,T)) # parameterize?
wei= wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
out.shape

torch.Size([4, 8, 16])

In [8]:
wei[0]

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [3.1804e-05, 9.9997e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [2.5520e-02, 9.7448e-01, 9.2450e-10, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [3.4738e-02, 1.8744e-01, 5.7998e-01, 1.9784e-01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [9.9990e-01, 1.0148e-04, 1.3097e-16, 1.4714e-09, 2.5420e-06, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [1.2573e-13, 1.1571e-10, 1.0022e-09, 1.0000e+00, 3.3092e-11, 3.3209e-12,
         0.0000e+00, 0.0000e+00],
        [4.7221e-09, 7.8858e-01, 1.7604e-02, 1.5175e-04, 3.1473e-08, 1.8089e-01,
         1.2771e-02, 0.0000e+00],
        [9.6753e-01, 3.2299e-02, 3.6889e-12, 9.6806e-17, 1.2859e-06, 6.3773e-07,
         2.4619e-09, 1.6654e-04]], grad_fn=<SelectBackward0>)

## note
weights should be diffused hence we scale the attention head

softmax would make things peaky, scaling controls the variance

If it doesnt communicate with past, it is a decoder block