In [1]:
# Read the whole training corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [2]:
# Report how many characters the corpus contains.
num_chars = len(text)
print('length of dataset: ', num_chars)

length of dataset:  1115394


In [3]:
# The vocabulary is every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Show the alphabet on one line, followed by its size on the next.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


Mapping from characters to integers

In [4]:
# Character-level tokenizer: every character maps to exactly one integer id.
# Because the units are single characters, encoded sequences are long.

stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string ``s``."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled out by the integer ids in ``l``."""
    return ''.join(itos[i] for i in l)


print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [5]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Encode the entire corpus once and keep it as a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.int64)

print(data.size())

torch.Size([1115394])


In [6]:
# Hold out the last 10% of the token stream for validation; train on the rest.
n = int(0.9 * len(data))
train, val = data[:n], data[n:]

In [7]:
# Transformers are autoregressive: each predicted token is fed back in as
# input for the next prediction.

# Use a context length (block size) of 8 tokens.
block_size = 8

x = train[:block_size]
# y is x shifted by one position, so y[t] is the token that follows x[:t+1].
y = train[1:block_size + 1]

for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f'when input is {context}, output is {target}')


when input is tensor([18]), output is 47
when input is tensor([18, 47]), output is 56
when input is tensor([18, 47, 56]), output is 57
when input is tensor([18, 47, 56, 57]), output is 58
when input is tensor([18, 47, 56, 57, 58]), output is 1
when input is tensor([18, 47, 56, 57, 58,  1]), output is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), output is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), output is 58


In [8]:
# Seed the RNG so the randomly sampled batch is reproducible.
torch.manual_seed(1337)

# batch_size: sequences drawn per batch; block_size: tokens per sequence.
batch_size, block_size = 4, 8

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    ``split == 'train'`` reads from the global ``train`` tensor; any other
    value reads from the global ``val`` tensor. Start offsets are drawn
    uniformly with ``torch.randint``, and each target window is the input
    window shifted one position to the right.

    Returns:
        (x, y): two stacked tensors with ``batch_size`` rows of ``block_size``
        tokens each (both sizes are read from module globals).
    """
    source = val if split != 'train' else train
    # Largest valid start leaves room for block_size inputs plus one target.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x, y

# Draw one training batch and show the inputs next to their shifted targets.
xb, yb = get_batch('train')
print(f'Shape, xb: {xb.shape}, yb: {yb.shape}')
print(xb, yb, sep='\n')

Shape, xb: torch.Size([4, 8]), yb: torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


The Bigram Language Model below learns a representation for each of the tokens produced by the tokenizer above.

To do this, we create an embedding table and look up each token's row by its integer id. For example, the token with id 1 retrieves the row at index 1 of the table.

In [9]:
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; the row
    looked up for a token id is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, if targets are given, the loss.

        Args:
            idx: integer token ids; the lookup result is unpacked as (B, T, C),
                so ``idx`` is expected to be (B, T).
            targets: optional integer token ids with B*T elements.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits keep shape
            (B, T, C) and ``loss`` is ``None``. With ``targets`` the logits are
            returned flattened to (B*T, C) and ``loss`` is the cross-entropy
            between them and the flattened targets.
        """
        logits = self.token_embedding_table(idx)

        if targets is None:
            return logits, None

        # F.cross_entropy wants (N, C) logits and (N,) class indices, so fold
        # the batch and time dimensions together.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)

        # Cross-entropy is the negative log likelihood of the targets.
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Runs under ``torch.no_grad()``: the sampled ids are integers and can
        never be back-propagated through, so recording an autograd graph for
        each forward pass would only waste time and memory.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # Predictions for the running sequence.
            logits, _ = self(idx)

            # Only the last time step predicts the next token.
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)

            # Sample one token id per row from the predicted distribution.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append the sampled ids to the running sequence.
            idx = torch.cat((idx, idx_next), dim=1)

        return idx



# Untrained model: push one batch through it and inspect logits shape and loss.
m = BigramLanguageModel(vocab_size)
out, loss = m(xb, yb)

print(out.shape, loss, sep='\n')

# Sample 100 characters, starting from a single token with id 0.
idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [10]:
# AdamW over every parameter of the bigram model, with learning rate 1e-3.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [11]:
# get_batch reads batch_size from the module globals, so this changes the size
# of every batch sampled from here on.
batch_size = 32
num_steps = 25000

for steps in range(num_steps):
    # Drop gradients left over from the previous step.
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a freshly sampled training batch.
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

# Loss of the final training batch only (not an average over the run).
print(loss.item())

2.4316647052764893


In [12]:
# Sample 10000 characters from the model, starting from a single token with id 0.
idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx, max_new_tokens=10000)
print(decode(sampled[0].tolist()))


Cl, IZEEDUENGonthe t l me's angice;
Thewareratulyendutonde 's flie,
fr arnd ic-mybem t m'dsent ee, blouged fere fearau se fofouriey'd!
LANEThellld ogo withend tequ
Y ldill, totooousothive wdd nctonasngin mm se mpll,
WA st?

Whins t'eng t rere t inchyig y;
Y:
S:
RDYOn h cas re be I it? meswed,
Noun
Thou that alizathive ly

Premo Twe HAus hyon,
Hep s taldr g d wh h ju pr gurkeortwhin Whakil te d;
Berenu ad-priou y!
Buerstoo, engin, anifiterit p,
isthey thopis'd
Bu?
He thilere apre thior ane; ofar pls swhengity, dyoloyororhes
Cisthe is thirnon f bapsard bo whithatheam athik ousend d deeroleed! s win pehin, wouers, IORDORY:
Orkitonce

Ort ce
O:
ffit herom mem, f bourothalurerug wnrourer; a ho os thay a ke we be yod man te d lokngheheare te st: I,
She norofrean is:
Andore icard appilant ds the parexier ofl tor l.

A ousamame s Bed:
aurso we our su'st son,
Tha IVERDe haru tlarof, in a I ft wn,
He, bads be the.
Am om w iofuruekerth,
THad ING d:
Munt, ce wan AGLAupsh! fe owe therethind, ay m:

Mathematical trick for self attention

We want the tokens in a sequence to communicate with each other. However, a token must not communicate with future tokens, because those are what the model has to predict; it may only communicate with the past.

A simple way of doing this is to take the average of the current token and all the tokens preceding it. This may not be efficient, but it can be used as a starting point.

In [13]:
torch.manual_seed(1337)

# Toy activations: B sequences of T tokens with C channels each.
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

# Bag of words: xbow[b, t] is the mean of x[b, 0], ..., x[b, t] (inclusive).
xbow = torch.zeros_like(x)
for b, sequence in enumerate(x):
    for t in range(T):
        xbow[b, t] = sequence[:t + 1].mean(dim=0)




In [14]:
# The explicit double loop is slow; the same running mean is a single matrix
# multiply with a row-normalised lower-triangular matrix.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)

print(xbow2.size())

torch.Size([4, 8, 2])


In [15]:
# The same averaging once more, written the way self-attention does it: start
# from all-zero scores, mask out the future, then normalise with softmax.

tril = torch.ones(T, T).tril()
# Positions where tril is zero (the future) get a score of -inf ...
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
# ... so softmax gives them zero weight and splits the rest evenly.
wei = wei.softmax(dim=1)
xbow3 = torch.matmul(wei, x)

In [16]:
# Sequence length.
T = 8

# Position indices 0 .. T-1.
torch.arange(0, T)

tensor([0, 1, 2, 3, 4, 5, 6, 7])

In [23]:
torch.manual_seed(1337)

# Toy input: B sequences of T tokens with C channels each.
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# One self-attention head: three bias-free projections from C to head_size.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Raw affinities between every query and every key:
# (B, T, head_size) @ (B, head_size, T) --> (B, T, T)
wei = torch.matmul(q, k.transpose(-2, -1))

# Causal mask: a token may only look at itself and earlier positions.
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)

# Each output row is the attention-weighted sum of the value vectors.
out = torch.matmul(wei, v)  # (B, T, head_size)

out.shape

torch.Size([4, 8, 16])

In [24]:
# wei now holds the attention weights: row t says how strongly token t draws on
# each token at positions 0..t (entries for later positions are zero).

# In an encoder block, which is about understanding the input, every token may attend to every other token, so the tril mask is not applied.
# In a decoder block, which generates autoregressively, tokens must not attend to future tokens, so the tril mask is required.

wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

Self Attention - All keys, queries and values come from the same source

Cross Attention - The queries come from the decoder, while the keys and values come from the encoder

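Scaled Dot-Product Attention - When the entries of q and k have zero mean and unit variance, wei = q @ k^T has a variance of about head_size. To bring it back to unit variance, we divide wei by sqrt(head_size); this keeps the softmax from saturating towards a one-hot distribution.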

In [25]:
# NOTE(review): scratch arithmetic with unexplained magic numbers; presumably an
# embedding width of 384 divided by a head size of 64 — confirm and name the constants.
384/64

6.0