In [43]:
from pathlib import Path
import torch


## Data Preparation

In [44]:

input_file_path = Path('../data/tinyshakespeare.txt')

with open(input_file_path, 'r') as f:
    text = f.read()
print(f"length of dataset in characters: {len(text):,}")


length of dataset in characters: 1,115,393


#### get all the unique characters that occur in this text


In [45]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65


#### create a mapping from characters to integers


In [46]:
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(f"{encode("khoa") = }")
print(f"{decode(encode("khoa")) = }")

encode("khoa") = [49, 46, 53, 39]
decode(encode("khoa")) = 'khoa'


In [47]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[:100])

torch.Size([1115393]) <built-in method type of Tensor object at 0x719248381db0>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


So, we have a very small code book of 65 characters, very simple `encode` and `decode` functions, but we get very long sequences as a result

#### create the train and validation splits

In [48]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [49]:
print(f"{train_data.shape = }")
print(f"{val_data.shape = }")

train_data.shape = torch.Size([1003853])
val_data.shape = torch.Size([111540])


In [50]:
block_size = 8
train_data[:block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [51]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context}, target = {target}")

when input is tensor([18]), target = 47
when input is tensor([18, 47]), target = 56
when input is tensor([18, 47, 56]), target = 57
when input is tensor([18, 47, 56, 57]), target = 58
when input is tensor([18, 47, 56, 57, 58]), target = 1
when input is tensor([18, 47, 56, 57, 58,  1]), target = 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target = 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target = 58


> **Training Notes**:  
>
> We will train on all 8 input examples with the context of 1 character upto 8 characters. This is not just because of computational reasons, but also to make our Transformer get used to see inputs of different sizes (upto block size characters).  
>
> We will stack many batches of multiple chunks of text in a single torch `Tensor`, so we can keep the GPU busy since it is very good at parallel processing of data. These chunks will be processed independently in a parallel manner

In [52]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # randomizing the training data 
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y


print('---- getting the first batch of data ----')
xb, yb = get_batch('train')

print('inputs:')
print(f"{xb.shape = }")
print(f"{xb = }")
print()

print('targets:')
print(f"{yb.shape = }")
print(f"{yb = }")
print()

print(f'---- there are {batch_size*block_size} training examples here ----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

---- getting the first batch of data ----
inputs:
xb.shape = torch.Size([4, 8])
xb = tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])

targets:
yb.shape = torch.Size([4, 8])
yb = tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])

---- there are 32 training examples here ----
when input is [53] the target: 59
when input is [53, 59] the target: 6
when input is [53, 59, 6] the target: 1
when input is [53, 59, 6, 1] the target: 58
when input is [53, 59, 6, 1, 58] the target: 56
when input is [53, 59, 6, 1, 58, 56] the target: 47
when input is [53, 59, 6, 1, 58, 56, 47] the target: 40
when input is [53, 59, 6, 1, 58, 56, 47, 40] the target: 59
when input is [49] the target: 43
when input is [49, 43] the target: 43
when input is [49, 43, 43] the target: 54


## Simplest baseline: bigram language model, loss, generation

In [53]:
import torch
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None) -> tuple[torch.Tensor, torch.Tensor | None]:
        logits = self.token_embedding_table(idx)  # (batch, time, channel)
        # F.cross_entropy expects logits.shape to be [batch_size, num_classes] 
        # and targets to be of shape [batch_size],
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  # e.g. [32, 65]
            targets = targets.view(B*T)  # e.g. [32]
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the loss and predictions
            logits, loss = self(idx)
            # focus only on the last time step in the logits
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    
m = BigramLanguageModel(vocab_size)
out = m(xb, yb)
out

(tensor([[-0.3724, -0.2800, -0.0915,  ...,  0.8662, -0.9581, -0.9197],
         [-1.2796,  0.3641, -0.8859,  ...,  0.5602,  0.6467,  0.6577],
         [ 0.4138, -1.4386,  1.2962,  ...,  1.6742, -0.2397,  0.3415],
         ...,
         [-1.2796,  0.3641, -0.8859,  ...,  0.5602,  0.6467,  0.6577],
         [ 0.7763, -0.8460,  0.8437,  ..., -1.0367, -1.2909,  1.1822],
         [ 0.3418, -0.9276,  1.2381,  ...,  1.5018, -0.5266,  0.2354]],
        grad_fn=<ViewBackward0>),
 tensor(4.9456, grad_fn=<NllLossBackward0>))

In [54]:
idx = torch.zeros((1, 1), dtype=torch.long)
decode(m.generate(idx, max_new_tokens=100)[0].tolist())

"\nl-QYjt'CL?jLDuQcLzy'RIo;'KdhpV\nvLixa,nswYZwLEPS'ptIZqOZJ$CA$zy-QTkeMk x.gQSFCLg!iW3fO!3DGXAqTsq3pdgq"

### Train the model

In [55]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [56]:
batch_size = 32
for steps in range(1000): # increase number of steps for good results...
    # sample a batch of data
    xb, yb = get_batch('train')
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # set gradients to 0
    loss.backward()  # getting new gradients for all the weights
    optimizer.step()  # use the gradients to update the weights

print(loss.item())


3.744039535522461


In [57]:
idx = torch.zeros((1, 1), dtype=torch.long)
decode(m.generate(idx, max_new_tokens=100)[0].tolist())

"\nN3vVoesMyas:Iocindad.e-NNSqYPso&bFho&$;BQ$dZTMf'mKlf;DRPm'W,esPzyXAzCA$;GunqCEy&Oy;ZxjKVhmrdhxCAbTSp"

## Attention

### The math trick in self-attention

In [58]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch size, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

#### Version 1: averaging past context with `for` loops, the weakest form of aggregation

In [59]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        # the previous tokens are from 0 up to this current token including it 
        xprev = x[b, :t+1]  # shape: (t, C)
        xbow[b, t] = torch.mean(xprev, 0)

*This is ok but very inefficient. Let's do the same thing but with matrix multiplication for better performance*

#### Version 2: using matrix multiply as weighted aggregation


In [60]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)  # do this to get the average of the columns in b 
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [63]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)  # wei is `a` above
print(f"{wei = }")
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)

wei = tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [70]:
torch.sum(xbow - xbow2)

tensor(1.3947e-07)