In [24]:
with open('../input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
print(text[:150])
print(f"Length of text: {len(text)}")

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

A
Length of text: 1115394


In [25]:
chars = sorted(list(set(text)))
n_vocab = len(chars)


print("".join(chars))
print(f"Vocabulary size: {n_vocab}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 65


### Tokenizer

We have a custom tokenizer, its a character level tokenizer for the sake of simplicity

Some popular tokenizers includes tiktoken (byte pair encoding), sentencepiece (sub word unit encoding)

The above stated tokenizers have very large vocabulary (~50k tokens) but this results in much smaller sequences

in our case the char level token has only 65 tokens so the resulting sequence will be a one to one mapping of each character and length of sequence will scale linearly (which is bad)

> TODO use one of the popular tokenizers later while implementing to see the difference

In [26]:
char2idx = { ch: i for i, ch in enumerate(chars) }
idx2char = { i: ch for i, ch in enumerate(chars) }

encode = lambda string: [char2idx[char] for char in string]
decode = lambda tensor: "".join([idx2char[idx] for idx in tensor])

print(encode("hii there"))
print(decode(encode("hii there")))


[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


### Tokenize the dataset

In [27]:
import torch

data = torch.tensor(encode(text), dtype=torch.long)

print(data.shape, data.dtype)
print(data[:100])
print(decode(data[:100].tolist()))

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


### Train - Validate Split

In [28]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(len(train_data), len(val_data))

1003854 111540


### hyperparameters

`block_size`

> we train the transformer on the above dataset as chunks, feeding in the entire dataset at once would be too computationally expensive, so we ranomly sample "chunks" of sequences from the dataset and train on them. The length of this sampled sequence is determined by block_size

`n_vocab`

> length of vocabulary, vocabulary is basically the number of unique tokens that our transformer will see and generate

In [29]:
block_size = 8
seed = 1337
batch_size = 4
n_embedding = n_vocab

In one of these sequences, there are multiple examples packed in it. in a sequence of length 8 there are 8 unique training examples

as such the `+1` is to accomodate a `y` for the last training sample, since `y` starts at an offset of `+1`

### Note

> The reason why multiple training samples are taken from a single sequence ranging from `1 - block_size` is not just to make it computationally efficient but to get the transformer used to seeing sequences of length in that range. `block_size` is essentially the `context_length` in transformers. During generation as well, when we keep appending generated tokens and during the next forward pass the transformer only sees the last `block_size` tokens

In [30]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]

print(decode(x.tolist()))
print(x, y)

for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    
    print(f"when input in: {context} the target: {target}")

First Ci
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor([47, 56, 57, 58,  1, 15, 47, 58])
when input in: tensor([18]) the target: 47
when input in: tensor([18, 47]) the target: 56
when input in: tensor([18, 47, 56]) the target: 57
when input in: tensor([18, 47, 56, 57]) the target: 58
when input in: tensor([18, 47, 56, 57, 58]) the target: 1
when input in: tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input in: tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input in: tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [31]:
torch.manual_seed(seed)

def get_batch(split):
    
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    
    return x, y
    
xb, yb = get_batch('train')
print(f"inputs, {xb.shape}")
print(xb)
print(f"targets, {yb.shape}")
print(yb)

print('-' * 40)

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context}, target is {target}")


inputs, torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets, torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------------------------------------
when input is tensor([24]), target is 43
when input is tensor([24, 43]), target is 58
when input is tensor([24, 43, 58]), target is 5
when input is tensor([24, 43, 58,  5]), target is 57
when input is tensor([24, 43, 58,  5, 57]), target is 1
when input is tensor([24, 43, 58,  5, 57,  1]), target is 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]), target is 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]), target is 39
when input is tensor([44]), target is 53
when input is tensor([44, 53]), target is 56
when input is tensor([44, 53, 56]), target

### Language Model

For the sake of simplicity we use the simplest form of neural network, the bigram language model

### Note

> idx has a shape of `(B, T)`. batch, time dimensions respectively

> output has a shape of `(B, T, C)`. where `C` is the embedding dimension

How output becomes that shape is basically, each token idx has an associated `(65, )` dimensional vector in the embedding table, since there are 8 tokens in a sequence (block size), there will be a corresponding embedding vector for each of those tokens. this is done for all sequences in the batch (4 sequence in a batch)

In [46]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(seed)

class BigramLangugeModel(nn.Module):
    
    def __init__(self, n_vocab):
        super().__init__()
        
        self.token_embedding_table = nn.Embedding(n_vocab, n_embedding)
        
    def forward(self, idx, targets):
        print(idx.shape)
        logits: torch.Tensor = self.token_embedding_table(idx)
        
        # logits is of shape (B, T, C) however cross entropy loss expects (B, C, T)
        
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
m = BigramLangugeModel(n_vocab)
logits, loss = m(xb, yb)
print(logits.shape, loss.shape)

print(logits)
print(loss)

torch.Size([4, 8])
torch.Size([32, 65]) torch.Size([])
tensor([[-1.5101, -0.0948,  1.0927,  ..., -0.6126, -0.6597,  0.7624],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        ...,
        [-2.1910, -0.7574,  1.9656,  ..., -0.3580,  0.8585, -0.6161],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [-0.6787,  0.8662, -1.6433,  ...,  2.3671, -0.7775, -0.2586]],
       grad_fn=<ViewBackward0>)
tensor(4.8786, grad_fn=<NllLossBackward0>)
