### Load Data --> Shakespeare dataset

In [2]:
# Read the whole training corpus into memory as a single string.
txt_pth = './shakespeare_train.txt'
with open(txt_pth, 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Show the first 200 characters as a quick sanity check of the load.
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


### Chars, encoder and decoder

In [4]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(vocab_size, ''.join(chars))

62 
 !&',-.:;?ABCDEFGHIJKLMNOPQRSTUVWYZabcdefghijklmnopqrstuvwxyz


In [5]:
# Lookup tables between characters and their integer ids (a character's id is
# its position in the sorted `chars` list).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to a list of integer token ids.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[ch] for ch in s]


def decode(x):
    """Map an iterable of integer token ids back to the string they spell.

    Raises KeyError if `x` contains an id that is not in `itos`.
    """
    return ''.join(itos[i] for i in x)

### Train and Test splits

In [6]:
import torch
# Seed torch's global RNG so the random draws below are repeatable on a fresh-kernel run.
torch.manual_seed(1337)

<torch._C.Generator at 0x1208bf4b0>

In [7]:
# Encode the full corpus as a 1-D tensor of token ids, then split it by position:
# the first 90% is used for training and the remaining 10% for validation.
data = torch.tensor(encode(text))
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

### Data Loading

In [8]:
batch_size = 4  # number of sequences sampled per batch (read by get_batch)
context_size = 8  # number of tokens in each sampled sequence (read by get_batch)

In [9]:
def get_batch(split='train'):
    """Sample a random batch of (input, target) windows from one data split.

    ``split == 'train'`` reads ``train_data``; any other value reads ``val_data``.

    Returns:
        (x, y): two tensors of shape ``(batch_size, context_size)``. Each row
        of ``y`` is the window that starts one position after the matching row
        of ``x``, i.e. the next-token targets for that row.
    """
    source = val_data if split != 'train' else train_data
    # The exclusive upper bound leaves room for the one-token-shifted target window.
    starts = torch.randint(len(source) - context_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + context_size])
        targets.append(source[start + 1:start + 1 + context_size])
    return torch.stack(inputs), torch.stack(targets)

In [11]:
# Draw one training batch and display it; each row of y is the matching row of x shifted by one token.
x, y = get_batch()
x, y

(tensor([[48, 51, 43, 36, 49, 55, 47, 60],
         [50, 53,  1, 50, 56, 53,  1, 39],
         [36, 38, 55, 44, 50, 49,  7,  0],
         [28, 31, 30, 31, 29,  8,  0, 33]]),
 tensor([[51, 43, 36, 49, 55, 47, 60,  1],
         [53,  1, 50, 56, 53,  1, 39, 40],
         [38, 55, 44, 50, 49,  7,  0,  0],
         [31, 30, 31, 29,  8,  0, 33, 43]]))

### Bigram model

In [24]:
## Constructor, Forward, Generate
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram character model: next-token scores depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; looking
    up token ``i`` returns the unnormalised scores (logits) for the token that
    follows ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            target: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without ``target``, ``logits`` is (B, T, C) and
            ``loss`` is None. With ``target``, ``logits`` is flattened to
            (B*T, C) and ``loss`` is the mean cross-entropy over all B*T
            positions.
        """
        logits = self.embedding_table(idx)  # (B, T, C)

        if target is None:
            return logits, None

        B, T, C = logits.shape
        # cross_entropy wants (N, C) logits and (N,) targets, so fold batch and
        # time together. It applies log-softmax to the logits internally.
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous targets are accepted as well.
        loss = F.cross_entropy(logits, target.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never back-propagates; skip building an autograd graph
    def generate(self, idx, max_context_length=100):
        """Sample new tokens autoregressively and append them to ``idx``.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_context_length: number of tokens to sample and append.

        Returns:
            (B, T + max_context_length) integer tensor: the input context
            followed by the sampled tokens.
        """
        for _ in range(max_context_length):
            logits, _loss = self(idx)  # (B, T, C); loss is None without targets
            # Only the scores at the last position are used to pick the next token.
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # Sample from the distribution rather than taking the arg-max.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat([idx, idx_next], dim=1)  # (B, T + 1)
        return idx

In [25]:
# Smoke test of the untrained model: loss on one batch, then a sampled continuation.
lm = BigramLanguageModel(vocab_size)
x, y = get_batch()
logits, loss = lm(x, y)
print(loss)

# Start generation from a single token with id 0 (one sequence of length one).
inp_token = torch.zeros((1, 1), dtype=torch.long)
print(decode(lm.generate(inp_token)[0].tolist()))

tensor(4.4663, grad_fn=<NllLossBackward0>)

TyTydxJ;bsuTyU-yvdx&BTnle-qyZRNZ;J!'Q,L?gU';aoY,LYvhCrrqk cZ.NvT-qL?llYZrAoJjgzSO:AeL
twZ;TZAgJ'rjqS


### Training and Generation

In [92]:
# Train the bigram model with AdamW on randomly sampled mini-batches.
# The batch size is whatever the notebook-level `batch_size` is, because
# get_batch reads that global.
optimizer = torch.optim.AdamW(lm.parameters(), lr=1e-3)

# The loop variable is `step`, not `iter`, so the builtin `iter` is not
# rebound for the rest of the kernel session.
for step in range(10000):
    x, y = get_batch('train')
    logits, loss = lm(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if step % 500 == 0:
        # Loss of the current mini-batch only, so expect it to be noisy.
        print(loss.item())

2.444016695022583
2.29287052154541
2.4572906494140625
2.3702685832977295
2.270362138748169
2.3412535190582275
2.3811817169189453
2.3535118103027344
2.094773769378662
2.0756685733795166
2.4022254943847656
2.330070972442627
2.421736001968384
2.3064239025115967
2.0203874111175537
1.9376918077468872
2.5012989044189453
2.4261655807495117
2.1869139671325684
2.6687002182006836


## To do
- add Language model head
- add positional embedding
- implement attention
- multi-head attention

In [42]:
import torch
from torch.nn.functional import softmax

In [6]:
# Toy random tensor for the averaging experiments in the following cells.
# NOTE(review): B, T, C presumably stand for batch, time and channels — confirm.
# This rebinds the notebook-level name `x`, which earlier cells use for a token batch.
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [24]:
# Explicit-loop running mean: xbow[b, t] = mean of x[b, i] over all i <= t.
xbow = torch.zeros_like(x)
for b in range(B):
    seq = x[b]  # (T, C)
    for t in range(T):
        x_prev = seq[:t + 1]  # (t+1, C): position t and everything before it
        xbow[b, t] = x_prev.mean(dim=0)

print(x[0], xbow[0])

tensor([[ 1.2687,  0.9435],
        [ 1.0437, -0.1767],
        [-0.0919, -0.3312],
        [-0.3671,  0.7606],
        [ 1.5075,  0.7157],
        [ 0.1446, -1.2157],
        [ 0.0904,  1.6117],
        [ 2.5648, -0.6219]]) tensor([[1.2687, 0.9435],
        [1.1562, 0.3834],
        [0.7401, 0.1452],
        [0.4633, 0.2990],
        [0.6722, 0.3824],
        [0.5843, 0.1160],
        [0.5137, 0.3297],
        [0.7701, 0.2107]])


In [78]:
# The same running mean written as a single matrix product.
# After row-normalisation, row t of `wei` puts weight 1/(t+1) on positions 0..t
# and zero on every later position.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)  # (T, T) @ (B, T, C) -> (B, T, C); wei is broadcast over the batch

In [79]:
# The matrix-product result should match the explicit loop up to floating-point tolerance.
torch.allclose(xbow, xbow2)

True

In [96]:
# The same averaging weights obtained from a masked softmax.
# All scores start at zero; entries above the diagonal are set to -inf so that
# softmax gives them weight 0 and splits each row's weight evenly over the
# remaining (equal-score) positions.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = softmax(wei, dim=-1)

In [98]:
# Apply the softmax-derived weights and check that they reproduce xbow2.
xbow3 = wei @ x # (T,T) @ (B, T, C) -- (B, T, C)
torch.allclose(xbow3, xbow2)

True