In [1]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F


In [2]:
# hardware acceleration: prefer CUDA, then Apple's MPS backend, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# get input: download the tiny-shakespeare corpus once and cache it as input.txt
if not os.path.exists('input.txt'):
    import requests
    response = requests.get(
        'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt',
        timeout=30,  # do not hang the notebook forever on a dead connection
    )
    # fail loudly instead of caching an HTTP error page as the training corpus
    response.raise_for_status()
    # write with the same encoding the file is later read with (utf-8),
    # rather than whatever the platform default happens to be
    with open('input.txt', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print('finished downloading input data')
else:
    print('already have input data')

already have input data


In [4]:
# load the whole corpus into memory as a single string
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print('n_chars:', len(text))

n_chars: 1115394


In [5]:
# peek at the first 100 characters to sanity-check what was loaded
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# the vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print('vocab_size:', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab_size: 65


### Tokenizer
We need to convert the raw text data into a sequence of integers according to some encoding scheme. Since this is a character-level model, we simply create a bijective mapping from characters to integers, but there are much more sophisticated tokenization techniques, such as the BPE sub-word tokenizer used by [OpenAI](https://github.com/openai/tiktoken).

Sub-word tokenizers are desirable because they help the model capture grammar by breaking words into commonly occurring chunks. For example, the tokenizer may encode suffixes such as '-ing' and '-ed' as their own tokens, so the model can learn the tense of a sentence.

In [7]:
# create mapping from characters to integers (and back)
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Take a string and return the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Take a list of integer ids and return the string they spell."""
    return ''.join(itos[i] for i in l)


In [8]:
# encode text: one integer id per character of the corpus
data = torch.tensor(encode(text), dtype=torch.long)

# create training and validation splits: first 90% for training, the rest for validation
n = int(0.9 * data.shape[0])
train_data, val_data = data[:n], data[n:]

### Sequential learning
Token generation is an **auto-regressive** task. That is, we want to predict the next value in the sequence based on the previous values. In the classical auto-regressive model, a predicted value is a linear combination of the previous values plus some noise; the models below replace that linear map with a learned function. Clearly, words, sentences and natural language in general are auto-regressive in nature.

One of the benefits of training auto-regressive models is that they are *self-supervised*. That is, we do not need to explicitly label our training data, because the output we want to predict is just the next value in the sequence.

In [9]:
batch_size = 64
block_size = 8 # also known as context_length

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors of shape (batch_size, block_size), both
        moved to ``device``. ``y`` is ``x`` shifted one position to the right
        in the underlying data, i.e. ``y[b, t]`` is the token that follows
        ``x[b, t]``.
    """
    source = train_data if split == 'train' else val_data
    # random window starts; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = [source[s:s + block_size] for s in starts]
    shifted = [source[s + 1:s + block_size + 1] for s in starts]
    x = torch.stack(inputs) # stack along dim 0
    y = torch.stack(shifted)
    return x.to(device), y.to(device)

# why are we taking 8 outputs for each batch?
# yb[b, t] is the character that follows xb[b, t], so every position of a window
# carries its own next-character target. BigramLM.forward scores all B*T positions,
# while MLP_M.forward only uses the last one (targets[:, -1]).
xb, yb = get_batch('train')
print(xb.shape, yb.shape)

torch.Size([64, 8]) torch.Size([64, 8])


---
### Review: Bigram

In [10]:
class BigramLM(nn.Module):
    """Bigram character model: next-token scores depend only on the current token.

    The model is a single ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the unnormalised scores (logits) of every possible token following
    token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, if targets are given, the loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape (B, T, C)
            with C = vocab_size and ``loss`` is None. With targets, ``logits``
            is returned flattened to (B*T, C) and ``loss`` is the mean
            cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx) # [batch_size, time/block_size, channels/vocab_size] -> [B, T, C]
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy wants (N, C) scores and (N,) class ids, so the batch
        # and time dims are flattened: one row per character position
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to every row of ``idx``.

        Args:
            idx: integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: number of tokens to sample per row.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the prompt followed by the
            sampled continuation.
        """
        for _ in range(max_new_tokens):
            # a bigram prediction depends only on the most recent token, so
            # only that column is embedded instead of the whole growing sequence
            logits = self(idx[:, -1:])[0]  # (B, 1, C)
            probs = F.softmax(logits[:, -1, :], dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Smoke-test the untrained bigram model on the batch sampled above.
m = BigramLM(vocab_size).to(device)
logits, loss = m(xb, yb)

# with targets supplied, forward() returns the logits flattened to (B*T, C)
print(logits.shape)
print(loss)

# 0 corresponds to new-line
# start from a single new-line token and sample 100 characters; the model is
# untrained at this point, so the output is expected to be noise
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(idx=idx, max_new_tokens=100)[0].tolist()))

torch.Size([512, 65])
tensor(4.5317, device='cuda:0', grad_fn=<NllLossBackward0>)

,Tc.'MPBXIvo;  c:.&PSprA'T!rWLlQLZfQs!q&-?fXtNt,Ai
a&?Kr QHL
Wo:p-ltOpasvYygj,qRTidgkclyamAWN:D&bgdw


In [11]:
# train Bigram model
# AdamW over all parameters of `m` with a fixed learning rate of 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [12]:
# estimate loss by taking an average loss over several batches
@torch.no_grad()
def estimate_loss(model, eval_iters=100):
    """Estimate the train and validation loss of ``model``.

    A single batch gives a noisy loss, so the loss is averaged over
    ``eval_iters`` freshly sampled batches per split. The model is evaluated in
    eval mode and without gradient tracking.

    Args:
        model: module whose call ``model(X, Y)`` returns ``(logits, loss)``.
        eval_iters: number of random batches to average over for each split.

    Returns:
        Dict ``{'train': float, 'val': float}`` with the mean loss per split.
    """
    # remember the caller's mode so a model that was deliberately put in eval
    # mode is not silently switched back to training mode
    was_training = model.training
    model.eval() # set model to eval phase
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean().item()
    finally:
        # restore the original mode even if a batch raised
        model.train(was_training)
    return out

In [13]:
max_steps = 10000
for steps in range(max_steps):
    # drop the gradients of the previous step, then do one forward/backward
    # pass and parameter update on a fresh training batch
    optimizer.zero_grad(set_to_none=True)
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

    # only report the averaged train/val loss on every 1000th step
    if steps % 1000 != 0:
        continue
    print(f"{steps}/{max_steps} : {estimate_loss(m)}")

0/10000 : {'train': 4.576886177062988, 'val': 4.583799839019775}
1000/10000 : {'train': 3.5614309310913086, 'val': 3.575105667114258}
2000/10000 : {'train': 3.000180244445801, 'val': 3.01057505607605}
3000/10000 : {'train': 2.7102270126342773, 'val': 2.731668710708618}
4000/10000 : {'train': 2.5857350826263428, 'val': 2.6017632484436035}
5000/10000 : {'train': 2.536057949066162, 'val': 2.5493409633636475}
6000/10000 : {'train': 2.497814178466797, 'val': 2.516511917114258}
7000/10000 : {'train': 2.4835495948791504, 'val': 2.4963839054107666}
8000/10000 : {'train': 2.4720680713653564, 'val': 2.4908885955810547}
9000/10000 : {'train': 2.4658913612365723, 'val': 2.491212844848633}


In [14]:
# sample 100 characters from the trained bigram model, starting from a single
# token with id 0
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(idx=idx, max_new_tokens=100)[0].tolist()))


TUMy ont my hares, h heveistyldiese le thengafthan im, inkendiks th be
QUCl he
roct gl.
KI' y meathe


---
### Longer Context Length Model
PyTorch implementation of what we had in MLP-3.

In addition, we will add a *positional embedding* layer alongside the *token embedding* layer. Hence, the model will also learn the positional relationship between different tokens, in addition to the semantic relationship learned by the token embedding layer. This layer will be crucial for self-attention in transformers.

In [15]:
class MLP_M(nn.Module):
    """MLP character model that predicts the next token from a fixed-size context.

    Token and position embeddings are summed, flattened across the context and
    passed through three Linear -> BatchNorm1d -> LeakyReLU stages followed by
    a final Linear layer producing one logit per vocabulary entry.

    Args:
        vocab_size: number of distinct tokens.
        n_embd: embedding width shared by the token and position tables.
        context_length: number of tokens the model reads per prediction.
            Defaults to the notebook-level ``block_size``.
    """

    def __init__(self, vocab_size, n_embd=48, context_length=None):
        super().__init__()
        if context_length is None:
            context_length = block_size
        # kept on the module so generate() does not depend on a global
        self.context_length = context_length

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_length, n_embd)
        self.flatten = nn.Flatten(1)
        self.layer1 = nn.Linear(context_length * n_embd, 100)
        self.bn1 = nn.BatchNorm1d(100)
        self.layer2 = nn.Linear(100, 200)
        self.bn2 = nn.BatchNorm1d(200)
        self.layer3 = nn.Linear(200, 300)
        self.bn3 = nn.BatchNorm1d(300)
        self.layer4 = nn.Linear(300, vocab_size)

        self.relu1 = nn.LeakyReLU()
        self.relu2 = nn.LeakyReLU()
        self.relu3 = nn.LeakyReLU()

    def forward(self, idx, targets=None):
        """Score the token that follows each context window.

        Args:
            idx: integer token ids of shape (B, T). T must equal the model's
                context length because the first linear layer has a fixed
                input width.
            targets: optional integer token ids of shape (B, T); only the last
                column is used as the label.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, vocab_size) and
            ``loss`` is the cross-entropy against ``targets[:, -1]``, or None
            when no targets are given.
        """
        tok_emb = self.token_embedding_table(idx)  # (B, T, n_embd)
        # build the position ids on the input's device, not a global one
        positions = torch.arange(idx.shape[1], device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, n_embd), broadcast over B
        x = self.flatten(tok_emb + pos_emb)  # (B, T * n_embd)
        x = self.relu1(self.bn1(self.layer1(x)))
        x = self.relu2(self.bn2(self.layer2(x)))
        x = self.relu3(self.bn3(self.layer3(x)))
        logits = self.layer4(x)
        if targets is None:
            return logits, None
        # one prediction per window: the token after the last context position
        loss = F.cross_entropy(logits, targets[:, -1])
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to every row of ``idx``.

        Sampling always runs in eval mode, so BatchNorm uses its running
        statistics (a batch of one works and the statistics are not updated).
        The mode the model was in is restored afterwards.

        Args:
            idx: integer token ids of shape (B, T) with T at least the model's
                context length; only the most recent context-length tokens are
                fed to the model at each step.
            max_new_tokens: number of tokens to sample per row.

        Returns:
            Tensor of shape (B, T + max_new_tokens).
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # slide the window over the most recent tokens; for a prompt of
                # exactly context_length tokens this is the same as idx[:, i:]
                window = idx[:, -self.context_length:]
                logits = self(window)[0]
                probs = F.softmax(logits, dim=1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx
        
# Smoke-test the untrained MLP on a fresh training batch: unlike the bigram
# model it emits one row of logits per window, not one per position.
xb, yb = get_batch('train')
m2 = MLP_M(vocab_size).to(device)
logits, loss = m2(xb, yb)
print(xb.shape)
print(logits.shape)

torch.Size([64, 8])
torch.Size([64, 65])


In [16]:
# train the model
# AdamW over all parameters of `m2` with a fixed learning rate of 1e-3
optimizer2 = torch.optim.AdamW(m2.parameters(), lr=1e-3)

In [17]:
max_steps = 50000
for steps in range(max_steps):
    # drop the gradients of the previous step, then do one forward/backward
    # pass and parameter update on a fresh training batch
    optimizer2.zero_grad(set_to_none=True)
    xb, yb = get_batch('train')
    logits, loss = m2(xb, yb)
    loss.backward()
    optimizer2.step()

    # only report the averaged train/val loss on every 10000th step
    if steps % 10000 != 0:
        continue
    print(f"{steps}/{max_steps} : {estimate_loss(m2)}")

0/50000 : {'train': 4.164349555969238, 'val': 4.165612697601318}
10000/50000 : {'train': 1.6873383522033691, 'val': 1.8061630725860596}
20000/50000 : {'train': 1.6028873920440674, 'val': 1.7605390548706055}
30000/50000 : {'train': 1.5525975227355957, 'val': 1.7518310546875}
40000/50000 : {'train': 1.5078599452972412, 'val': 1.7562731504440308}


In [18]:
# Sample 200 characters from the trained MLP. The prompt is a full context
# window (block_size tokens) filled with token id 1.
m2.eval(); # so BatchNorm layers will use the running mean and var
idx = torch.ones((1, block_size), dtype=torch.long, device=device)
print(decode(m2.generate(idx=idx, max_new_tokens=200)[0].tolist()))
m2.train(); # switch back to training mode once sampling is done

        hagge
If the noble run my man, thou us:
A quickate a king, we so for unworn times in Sorrow him, this will so was some to take all with have a lead
Which you are are such and out son and voice of requ
