nano gpt by Andrej Karparthy

# Steps
1. Encode all the text first and then create the train and test
2. batch Sample from each dataset, where targets are inputs shifted backwards by timestep = 1 
    - input: [1,2,3,4,5]
    - output: [2,3,4,5]
3. Create bigram model, where logits are the output from the embedding layer
4. Create optimizer, train and evaluate the batch
5. Generate outputs
    - Start with context of size = (1,1) for a certain max

In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('ng-video-lecture/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [29]:
batch_size = 32
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    xs = torch.stack([data[i: i + block_size] for i in ix])
    ys = torch.stack([data[i+1: i + block_size +1] for i in ix])
    return xs, ys

train_batch = get_batch('train')

In [23]:
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = get_batch(split)
            logits, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [46]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, vocab_size)
    def forward(self, x, target=None):
        B, T = x.shape
        logits = self.embedding(x) 
        # do not do this because we want to get the original logits during inference
#         logits = embedding_out.view(B * T, -1) # (B * T , C)
        
        if target is None:
            loss = None
        else:
            logits = logits.view(B * T, -1)
#             target = target.view(-1, 1) # (B*T, 1)
            target = target.view(-1)
            loss = F.cross_entropy(logits, target)
        return logits, loss
    
model = BigramLanguageModel(vocab_size)
# logit, loss = bgm(train_batch[0])

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for it in range(max_iters):
    if it % eval_interval == 0:
        losses = estimate_loss()
        print(f"Step {it}: train loss {losses['train']:4f}, val loss {losses['val']:4f}")
        
        xb, yb = get_batch('train')
        
        logits, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Step 0: train loss 4.609667, val loss 4.624401
Step 300: train loss 4.607289, val loss 4.617493
Step 600: train loss 4.592505, val loss 4.600222
Step 900: train loss 4.585400, val loss 4.595136
Step 1200: train loss 4.569260, val loss 4.582810
Step 1500: train loss 4.566514, val loss 4.564946
Step 1800: train loss 4.539031, val loss 4.546776
Step 2100: train loss 4.532931, val loss 4.552556
Step 2400: train loss 4.522796, val loss 4.531307
Step 2700: train loss 4.513802, val loss 4.521536


In [68]:
def generate(idx, max_new_tokens):
    for _ in range(max_new_tokens):
        logits, loss = model(idx)
        logit = logits[:,-1,:] # get last timestep, (B, C)
        probs = torch.softmax(logit, dim=-1)  # (B, C)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, idx_next], dim=1)
    return idx

idx = torch.zeros((1,1), dtype = torch.long)
sample = generate(idx, 500)
decode(sample[0].tolist())

'\n;v$ebjJmslGQWJWwEYWAZTuGIT-UQ!lqClGQ:OmM:zMxW,ZRlWMdxaGOdVMG!mlZUi qJvrrSBjELP-aTjc&YwEUzvY\nVccV-\n?LyFne3sUddVjVaZaaa&MKrbg-YwuLsanAj wkkp-U$,kd\nT-UyIsIxF LJj TySyQH:t KNqeLWWSE?G:t\neXW,pTt?DmMfRKe,SuDVpLKeDEvNKPHanPMfV.GkXHDcPzPqqqvu$WMdVUOVpIlEnTpddSuVv$WxHyp FH,kAM:pOVwKcsSgtHjrAZqStbkpVnvztvJlqdsuy!PUwusfMBcF.$WdA.KgtvZldSLndDrt!Xv$WW:qdWmWDOv$FxggtnSELooXub$:koBgAmyFrRHv$:O aZW pI-mstiYwD?aikaH,p-vzP-hlEnfYC!mHDdKPC\nBuHFqqmPMMxcgN!rSp.hPlWS!gmWSupBE-eIaBlErSuWleF!F\nRGDZCd.Z\n?oHbyfL!DLlEPzZh'