In [71]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import math
from tqdm import tqdm

In [281]:
class CharDataset(Dataset):
    """Character-level next-token dataset built from a single string.

    Item ``idx`` is a pair ``(x, y)`` of ``block_size`` character indices,
    where ``y`` is ``x`` shifted one character to the right, so ``y[t]`` is
    the character that follows ``x[t]`` in the text.

    Attributes:
        stoi: mapping from character to integer index (sorted character order).
        itos: inverse mapping from integer index to character.
        block_size: number of characters in each input window.
        vocab_size: number of distinct characters in ``data``.
        data: the raw text the windows are cut from.
    """

    def __init__(self, data, block_size):
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Number of full windows; zero when the text is too short."""
        # Each item needs block_size + 1 characters. Clamp at zero because
        # len() raises ValueError on a negative result.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` index tensors for window ``idx``.

        Negative indices count from the end. Raises ``IndexError`` when
        ``idx`` is out of range, which also lets plain ``for`` iteration over
        the dataset terminate.
        """
        n_items = len(self)
        if idx < 0:
            idx = idx + n_items
        if not 0 <= idx < n_items:
            raise IndexError('CharDataset index out of range')

        # One extra character so that inputs and targets overlap by a shift.
        chunk = self.data[idx:idx + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]

        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)

        return x, y

In [282]:
# Read the whole corpus as one string of characters; the context manager
# closes the file handle as soon as the text is in memory.
with open('rainer-maria-rilke.txt', 'r') as corpus_file:
    text = corpus_file.read()

# Number of characters the model sees in one training window.
block_size = 128
train_dataset = CharDataset(text, block_size)

data has 28938 characters, 69 unique.


In [288]:
class SelfAttention(nn.Module):
    """Multi-head causal self-attention with an output projection.

    Args:
        n_embed: embedding width; must be divisible by ``n_head``.
        n_head: number of attention heads.
        block_size: maximum sequence length the causal mask covers.
    """

    def __init__(self, n_embed, n_head, block_size):
        assert n_embed % n_head == 0
        super().__init__()
        self.key = nn.Linear(n_embed, n_embed)
        self.query = nn.Linear(n_embed, n_embed)
        self.value = nn.Linear(n_embed, n_embed)

        self.attn_drop = nn.Dropout(0.1)
        self.resid_drop = nn.Dropout(0.1)

        self.proj = nn.Linear(n_embed, n_embed)

        # Lower-triangular mask: position t may only attend to positions <= t.
        # Registered as a buffer so it follows the module across devices
        # (.cuda()/.to()/.cpu()) instead of being pinned to the GPU at
        # construction time. persistent=False keeps it out of the state_dict,
        # so saved checkpoints have the same keys as before.
        causal_mask = torch.tril(torch.ones(block_size, block_size)) \
                           .view(1, 1, block_size, block_size)
        self.register_buffer("mask", causal_mask, persistent=False)

        self.n_head = n_head

        print(f"Block n_head: {n_head}")

    def forward(self, x, layer_past=None):
        """Apply masked multi-head attention to ``x`` of shape (B, T, C).

        ``layer_past`` is accepted for interface compatibility but is not
        used. Returns a tensor of the same (B, T, C) shape.
        """
        B, T, C = x.size()
        head_dim = C // self.n_head

        # Project, then split the channel dim into heads: (B, n_head, T, head_dim).
        k = self.key(x).view(B, T, self.n_head, head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, head_dim).transpose(1, 2)

        # Scaled dot-product scores: (B, n_head, T, T).
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Future positions get -inf so softmax assigns them zero weight.
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y
    
    
class Block(nn.Module):
    """One pre-norm transformer block: attention, then a 4x-wide MLP.

    Both sub-layers are applied to a layer-normalised input and added back
    onto the residual stream.
    """

    def __init__(self, n_embed, n_head, block_size):
        super().__init__()
        hidden_width = 4 * n_embed

        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)
        self.attn = SelfAttention(n_embed, n_head, block_size)

        feed_forward = [
            nn.Linear(n_embed, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_embed),
            nn.Dropout(0.1),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        after_attn = x + self.attn(self.ln1(x))
        after_mlp = after_attn + self.mlp(self.ln2(after_attn))
        return after_mlp


class picoGPT(nn.Module):
    """Small GPT-style language model.

    Token and learned position embeddings feed a stack of ``n_layers``
    transformer blocks, followed by a final LayerNorm and a bias-free linear
    head that produces logits over the vocabulary.
    """

    def __init__(self, n_embed, n_head, n_layers, block_size, vocab_size):
        super().__init__()

        # Input embeddings.
        self.tok_emb = nn.Embedding(vocab_size, n_embed)
        self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embed))
        self.drop = nn.Dropout(0.1)

        # Transformer trunk.
        layers = [Block(n_embed, n_head, block_size) for _ in range(n_layers)]
        self.blocks = nn.Sequential(*layers)

        # Output head.
        self.ln_f = nn.LayerNorm(n_embed)
        self.head = nn.Linear(n_embed, vocab_size, bias=False)

        self.block_size = block_size
        n_params = sum(p.numel() for p in self.parameters())
        print(f"number of parameters: {n_params}")

    def forward(self, x, targets=None):
        """Compute logits for ``x`` and, when ``targets`` is given, the loss.

        ``x`` is a 2-D tensor of token indices (batch, sequence). Returns
        ``(logits, loss)``; ``loss`` is ``None`` without ``targets``, otherwise
        the cross-entropy between the flattened logits and flattened targets.
        """
        _, seq_len = x.size()
        assert seq_len <= self.block_size, "Cannot forward, model block size is exhausted."

        # Only the first seq_len position embeddings are needed.
        hidden = self.tok_emb(x) + self.pos_emb[:, :seq_len, :]
        hidden = self.drop(hidden)
        hidden = self.blocks(hidden)
        hidden = self.ln_f(hidden)
        logits = self.head(hidden)

        if targets is None:
            return logits, None

        flat_logits = logits.view(-1, logits.size(-1))
        loss = F.cross_entropy(flat_logits, targets.view(-1))
        return logits, loss

In [289]:
# Model hyperparameters: embedding width, attention heads, transformer layers.
n_embed, n_head, n_layers = 32, 2, 2

# Build the model with the dataset's window length and vocabulary size,
# then move it to the GPU.
model = picoGPT(
    n_embed=n_embed,
    n_head=n_head,
    n_layers=n_layers,
    block_size=train_dataset.block_size,
    vocab_size=train_dataset.vocab_size,
)
model = model.cuda()

Block n_head: 2
Block n_head: 2
number of parameters: 33984


In [290]:
# AdamW over every model parameter; learning rate 6e-4, betas (0.9, 0.95).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=6e-4, betas=(0.9, 0.95))

In [291]:
def run_epoch(batch_size=1, num_workers=4):
    """Train the global ``model`` for one pass over ``train_dataset``.

    Uses the notebook-level ``model``, ``optimizer`` and ``train_dataset``.

    Args:
        batch_size: number of windows per optimisation step.
        num_workers: number of DataLoader worker processes.

    Returns:
        The mean training loss over all steps of the epoch, or ``nan`` when
        the loader produced no batches.
    """
    # Follow whatever device the model currently lives on.
    device = next(model.parameters()).device

    loader = DataLoader(train_dataset,
                        shuffle=True,
                        pin_memory=(device.type == "cuda"),
                        batch_size=batch_size,
                        num_workers=num_workers
                       )
    losses = []

    # The sampling cell switches the model to eval mode; make sure dropout is
    # active again whenever training is (re)run.
    model.train()

    for x, y in tqdm(loader, total=len(loader)):
        x = x.to(device)
        y = y.to(device)

        # Force gradient tracking even if the caller is inside a no_grad scope.
        with torch.set_grad_enabled(True):
            _, loss = model(x, y)
            loss = loss.mean()
            losses.append(loss.item())

        model.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return sum(losses) / len(losses) if losses else float("nan")

In [298]:
# Ten full passes over the training data.
num_epochs = 10
for epoch in range(num_epochs):
    run_epoch()

100%|███████████████████████████████████████████████████| 28810/28810 [03:42<00:00, 129.31it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:44<00:00, 128.50it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:43<00:00, 128.94it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:40<00:00, 130.58it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:23<00:00, 141.77it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:41<00:00, 129.87it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:43<00:00, 129.14it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:20<00:00, 143.64it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:40<00:00, 130.91it/s]
100%|███████████████████████████████████████████████████| 28810/28810 [03:40<00:00, 130.65it/s]


In [319]:
# Switch off dropout for generation.
model.eval()
steps = 110  # number of characters to generate after the prompt

prompt = "Then I am shaken"
# Encode the prompt as a (1, T) tensor on the same device as the model.
x = torch.tensor([train_dataset.stoi[s] for s in prompt], dtype=torch.long)[None,...]
x = x.to(next(model.parameters()).device)
do_sample = True  # True: sample from the distribution; False: take the most likely character

with torch.no_grad():
    for step in range(steps):
        # Crop the context to the last block_size tokens. The trailing colon
        # matters: without it a single column is selected instead of a window.
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond)

        # Only the prediction for the final position is needed.
        logits = logits[:, -1, :]

        probs = F.softmax(logits, dim=-1)

        if do_sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            ix = torch.argmax(probs, dim=-1, keepdim=True)

        # Append the chosen character and continue from the longer sequence.
        x = torch.cat((x, ix), dim=1)

In [320]:
# Decode the generated indices back to characters and print line by line.
generated = "".join(train_dataset.itos[int(i)] for i in x[0])
for result in generated.split('\n'):
    print(result)

Then I am shaken a soft
Whosu ast I as thopthere floats on.


A with caddes with ras fuore became in chese—
Ando the dark, tha
