# Requirements

In [None]:
! pip install catalyst==20.08.2

## Train a character-level GPT on some text data

The inputs here are simple text files, which we chop up to individual characters and then train GPT on. So you could say this is a char-transformer instead of a char-rnn. Doesn't quite roll off the tongue as well. In this example we will feed it some shakespear, which we'll get it to predict character-level.

In [1]:
minGPT_coef = 8
minGPT_coef = min(max(minGPT_coef, 1), 8)

In [2]:
# make deterministic
from catalyst import utils
utils.set_global_seed(42)

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F

In [4]:
import math
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):

    def __init__(self, data, block_size):
        chars = list(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
    
    def __len__(self):
        return math.ceil(len(self.data) / (self.block_size + 1))

    def __getitem__(self, idx):
        # we're actually going to "cheat" and pick a spot in the dataset at random
        i = np.random.randint(0, len(self.data) - (self.block_size + 1))
        chunk = self.data[i:i+self.block_size+1]
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [5]:
block_size = 128//minGPT_coef # spatial extent of the model for its context

In [6]:
# ! wget https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt

In [7]:
# you can download this file at https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt
text = open('input.txt', 'r').read() # don't worry we won't run out of file handles
train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters
train_loader = DataLoader(train_dataset, batch_size=512, num_workers=4)

data has 1115394 characters, 65 unique.


In [8]:
from mingpt.model import GPT
from mingpt.utils import prepare_optimizer

model = GPT(
    vocab_size=train_dataset.vocab_size, 
    block_size=block_size, 
    n_embd=512//minGPT_coef, 
    n_layer=8//minGPT_coef,
    n_head=8//minGPT_coef
)
optimizer = prepare_optimizer(
    model=model, 
    learning_rate=6e-4, 
    weight_decay=0.1, 
    betas=(0.9, 0.95)
)

In [9]:
import math
from catalyst.core import CallbackOrder, CallbackNode
from catalyst import dl


class CustomRunner(dl.Runner):
    
    def _handle_batch(self, batch):
        x, targets = batch
        logits = self.model(x)
        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)), targets.view(-1)
        )
        self.input = batch
        self.output = logits
        self.batch_metrics.update(**{"loss": loss})
        
        if self.is_train_loader:
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()


class CustomCallback(dl.Callback):

    def __init__(self, learning_rate, warmup_tokens, final_tokens, lr_decay=True):
        super().__init__(order=CallbackOrder.scheduler, node=CallbackNode.all)
        self.learning_rate = learning_rate
        self.tokens = 0
        self.final_tokens = final_tokens
        self.lr_decay = lr_decay
        self.warmup_tokens = warmup_tokens

    def on_batch_end(self, runner):
        if not runner.is_train_loader:
            return
        optimizer = runner.optimizer
        x, y = runner.input

        if self.lr_decay:
            self.tokens += (y >= 0).sum()  # number of tokens processed this step (i.e. label is not -100)
            if self.tokens < self.warmup_tokens:
                # linear warmup
                lr_mult = float(self.tokens) / float(max(1, self.warmup_tokens))
            else:
                # cosine learning rate decay
                progress = float(self.tokens - self.warmup_tokens) / float(
                    max(1, self.final_tokens - self.warmup_tokens))
                lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
            lr = self.learning_rate * lr_mult
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr


runner = CustomRunner()
callbacks = [
    CustomCallback(
        learning_rate=6e-4,
        warmup_tokens=512*20, 
        final_tokens=200*len(train_dataset)*block_size,
    )
]

In [10]:
runner.train(
    loaders={"train": train_loader}, 
    model=model,
    optimizer=optimizer, 
    callbacks=callbacks,
    num_epochs=10, 
    logdir="./logs",
    timeit=True,
    load_best_on_end=True,
)


Attention, there is only one dataloader - train



[2020-08-21 00:00:22,882] 
1/10 * Epoch 1 (train): _timer/_fps=5410.9082 | _timer/batch_time=0.0967 | _timer/data_time=0.0031 | _timer/model_time=0.0936 | loss=2.9761
[2020-08-21 00:00:35,796] 
2/10 * Epoch 2 (train): _timer/_fps=5153.4843 | _timer/batch_time=0.1004 | _timer/data_time=0.0019 | _timer/model_time=0.0985 | loss=2.4049
[2020-08-21 00:00:49,181] 
3/10 * Epoch 3 (train): _timer/_fps=4965.4075 | _timer/batch_time=0.1040 | _timer/data_time=0.0019 | _timer/model_time=0.1022 | loss=2.3017
[2020-08-21 00:01:02,332] 
4/10 * Epoch 4 (train): _timer/_fps=5060.7588 | _timer/batch_time=0.1021 | _timer/data_time=0.0022 | _timer/model_time=0.0999 | loss=2.2500
[2020-08-21 00:01:15,734] 
5/10 * Epoch 5 (train): _timer/_fps=4977.5494 | _timer/batch_time=0.1041 | _timer/data_time=0.0020 | _timer/model_time=0.1022 | loss=2.2175
[2020-08-21 00:01:29,931] 
6/10 * Epoch 6 (train): _timer/_fps=4760.8442 | _timer/batch_time=0.1103 | _timer/data_time=0.0021 | _timer/model_time=0.1082 | loss=2.193

In [11]:
# alright, let's sample some character-level shakespear
from mingpt.utils import sample

context = "O God, O God!"
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(runner.device)
y = sample(model, x, 2000, temperature=0.9, sample=True, top_k=5)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)

O God, O God! Thoughter to shead,
I shall, that ther he till sor have thee digne, a to me that to the tatie,
Those this to the sto the strought he my tin as seater, I shead
Whiste he she sir the hee and to be a bard to that stentistill ster, the his a so the thes he had
I have shall and ter and
I comme som shade selve son to by son, sore some he deart teated so the art,
I some his he to mus, sir, ting there heartin tis sees,
To shou he the havorsell be a that to hand his
Wath be heate then thim as to bright all her that be ther hat he self there thy and
Is sir this to stay, my mucester the the my lett of and aly, a me.

This he she a se the to be to be andes seak his and the my thise and to bungest and most astrath stere the herear bead, the shall hour he to stay besst son murs, I this the so teat to heeady til sintly ter sing as a thee dit a as a but, a and hat shere shere, I a his and tearth a the all she his there a to sit, sirs off your thy as ter hearss that to sice ond a statter'

In [None]:
# well that was fun