In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [20]:
import random
import torch
import time
import makemore

In [3]:
def random_digit():
    """Return a single decimal digit character chosen at random."""
    digits = "0123456789"
    return random.choice(digits)

In [5]:
def palindrome():
    """Build a random 10-character palindrome made of decimal digits.

    Five digits are drawn one at a time with ``random_digit`` and are then
    followed by the same five digits in reverse order (for example
    ``'2580110852'``).
    """
    first_half = "".join(random_digit() for _ in range(5))
    # Appending the mirrored half makes the string read the same both ways.
    return first_half + first_half[::-1]
# Preview one generated sample. The generator in this notebook is named
# `palindrome`; calling the stale name `make_palindrome` raises NameError on a
# fresh kernel.
palindrome()

'2580110852'

In [9]:
# Build the train/test datasets from the `palindrome` generator function.
# NOTE(review): presumably `generate_datasets` calls the function repeatedly to
# produce examples -- confirm in makemore. The recorded output of this cell
# reports 10000 examples split into 9000 training and 1000 test examples.
train_dataset, test_dataset = makemore.generate_datasets(palindrome)

number of examples in the dataset: 10000
max word length: 10
number of unique characters in the vocabulary: 10
vocabulary:
0123456789
split up the dataset into 9000 training examples and 1000 test examples


In [18]:
# Model, optimizer and data-loader setup.

# Take the model dimensions from the dataset rather than hard-coding them.
vocab_size = train_dataset.get_vocab_size()
block_size = train_dataset.get_output_length()
print(f"dataset determined that: {vocab_size=}, {block_size=}")

# Use the GPU when one is available, otherwise fall back to the CPU so this
# cell does not crash on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

config = makemore.ModelConfig(vocab_size=vocab_size, block_size=block_size)
model = makemore.Transformer(config)
model.to(device)
print(f"model #params: {sum(p.numel() for p in model.parameters())}")

# Optimization hyperparameters.
learning_rate = 5e-4
weight_decay = 0.01
batch_size = 32
num_workers = 4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay,
                              betas=(0.9, 0.99), eps=1e-8)

# Pinned host memory only benefits host-to-GPU copies, so request it only
# when the model actually lives on a CUDA device.
batch_loader = makemore.InfiniteDataLoader(train_dataset, batch_size=batch_size,
                                           pin_memory=(device == "cuda"),
                                           num_workers=num_workers)


dataset determined that: vocab_size=11, block_size=11
number of parameters: 0.20M
model #params: 202176


In [23]:
# Train for a fixed number of steps, then report train/test loss and print
# samples from the model.
max_steps = 200

# Follow whichever device the model's parameters live on instead of
# hard-coding "cuda", so the loop also runs when the model is on the CPU.
model_device = next(model.parameters()).device

for step in range(max_steps):
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    t0 = time.perf_counter()

    # get the next batch, ship to device, and unpack it to input and target
    batch = batch_loader.next()
    batch = [t.to(model_device) for t in batch]
    X, Y = batch

    # feed into the model
    logits, loss = model(X, Y)

    # calculate the gradient, update the weights
    model.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # wait for all CUDA work on the GPU to finish (only meaningful on a CUDA
    # device) then calculate iteration time taken
    if model_device.type == "cuda":
        torch.cuda.synchronize()
    t1 = time.perf_counter()

    # logging
    if step % 10 == 0:
        print(f"step {step} | loss {loss.item():.4f} | step time {(t1-t0)*1000:.2f}ms")

# evaluate the model
# NOTE(review): presumably this caps evaluation at 10 batches of 100 examples
# per split -- confirm against makemore.evaluate.
train_loss = makemore.evaluate(model, train_dataset, batch_size=100, max_batches=10)
test_loss = makemore.evaluate(model, test_dataset, batch_size=100, max_batches=10)
# `step` still holds the last loop index here (max_steps - 1).
print(f"step {step} train loss: {train_loss} test loss: {test_loss}")

# sample from the model
makemore.print_samples(model, train_dataset, test_dataset, num=10)


step 0 | loss 1.0588 | step time 14.16ms
step 10 | loss 1.0470 | step time 3.29ms
step 20 | loss 1.0520 | step time 3.29ms
step 30 | loss 1.0631 | step time 3.97ms
step 40 | loss 1.0498 | step time 3.23ms
step 50 | loss 1.0523 | step time 3.43ms
step 60 | loss 1.0563 | step time 3.76ms
step 70 | loss 1.0473 | step time 3.22ms
step 80 | loss 1.0535 | step time 3.28ms
step 90 | loss 1.0493 | step time 3.78ms
step 100 | loss 1.0465 | step time 3.17ms
step 110 | loss 1.0542 | step time 3.27ms
step 120 | loss 1.0511 | step time 3.25ms
step 130 | loss 1.0517 | step time 3.75ms
step 140 | loss 1.0586 | step time 3.21ms
step 150 | loss 1.0511 | step time 3.20ms
step 160 | loss 1.0522 | step time 3.19ms
step 170 | loss 1.0575 | step time 3.33ms
step 180 | loss 1.0652 | step time 3.18ms
step 190 | loss 1.0551 | step time 3.17ms
step 199 train loss: 1.0521115064620972 test loss: 1.0521448850631714
--------------------------------------------------------------------------------
0 samples that are 