## Loading Data

In [1]:
# Dataset: Tiny Shakespeare. If input.txt is not present yet, fetch it with:
# ! wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Read the whole corpus into memory as one UTF-8 string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

## Training the GPT Model

In [2]:
import torch

# --- Reproducibility ---
# Seed PyTorch's global RNG so a fresh-kernel "Run All" repeats the same run.
# NOTE(review): assumes the project's data/model/sampling code draws its
# randomness from torch — confirm against data.py / gpt.py.
seed = 1337
torch.manual_seed(seed)

# --- Data ---
split_size = 0.9   # passed to CharData; presumably the train fraction — confirm
batch_size = 64    # passed to CharData as the batch size
block_size = 256   # context length; also the size of the position-embedding table

# --- Model ---
dropout = 0.2      # dropout probability used inside the model
n_layers = 6       # passed to the model as 'n_layers'
n_embd = 384       # embedding width
n_heads = 6        # passed to the model as 'n_heads' (each head projects to 384 / 6 = 64 features)

# --- Training ---
max_iters = 5000       # number of optimizer steps in the training loop
eval_interval = 500    # report train/val loss every this many steps
learning_rate = 3e-4   # AdamW learning rate
eval_iters = 200       # passed to estimate_loss; presumably batches averaged per estimate — confirm

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
from data import CharData

# Build the character-level data helper from the raw corpus. Later cells ask it
# for the vocabulary size, training batches, and decoding of generated ids.
data = CharData(
    text,
    params=dict(
        split_size=split_size,
        batch_size=batch_size,
        block_size=block_size,
        device=device,
    ),
)

data loader successfully initiated.


In [4]:
from gpt import GPTLanguageModel

# Assemble the model from the configuration values; the vocabulary size is
# taken from the data helper rather than hard-coded.
model = GPTLanguageModel(
    params=dict(
        vocab_size=data.get_vocab_size(),
        block_size=block_size,
        n_layers=n_layers,
        dropout=dropout,
        n_heads=n_heads,
        n_embd=n_embd,
        device=device,
    ),
)
# Move the model to the selected device. As the cell's last expression this
# also displays the module structure.
model.to(device)

GPTLanguageModel(
  (token_embedding_table): Embedding(65, 384)
  (pos_embedding_table): Embedding(256, 384)
  (transformer): Sequential(
    (0): Block(
      (self_attn_heads): MultiHeads(
        (heads): ModuleList(
          (0): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (1): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (2): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
 

In [5]:
# AdamW over every model parameter, using the configured learning rate.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=learning_rate,
)

In [None]:
from utils import estimate_loss

# Training loop: one optimizer update per step, with periodic loss reports.
# The loop variable is `step` rather than `iter` so the built-in iter() is not
# shadowed in the kernel namespace for later cells.
for step in range(max_iters):

    # report the estimated losses every eval_interval steps, and on the final step
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss(model, data, eval_iters)
        print(f"iter {step} - train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # get a batch of training data
    X, Y = data.get_batch('train')

    # forward pass: called with targets, the model returns (logits, loss)
    logits, loss = model(X, Y)

    # clear the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# Start generation from a single token with id 0 (one sequence of length one).
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# NOTE(review): nothing in this cell switches the model to eval mode, so
# dropout may still be active while sampling unless generate() handles it — confirm.
generated = model.generate(context, max_new_tokens=500)

# Take the first (only) sequence, convert it to a list of ids, and decode to text.
sample_ids = generated[0].tolist()
print(data.decode(sample_ids))