## Loading Data

In [1]:
# Corpus: the tinyshakespeare text file. If input.txt is not present locally, fetch it first with:
# ! wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    # Read the entire corpus into memory as a single string.
    text = corpus_file.read()

## Training the GPT Model

In [2]:
import torch

# --- Reproducibility -------------------------------------------------------
# Seed torch's global RNG so that Restart & Run All yields the same run on the
# same hardware/software stack. NOTE(review): this only helps if the
# project-local `data` / `gpt` modules draw from torch's global generator --
# confirm.
seed = 1337
torch.manual_seed(seed)

# --- Data ------------------------------------------------------------------
split_size = 0.9   # forwarded to CharData; presumably the train fraction -- TODO confirm
batch_size = 32    # forwarded to CharData as 'batch_size'
block_size = 8     # context length; also the size of the positional embedding table

# --- Model -----------------------------------------------------------------
n_embd = 32        # embedding width
n_heads = 4        # attention heads per block

# --- Optimisation / evaluation ---------------------------------------------
max_iters = 5000       # number of optimiser steps in the training loop
eval_interval = 500    # report losses every this many steps
learning_rate = 1e-3   # AdamW learning rate
eval_iters = 200       # forwarded to estimate_loss; presumably batches averaged per estimate -- TODO confirm

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
from data import CharData

# Settings handed to the project-local CharData loader; the keys are the ones
# it is given here and the values come from the configuration cell.
loader_params = dict(
    split_size=split_size,
    batch_size=batch_size,
    block_size=block_size,
    device=device,
)

# Wrap the raw corpus; later cells use data.get_vocab_size(), data.get_batch()
# and data.decode().
data = CharData(text, params=loader_params)

data loader successfully initiated.


In [4]:
from gpt import GPTLanguageModel

# Architecture settings for the project-local GPTLanguageModel. The vocabulary
# size is taken from the data loader so model and data stay in agreement.
model_params = dict(
    vocab_size=data.get_vocab_size(),
    block_size=block_size,
    n_heads=n_heads,
    n_embd=n_embd,
    device=device,
)
model = GPTLanguageModel(params=model_params)

# Move the model to the target device; as the cell's last expression this also
# displays the module summary.
model.to(device)

GPTLanguageModel(
  (token_embedding_table): Embedding(65, 32)
  (pos_embedding_table): Embedding(8, 32)
  (transformer): Sequential(
    (0): Block(
      (self_attn_heads): MultiHeads(
        (heads): ModuleList(
          (0): Head(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
          )
          (1): Head(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
          )
          (2): Head(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
          )
          (3): Head(
            (key): Lin

In [5]:
# AdamW over every model parameter, using the learning rate from the config cell.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=learning_rate,
)

In [6]:
from utils import estimate_loss

# Training loop: `max_iters` optimiser steps, with periodic loss reports.
# The loop variable is `step` rather than `iter` so the builtin iter() is not
# shadowed for the rest of the kernel session.
for step in range(max_iters):

    # report the estimated train/val loss every `eval_interval` steps,
    # and once more on the final step
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss(model, data, eval_iters)
        print(f"iter {step} - train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # get a batch of training data
    X, Y = data.get_batch('train')

    # forward pass: the model returns (logits, loss) when targets are supplied
    logits, loss = model(X, Y)

    # clear the previous step's gradients (set to None rather than zeroed),
    # backpropagate, then apply the update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

iter 0 - train loss 4.3034, val loss 4.3240
iter 500 - train loss 2.4128, val loss 2.4146
iter 1000 - train loss 2.2653, val loss 2.2918
iter 1500 - train loss 2.1904, val loss 2.2136
iter 2000 - train loss 2.1484, val loss 2.1672
iter 2500 - train loss 2.0992, val loss 2.1493
iter 3000 - train loss 2.0712, val loss 2.1244
iter 3500 - train loss 2.0396, val loss 2.1029
iter 4000 - train loss 2.0136, val loss 2.0907
iter 4500 - train loss 2.0002, val loss 2.0790
iter 4999 - train loss 1.9933, val loss 2.0767


In [7]:
# Seed generation with a single token of id 0, shaped (1, 1), on the training device.
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Ask the model for 500 new tokens, then decode the first (only) sequence to text.
generated = model.generate(context, max_new_tokens=500)
sample_ids = generated[0].tolist()
print(data.decode(sample_ids))


SBHet toth a dear gannest of I'll beasinest tomford's
But'st's is To.

HENPIUvorvy vord them of blance, and ham nurry my for then this then't agect and if thouse?

QUEES:
Then
's asan afejur and the sake plot.

POVEON HENCH:
That,
Nother what bastender
Thous ais leff'k that Oward is nothat wouldnes's of my these eway;
And Lloy make to pengue. wo nown:
Well, the givakes stink, beter the are your am lutts tuurs.

HARLEO:
He gray stimbtfedso,
Yought shall'd him.

RIORD Prather's chark,
Whe? telteve
