In [293]:
import torch
import numpy as np 
import time
from dataclasses import dataclass
from nanogpt import GPT

In [294]:
# Batch geometry used by the sampler defined below.
batch_size = 64  # number of sequences drawn per batch
block_size = 64  # number of consecutive ids per sequence

# Open both splits as read-only memory maps of uint16 ids (train first, then val).
train_data, val_data = (
    np.memmap(path, dtype=np.uint16, mode='r')
    for path in ('../data/shakespeare_char/train.bin',
                 '../data/shakespeare_char/val.bin')
)

# Dedicated, explicitly seeded RNG for batch sampling.
g = torch.Generator()
g.manual_seed(214748364)

def get_batch(type="train"):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        type: Which split to sample from, "train" or "val". The name shadows
            the builtin but is kept so existing keyword callers keep working.

    Returns:
        A tuple ``(x, y)`` of int64 tensors, each holding ``batch_size`` rows
        of ``block_size`` ids. ``y`` is the same window as ``x`` shifted one
        position forward in the underlying data.

    Raises:
        ValueError: If ``type`` is neither "train" nor "val".
    """
    # Reject unknown splits explicitly rather than failing further down with
    # an unbound local `data`.
    if type == "train":
        data = train_data
    elif type == "val":
        data = val_data
    else:
        raise ValueError(f"unknown split {type!r}; expected 'train' or 'val'")

    # The exclusive upper bound leaves room for block_size inputs plus the one
    # extra element read by the shifted targets. Plain ints are used for slicing.
    starts = torch.randint(len(data) - block_size, (batch_size,), generator=g).tolist()
    x = torch.stack([torch.from_numpy(data[i:i + block_size].astype(np.int64)) for i in starts])
    y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64)) for i in starts])
    return x, y

In [295]:
# Optimisation settings. Values after "alt:" are the alternatives the
# original cell kept as trailing comments.
lr = 0.1
beta1, beta2 = 0, 0  # alt: 0.9, 0.99
weight_decay = 0     # alt: 1e-1

@dataclass
class GPTConfig:
    """Configuration values passed to the ``GPT`` constructor.

    The field meanings noted below are inferred from the names only; the
    ``nanogpt`` module that consumes them is not visible here, so confirm
    against it before relying on them.
    """
    block_size: int = 64  # same value as the module-level block_size used for batching; presumably the max context length (confirm)
    vocab_size: int = 65  # presumably the number of distinct ids in the dataset (confirm)
    n_layer: int = 1  # presumably the number of transformer blocks; dropout_dict only names block h.0
    n_head: int = 4  # presumably attention heads per block (confirm)
    n_embd: int = 128  # presumably the embedding width (confirm)
    bias: bool = False  # presumably toggles bias terms; dropout_dict lists only `.weight` parameters
    
# Instantiate the model from the default configuration, then ask it for an
# optimizer built from the hyperparameters defined above.
gptconf = GPTConfig()
model = GPT(gptconf)
betas = (beta1, beta2)
optimizer = model.configure_optimizers(weight_decay, lr, betas)

number of parameters: 0.21M
num decayed parameter tensors: 6, with 213,120 parameters
num non-decayed parameter tensors: 3, with 384 parameters


In [296]:
# Per-parameter drop probabilities for the masked update in the training loop,
# which multiplies each gradient step by a Bernoulli(1 - value) mask.
# A value of 1.0 therefore makes the mask all zeros: that parameter is never updated.
p0 = 0.9
p1 = 1.0
dropout_dict = dict([
    ('transformer.wte.weight', p0),
    ('transformer.wpe.weight', p0),
    ('transformer.h.0.ln_1.weight', p1),
    ('transformer.h.0.attn.c_attn.weight', p0),
    ('transformer.h.0.attn.c_proj.weight', p0),
    ('transformer.h.0.ln_2.weight', p1),
    ('transformer.h.0.mlp.c_fc.weight', p0),
    ('transformer.h.0.mlp.c_proj.weight', p0),
    ('transformer.ln_f.weight', p1),
])

In [297]:
# Training: a fixed number of manual SGD steps on randomly sampled batches.
# Batches are drawn at random on every step, so this is not a single pass
# (epoch) over the dataset.
num_steps = 1000
for _ in range(num_steps):
    X, Y = get_batch()
    logits, loss = model(X, Y)

    loss.backward()
    # This hand-written update stands in for optimizer.step(): plain SGD in
    # which each parameter entry is updated only where its Bernoulli mask is 1.
    with torch.no_grad():
        for pn, p in model.named_parameters():
            # Skip frozen parameters and any that received no gradient this step.
            if not p.requires_grad or p.grad is None:
                continue
            keep_prob = 1 - dropout_dict[pn]
            mask = torch.bernoulli(torch.full_like(p, keep_prob))
            p -= lr * p.grad * mask
    # The optimizer object is used only to clear gradients between steps.
    optimizer.zero_grad(set_to_none=True)

In [298]:
# Report the loss on one freshly sampled validation batch, with autograd disabled.
with torch.no_grad():
    X, Y = get_batch("val")
    logits, loss = model(X, Y)
    print(f"{loss.item():.4f}")

2.4387
