# Initial experiment: small vanilla transformer (NanoGPT) trained on plain addition

In [1]:
# Autoreload is switched off. Uncomment both lines while editing the
# `arithmetic_lm` package so changes are picked up without a kernel restart.
# %load_ext autoreload
# %autoreload 2

In [2]:
import torch
import lightning as L

from arithmetic_lm.tokenizer import CharTokenizer
from arithmetic_lm.utils import get_torch_device, set_seed
from arithmetic_lm.constants import DATA_DIR, ROOT_DIR, CHECKPOINTS_DIR
from arithmetic_lm.dataset import ArithmeticDataset, ArithmeticEvalDataset, LightningArithmeticDataModule
from arithmetic_lm.model.nanogpt import LightningNanoGPT

In [3]:
# Seed for this experiment, named so it is easy to find and change.
SEED = 1337

# Resolve the compute device once; the printed value records which backend
# produced the outputs saved in this notebook.
DEVICE = get_torch_device()
print(f"Using device: {DEVICE}")

# Seed through the project helper before any model or data split is built
# in the cells below.
set_seed(SEED)

Using device: mps


In [4]:
# Data / batching
SEQ_LEN: int = 256  # used as the datasets' seq_len and the model's context_len
BATCH_SIZE: int = 32  # handed to the Lightning data module

# Transformer size (forwarded to LightningNanoGPT)
N_LAYERS: int = 6  # n_layers
N_HEAD: int = 6  # n_head
N_EMBD: int = 384  # n_embd

## Sanity check: overfitting a single batch

In [5]:
# Single tokenizer instance shared by the sanity checks, both datasets and the
# model (which reads its vocab_size from it).
tokenizer = CharTokenizer()

In [6]:
# Round-trip a short string through the tokenizer and display the ids, their
# count and the decoded text side by side.
text = "hello world"
tokens = tokenizer.encode(text)
decoded = tokenizer.decode(tokens)
(tokens, "len:", len(tokens), decoded)

([17, 14, 21, 21, 24, 94, 32, 24, 27, 21, 13], 'len:', 11, 'hello world')

In [7]:
# convert to tensor
tokens = torch.tensor(tokens).unsqueeze(0).to(DEVICE)
tokens.shape

torch.Size([1, 11])

### Try overfitting on one batch

In [8]:
# Disabled: bare (non-Lightning) model for the single-batch overfitting check.
# NOTE(review): `NanoGPT` is not imported in the import cell (only
# `LightningNanoGPT` is); add that import before re-enabling this cell.
# net = NanoGPT(
#     context_len=SEQ_LEN,
#     n_embd=N_EMBD,
#     n_head=N_HEAD,
#     n_layers=N_LAYERS,
#     vocab_size=tokenizer.vocab_size,
# ).to(DEVICE)

In [9]:
# Disabled: manual training loop that overfits `net` on the single "hello world" batch.
# NOTE(review): needs `net` from the disabled cell above and `nn` (torch.nn),
# which the import cell does not bring in. The hard-coded 65 is used as the
# padding token id; confirm it matches the tokenizer before re-enabling.
# # simplest train loop
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

# # create target by shifting tokens by 1 and adding a padding token at the end
# target = torch.cat([tokens[0, 1:], torch.tensor([65]).to(DEVICE)]).unsqueeze(0)
# test_text = "hel"
# test_tokens = tokenizer.encode(test_text)
# test_tokens = torch.tensor(test_tokens).unsqueeze(0).to(DEVICE)

# losses = []

# for i in range(10000):
#     optimizer.zero_grad()
#     y = net(tokens)
#     # y shape: (batch_size, seq_len, vocab_size)
#     loss = criterion(y.view(-1, y.size(-1)), target.view(-1))
#     loss.backward()
#     optimizer.step()
#     losses.append(loss.item())

#     if i % 100 == 0:
#         print(f"[{i}] loss: {loss.item():.5f}  ", test_text + " -> " + tokenizer.decode(net.generate(test_tokens, max_new_tokens=10).squeeze().tolist()))

In [10]:
# Disabled: loss curve for the manual loop; depends on `losses`, which is only
# filled by the disabled training-loop cell.
# import matplotlib.pyplot as plt

# %matplotlib inline

# plt.plot(losses)
# plt.xlabel("iteration")
# plt.ylabel("loss")

In [11]:
# Disabled: completes the prompt "hello w" with `net` using top_k=1 sampling;
# depends on `net` from the disabled model cell.
# test_prompt = "hello w"
# tokens = tokenizer.encode(test_prompt)
# tokens = torch.tensor(tokens).unsqueeze(0).to(DEVICE)
# print(tokens.shape)

# net.eval()
# generated_tokens = net.generate(tokens, max_new_tokens=10, temperature=1.0, top_k=1)
# print(generated_tokens.shape)

# tokenizer.decode(generated_tokens.squeeze(0).cpu().tolist())

## Dataset

In [12]:
# Locations of the training file and the test file (despite the names, these
# two variables hold paths, not dataset objects).
addition_dir = DATA_DIR / "add_3digit_bal"
train_dataset = addition_dir / "add_3digit_10k_bal.txt"
test_dataset = addition_dir / "add_3digit_10k_test.txt"

# Training text (10k balanced dataset); the data module below splits it into
# train and validation parts.
train_val_ds = ArithmeticDataset(train_dataset, tokenizer=tokenizer, seq_len=SEQ_LEN)

# Evaluation examples from the test file.
test_ds = ArithmeticEvalDataset(
    test_dataset,
    tokenizer=tokenizer,
    seq_len=SEQ_LEN,
)

print("train + val:", len(train_val_ds), "sequences")
print("test:", len(test_ds), "examples")

# val_ratio=0.2 is presumably the share of `train_val_ds` held out for
# validation; confirm against LightningArithmeticDataModule.
ldm = LightningArithmeticDataModule(
    train_val_ds,
    test_ds,
    BATCH_SIZE,
    val_ratio=0.2,
)

# Drop the notebook-level name so later cells reach the data through `ldm` only.
del train_val_ds

train + val: 468 sequences
test: 10000 examples


In [13]:
# Fetch only the first test batch. `next(iter(...))` raises StopIteration on
# an empty loader, whereas the loop-and-break form silently left `x`/`y` unset
# (or stale from an earlier run).
batch = next(iter(ldm.test_dataloader()))

# The first element of the batch unpacks into a pair: `x` is used as the
# generation prompt below; `y` is presumably the expected answer (confirm
# against ArithmeticEvalDataset).
x, y = batch[0]

## Lightning module wrapper for model

In [14]:
# Architecture settings, taken from the constants cell and the tokenizer.
model_kwargs = dict(
    context_len=SEQ_LEN,
    n_embd=N_EMBD,
    n_head=N_HEAD,
    n_layers=N_LAYERS,
    vocab_size=tokenizer.vocab_size,
    dropout=0.2,
)

# Optimiser and learning-rate warmup settings.
optim_kwargs = dict(
    lr=0.001,
    betas=(0.9, 0.99),
    weight_decay=0.1,
    warmup_iters=100,
)

# Lightning wrapper around NanoGPT; receives exactly the keyword arguments above.
lmodel = LightningNanoGPT(**model_kwargs, **optim_kwargs)

In [15]:
# Smoke test on the still-untrained model: add a batch dimension to the
# prompt, generate 5 new tokens, and show them next to the prompt.
prompt = x.unsqueeze(0)
continuation = lmodel.generate(prompt, max_new_tokens=5)
(x, continuation.squeeze().tolist())

(tensor([96,  6,  5,  4, 72,  1,  1,  4, 80]), [80, 80, 80, 80, 80])

In [None]:
run_name = "nanogpt_add_3digit_10k_bal_with_lr_sched"

# Each run gets its own checkpoint folder under CHECKPOINTS_DIR.
run_dir = CHECKPOINTS_DIR / run_name
run_dir.mkdir(exist_ok=True, parents=True)

# Keep only the single checkpoint with the lowest monitored `val_loss`.
checkpoint_callback = L.pytorch.callbacks.ModelCheckpoint(
    monitor="val_loss",
    save_top_k=1,
    mode="min",
    dirpath=run_dir,
    filename="{step}-{train_loss:.4f}-{val_loss:.4f}",
)

# Weights & Biases logger for this run, with model logging enabled.
wandb_logger = L.pytorch.loggers.WandbLogger(
    project="msc-thesis-pilot",
    name=run_name,
    save_dir=ROOT_DIR,
    log_model=True,
)

# Short pilot run: 1000 optimiser steps, validation every 10 training
# batches, metrics logged every step, gradients clipped at 1.0.
trainer = L.Trainer(
    logger=wandb_logger,
    callbacks=[checkpoint_callback],
    max_steps=1000,
    val_check_interval=10,
    log_every_n_steps=1,
    gradient_clip_val=1.0,
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Training is disabled in this saved notebook; uncomment to fit `lmodel` on
# `ldm` with the trainer configured in the previous cell.
# trainer.fit(lmodel, ldm)

## Testing