In [1]:
import os
import math
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, Dataset
import lightning as L
import finalnlp
from finalnlp.gpt.gpt_model import GPT
from finalnlp.replacer import replace_linears_in_pytorch_model
from finalnlp import bitnet1
from finalnlp import bitnet158
from lightning.pytorch import loggers as pl_loggers
from finalnlp.gpt.nb.share import LitGPT, SortDataset, eval_split, length, num_digits, train_dataset, test_dataset, model_type, vocab_size, block_size, val_check_interval, max_steps
%load_ext autoreload

Seed set to 42
Seed set to 42


In [2]:
# print an example instance of the dataset
x, y = train_dataset[0]
for a, b in zip(x,y):
    print(int(a),int(b))

1 -1
2 -1
2 -1
2 -1
2 -1
3 -1
0 -1
3 0
0 1
1 2
2 2
2 2
2 2
2 3
3 3


## BitLinear-1B

In [3]:
# autoencoder = LitAutoEncoder(Encoder(), Decoder())
L.seed_everything(42, workers=True)
model_config = GPT.get_default_config()
model_config.model_type = model_type
model_config.vocab_size = vocab_size
model_config.block_size = block_size
model = LitGPT(model_config, train_dataset.length, linear_replacer=bitnet1.BitLinear1B)

wandb_logger = pl_loggers.WandbLogger("GPT-Sort-Problem-BitNet1B-len" + str(length) + str(model_type))
wandb_logger.experiment.config.update(model_config)
wandb_logger.experiment.config.update({"problem": "sort", "linear_replacer": "bitnet1b"})
wandb_logger.experiment.config.update({"length": length, "num_digits": num_digits})

trainer = L.Trainer(
    # callbacks=[EarlyStopping(monitor="train_loss", mode="min")],
    logger=wandb_logger,
    max_steps=max_steps,
    val_check_interval=val_check_interval,
)
wandb_logger.watch(model)
torch.set_float32_matmul_precision('medium')
trainer.fit(
    model=model,
    train_dataloaders=DataLoader(train_dataset, num_workers=15),
    val_dataloaders=DataLoader(test_dataset, num_workers=15),    
)

Seed set to 42


number of parameters: 0.09M


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcandrewlee14[0m ([33mandrews-org[0m). Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type | Params
-------------------------------
0 | model | GPT  | 86.0 K
-------------------------------
86.0 K    Trainable params
0         Non-trainable params
86.0 K    Total params
0.344     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=10000` reached.


In [4]:
trainer.validate(model=model, dataloaders=DataLoader(test_dataset, num_workers=15))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.5879999995231628
        val_loss            0.11658169329166412
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 0.11658169329166412, 'val_acc': 0.5879999995231628}]

In [5]:
# run a lot of examples from both train and test through the model and verify the output correctness
with torch.no_grad():
    train_score = eval_split(model, train_dataset, test_dataset, trainer, 'train', max_batches=50)
    test_score  = eval_split(model, train_dataset, test_dataset, trainer, 'test',  max_batches=50)

GPT claims that [3, 3, 1, 2, 3, 2, 0, 0] sorted is [0, 0, 1, 2, 3, 3, 3, 3] but gt is [0, 0, 1, 2, 2, 3, 3, 3]
GPT claims that [1, 1, 1, 2, 1, 3, 0, 2] sorted is [0, 1, 1, 1, 2, 2, 3, 3] but gt is [0, 1, 1, 1, 1, 2, 2, 3]
GPT claims that [2, 2, 1, 2, 1, 0, 2, 0] sorted is [0, 0, 1, 2, 2, 2, 2, 2] but gt is [0, 0, 1, 1, 2, 2, 2, 2]
train final score: 119/250 = 47.60% correct


TypeError: object of type 'Trainer' has no len()

In [6]:
# let's run a random given sequence through the model as well
n = train_dataset.length
inp = torch.tensor([[0, 0, 2, 1, 0, 1]], dtype=torch.long)
assert inp[0].nelement() == n
with torch.no_grad():
    cat = model.model.generate(inp, n, do_sample=False)
sol = torch.sort(inp[0])[0]
sol_candidate = cat[:, n:]
print('input sequence  :', inp.tolist())
print('predicted sorted:', sol_candidate.tolist())
print('gt sort         :', sol.tolist())
print('matches         :', bool((sol == sol_candidate).all()))

AssertionError: 