In [None]:
from transformers import GPT2LMHeadModel

In [4]:
# Load the pretrained 'gpt2' checkpoint through Hugging Face transformers.
model_hf = GPT2LMHeadModel.from_pretrained('gpt2')
# Keep its state dict (parameter name -> tensor) for inspection.
sd_hf = model_hf.state_dict()

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from model import GPT, GPTConfig

In [3]:
import sys

sys.path.append("/home/htkumar/llms/gpt2_karpathy")

In [None]:
# Use the first GPU when one is present; otherwise fall back to the CPU so
# this cell does not crash on a machine without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# 50304 = 128 * 393: not itself a power of two, but divisible by a large one,
# which is the GPU-friendly property the padded vocabulary size is chosen for.
model = GPT(GPTConfig(vocab_size=50304))
model.to(device)
# torch.compile returns a wrapper around the original module.
model = torch.compile(model)
model

In [None]:
a = torch.load("/home/htkumar/llms/gpt2_karpathy/logs/model_00000.pt")
a.keys()

In [None]:
a["model"].keys()

In [None]:
model.load_state_dict(a["model"])

In [None]:
model.eval()

In [None]:
# 2x3 integer tensor used to try out the slicing expressions below.
a = torch.tensor([[1, 2, 3], [4, 5, 6]])
a.shape

In [None]:
# Slice starting at row 0: selects every row, i.e. the whole tensor.
a[0:]

In [None]:
# Integer index: selects the first row and drops that dimension.
a[0]

In [None]:
# All rows, columns from index 1 onward.
a[:, 1:]

In [None]:
torch.cuda.is_available()

In [None]:
torch.cuda.is_bf16_supported()

In [None]:
# Lower-triangular 8x8 matrix of ones, viewed as a (1, 1, 8, 8) tensor.
torch.tril(torch.ones(8, 8)).view(1, 1, 8, 8)

In [None]:
# Values 0..11 arranged as a (2, 2, 3) tensor.
x = torch.arange(12).view(2, 2, 3)
x.shape

In [None]:
# chunk(3, dim=2): cut the last dimension (size 3) into 3 chunks, so each
# piece has shape (2, 2, 1).
q, k, v = x.chunk(3, dim=2)
q.shape, k.shape, v.shape

In [None]:
# split(1, dim=2): pieces of size 1 along the last dimension; for this input
# that yields the same three (2, 2, 1) shapes as the chunk call.
q, k, v = x.split(1, dim=2)
q.shape, k.shape, v.shape

In [None]:
# Two independent 10 -> 10 linear layers registered under string keys,
# created in key order (module_1 first, then module_2).
module_dict = nn.ModuleDict(
    {key: nn.Linear(10, 10) for key in ("module_1", "module_2")}
)

In [None]:
module_dict.keys()

In [None]:
# q comes from splitting the (2, 2, 3) tensor along its last dimension,
# so its shape is (2, 2, 1).
q.shape

In [None]:
# Reversed slice of the shape: the dimension sizes in reverse order.
q.shape[::-1]

In [None]:
# Values 0..5 as a 2x3 tensor.
a = torch.arange(6).view(2, 3)
a.shape

In [None]:
# Transpose of a 2-D tensor: the shape becomes (3, 2).
a.t().shape

In [None]:
import tiktoken

In [None]:
a = torch.arange(12).view(3, 4).double()
a.shape, a.dtype

In [None]:
# Row-wise softmax. The logits are rebuilt here (same values as the previous
# cell) instead of rebinding `a = F.softmax(a, ...)`, so re-running this cell
# can never apply softmax to already-normalised rows.
logits = torch.arange(12).view(3, 4).double()
a = F.softmax(logits, dim=1)
a

In [None]:
# Keep the 2 largest entries of each row of `a` together with their column indices.
topk_probs, topk_indices = torch.topk(a, 2, dim=1)

In [None]:
topk_probs

In [None]:
topk_indices

In [None]:
# Draw one sample per row, weighted by the top-k values. The result holds
# positions within the top-k list (0 or 1), not original column indices.
# No seed is set, so the draw differs between runs.
b = torch.multinomial(topk_probs, 1)
b

In [None]:
b.shape

In [None]:
# Translate each sampled top-k position back to the original column index.
xcol = torch.gather(topk_indices, -1, b)
xcol

In [None]:
# gather along dim=1: out[i][j] = t[i][index[i][j]]  -> [[1, 1], [4, 3]]
t = torch.tensor([[1, 2], [3, 4]])
torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]]))

In [None]:
# gather along dim=0: out[i][j] = t[index[i][j]][j]  -> [[1, 2], [3, 2]]
torch.gather(t, 0, torch.tensor([[0, 0], [1, 0]]))

In [None]:
# Zero-dimensional (scalar) tensor: its shape is empty.
a = torch.tensor(1)
a.shape

In [None]:
# Extract the single value as a plain Python number.
a.item()

In [None]:
# len() of a tensor is the size of its first dimension (3 here).
a = torch.tensor([1, 2, 3])
len(a)

In [None]:
import time

t0 = time.time()
t0

In [None]:
t1 = time.time()

In [None]:
(t1 - t0)

In [None]:
# `??` is IPython-only introspection syntax and is not valid Python;
# help() prints the documentation for time.time in any interpreter.
help(time.time)

In [None]:
import math

In [None]:
# Learning-rate schedule settings.
max_lr = 6e-4  # peak learning rate, reached at the end of warmup
min_lr = max_lr * 0.1  # floor the schedule decays to
warmup_steps = 10  # number of linear-warmup iterations
max_steps = 50  # iteration at which the cosine decay reaches min_lr


def get_lr(it):
    """Return the learning rate for iteration ``it``.

    The schedule has three phases, driven by the module-level settings
    ``max_lr``, ``min_lr``, ``warmup_steps`` and ``max_steps``:

    1. ``it < warmup_steps``: linear warmup, ``max_lr * (it + 1) / warmup_steps``.
    2. ``warmup_steps <= it < max_steps``: cosine decay from ``max_lr`` to ``min_lr``.
    3. ``it >= max_steps``: constant ``min_lr``.

    Args:
        it: zero-based iteration index.

    Returns:
        The learning rate as a float.
    """
    # linear warmup
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # At max_steps the cosine term is exactly 0, so returning min_lr here gives
    # the same value as the decay formula while avoiding a division by zero
    # when warmup_steps == max_steps.
    if it >= max_steps:
        return min_lr
    # cosine decay
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0
    return min_lr + coeff * (max_lr - min_lr)

In [None]:
iters = list(range(100))

In [None]:
lrs = [get_lr(it) for it in iters]

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.style.use("_mpl-gallery")

In [None]:
plt.plot(iters, lrs, linewidth=2.0)

In [None]:
import os

os.getcwd()

In [None]:
from train_gpt2 import GPT, GPTConfig

model = GPT(GPTConfig(vocab_size=50304))

In [None]:
# Build the optimizer through the model's own helper.
# NOTE(review): the `model` created in the previous cell is never moved to a
# GPU, yet device="cuda" is passed here. Confirm how configure_optimizers
# uses `device` (its implementation is not visible in this notebook).
optimizer = model.configure_optimizers(
    weight_decay=0.1, learning_rate=3e-4, device="cuda"
)

In [None]:
optimizer.param_groups[0].keys()

In [None]:
len(optimizer.param_groups[1]["params"])
# same as number of parameters in this group

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), eps=1e-9)

In [None]:
optimizer.param_groups[0]["lr"]

In [None]:
import inspect

In [None]:
inspect.signature(torch.optim.AdamW).parameters

In [None]:
# Map every parameter name to its tensor.
params_dict = dict(model.named_parameters())
# A cell only displays its last expression, so both checks are returned as a
# tuple: (does the token embedding require grad?, number of named parameters).
params_dict["transformer.wte.weight"].requires_grad, len(params_dict)

In [None]:
params_dict = {pn: p for pn, p in params_dict.items() if p.requires_grad}
len(params_dict)

In [None]:
# Partition the trainable tensors by rank: anything with two or more
# dimensions goes in the decay list, the remaining tensors in the
# non-decay list.
decay_params = [w for w in params_dict.values() if w.dim() >= 2]
nondecay_params = [w for w in params_dict.values() if w.dim() <= 1]
len(decay_params), len(nondecay_params)

In [None]:
num_decay_params = sum(p.numel() for p in decay_params)
num_nondecay_params = sum(p.numel() for p in nondecay_params)

In [None]:
print(num_decay_params, num_nondecay_params)

In [None]:
# FineWeb-Edu dataset processing

In [None]:
import multiprocessing as mp
import os

import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm

In [None]:
remote_name = "sample-10BT"  # these are 10B gpt2 tokens sampled from the whole dataset

In [None]:
# fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train")

In [None]:
# Largest value representable as an unsigned 16-bit integer (65535).
max_value = np.iinfo(np.uint16).max
max_value

In [None]:
# Smallest uint16 value (0).
np.iinfo(np.uint16).min

In [None]:
# 2**16 - 1 == 65535 is exactly the uint16 maximum, so this cast keeps the value.
a = np.array([2**16 - 1])
a.astype(np.uint16)

In [None]:
# `__file__` is not defined inside a notebook kernel, so using it directly
# raises NameError here. Use it when available (e.g. when this code is run as
# a script) and fall back to the current working directory otherwise.
base_dir = os.path.dirname(__file__) if "__file__" in globals() else os.getcwd()
os.path.join(base_dir, "test")

In [None]:
# Half of the available cores. os.cpu_count() can return None when the count
# is undetermined, and halving a single core gives 0, so treat None as 1 and
# clamp the result to at least one worker.
nprocs = max(1, (os.cpu_count() or 1) // 2)
nprocs

In [None]:
loss = torch.tensor(3.1)

In [None]:
loss.detach()