In [None]:
from transformers import GPT2LMHeadModel

In [None]:
import os

import matplotlib.pyplot as plt

import tiktoken

import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from model import GPT, GPTConfig, MoeArgs

%matplotlib inline

In [None]:
# Notebook-wide configuration: target device, experiment name and checkpoint step.
device = "cuda"
experiment_id = "base_gpt2"
# NOTE(review): absolute, user-specific path; adjust when running on another machine.
LOGS_DIR = f"/home/htkumar/llms/gpt2_karpathy/logs_{experiment_id}"
num_epochs = 1
# Step index embedded in the checkpoint filename that is loaded from LOGS_DIR.
# Presumably 19073 optimizer steps per epoch with zero-based step numbering, so this
# is the last step of the run -- TODO confirm against the training script.
step = 19073 * num_epochs - 1

In [None]:
# Rebuild the architecture, move it to `device`, then restore the weights stored in
# the checkpoint file for `step`.
model = GPT(GPTConfig(vocab_size=50304))
model.to(device)
# The module is compiled *before* the weights are loaded, so the state-dict keys in
# the checkpoint must match those of the compiled wrapper.
# NOTE(review): presumably the checkpoint was saved from a compiled model too -- confirm.
model = torch.compile(model)
# Only the "model" entry of the loaded checkpoint object is used here.
checkpoint_file = torch.load(os.path.join(LOGS_DIR, f"model_{step:05d}.pt"))
model.load_state_dict(checkpoint_file["model"])

In [None]:
# Seed PyTorch's global random number generators (CPU and CUDA) with a fixed value.
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [None]:
def generate_from_model(
    model,
    prompt="Hello, I'm a language model,",
    num_return_sequences=5,
    max_length=32,
    top_k=50,
    seed=42,
):
    """Print top-k sampled continuations of ``prompt`` produced by ``model``.

    Args:
        model: Module called as ``model(x)`` on a (B, T) tensor of token ids and
            returning a ``(logits, loss)`` pair, with logits of shape
            (B, T, vocab_size).
        prompt: Text encoded with the "gpt2" tiktoken encoding and used as the
            common prefix of every sample.
        num_return_sequences: Number of samples drawn in parallel (batch size B).
        max_length: Total length in tokens (prompt included) at which sampling stops.
        top_k: Number of most likely tokens kept at every sampling step.
        seed: Seed of the dedicated sampling generator.

    Returns:
        None. Each decoded sequence is printed as ``"<index> <text>"``.
    """
    enc = tiktoken.get_encoding("gpt2")
    tokens = torch.tensor(enc.encode(prompt), dtype=torch.long)  # (T0,)

    # Follow the model's own device instead of relying on a notebook global.
    model_device = next(model.parameters()).device
    x = tokens.unsqueeze(0).repeat(num_return_sequences, 1).to(model_device)  # (B, T0)

    # Dedicated generator: sampling neither depends on nor disturbs the global RNG.
    sample_rng = torch.Generator(device=model_device)
    sample_rng.manual_seed(seed)

    # Sample in eval mode, then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            while x.size(1) < max_length:
                with torch.autocast(device_type=model_device.type, dtype=torch.bfloat16):
                    logits, _ = model(x)  # (B, T, vocab_size)
                # Only the distribution for the next token is needed.
                logits = logits[:, -1, :]  # (B, vocab_size)
                probs = F.softmax(logits, dim=-1)

                # Keep the k most likely tokens; k is capped at the vocabulary size.
                k = min(top_k, probs.size(-1))
                topk_probs, topk_indices = torch.topk(probs, k, dim=-1)  # (B, k)
                # The generator must be passed explicitly, otherwise the seeded
                # `sample_rng` has no effect on the draw.
                new_id = torch.multinomial(
                    topk_probs, num_samples=1, generator=sample_rng
                )  # (B, 1), index into the top-k slice
                new_id = torch.gather(topk_indices, -1, new_id)  # (B, 1), token id
                x = torch.cat((x, new_id), dim=-1)  # (B, T + 1)
    finally:
        model.train(was_training)

    for i in range(num_return_sequences):
        decoded = enc.decode(x[i].tolist())
        print(f"{i} {decoded}")

In [None]:
def generate_from_hf_model(
    model,
    prompt="Hello, I'm a language model,",
    num_return_sequences=5,
    max_length=32,
    top_k=50,
    seed=42,
):
    """Print top-k sampled continuations of ``prompt`` from a Hugging Face style model.

    Args:
        model: Module called as ``model(x)`` on a (B, T) tensor of token ids whose
            output exposes the logits, of shape (B, T, vocab_size), at index 0.
        prompt: Text encoded with the "gpt2" tiktoken encoding and used as the
            common prefix of every sample.
        num_return_sequences: Number of samples drawn in parallel (batch size B).
        max_length: Total length in tokens (prompt included) at which sampling stops.
        top_k: Number of most likely tokens kept at every sampling step.
        seed: Seed of the dedicated sampling generator.

    Returns:
        None. Each decoded sequence is printed as ``"<index> <text>"``.
    """
    enc = tiktoken.get_encoding("gpt2")
    tokens = torch.tensor(enc.encode(prompt), dtype=torch.long)  # (T0,)

    # Follow the model's own device instead of relying on a notebook global.
    model_device = next(model.parameters()).device
    x = tokens.unsqueeze(0).repeat(num_return_sequences, 1).to(model_device)  # (B, T0)

    # Dedicated generator: sampling neither depends on nor disturbs the global RNG.
    sample_rng = torch.Generator(device=model_device)
    sample_rng.manual_seed(seed)

    # Sample in eval mode, then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            while x.size(1) < max_length:
                with torch.autocast(device_type=model_device.type, dtype=torch.bfloat16):
                    logits = model(x)[0]  # (B, T, vocab_size)
                # Only the distribution for the next token is needed.
                logits = logits[:, -1, :]  # (B, vocab_size)
                probs = F.softmax(logits, dim=-1)

                # Keep the k most likely tokens; k is capped at the vocabulary size.
                k = min(top_k, probs.size(-1))
                topk_probs, topk_indices = torch.topk(probs, k, dim=-1)  # (B, k)
                # The generator must be passed explicitly, otherwise the seeded
                # `sample_rng` has no effect on the draw.
                new_id = torch.multinomial(
                    topk_probs, num_samples=1, generator=sample_rng
                )  # (B, 1), index into the top-k slice
                new_id = torch.gather(topk_indices, -1, new_id)  # (B, 1), token id
                x = torch.cat((x, new_id), dim=-1)  # (B, T + 1)
    finally:
        model.train(was_training)

    for i in range(num_return_sequences):
        decoded = enc.decode(x[i].tolist())
        print(f"{i} {decoded}")

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login prompt for this notebook session.
# NOTE(review): the "gpt2" checkpoint loaded in this notebook is presumably public and
# may not need authentication -- confirm whether this cell is required.
notebook_login()

In [None]:
# Load the pretrained Hugging Face checkpoint named "gpt2" and keep its state dict
# for inspection; the last expression lists the parameter names.
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd_hf = model_hf.state_dict()
sd_hf.keys()

In [None]:
# Put the HF model in eval mode on `device`, reseed the global RNGs, and print samples.
model_hf.eval()
model_hf.to(device)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
generate_from_hf_model(model_hf)

In [None]:
# Print samples from the checkpointed model held in `model`.
generate_from_model(model)

In [None]:
# Print the storage address of the output head and of the token-embedding entry;
# identical addresses mean both state-dict entries share the same memory (tied weights).
print(sd_hf["lm_head.weight"].data_ptr())
print(sd_hf["transformer.wte.weight"].data_ptr())

In [None]:
# List every entry of the HF state dict together with its tensor shape.
for k, v in sd_hf.items():
    print(k, v.shape)

In [None]:
# Shape of the "transformer.wpe.weight" entry (the position-embedding table in HF GPT-2).
sd_hf["transformer.wpe.weight"].shape

In [None]:
# First 20 values of the flattened wpe tensor.
sd_hf["transformer.wpe.weight"].view(-1)[:20]

In [None]:
# Render the whole wpe matrix as a grayscale image.
plt.imshow(sd_hf["transformer.wpe.weight"], cmap="gray")

In [None]:
# Plot column 150 of the wpe matrix along its first axis.
plt.plot(sd_hf["transformer.wpe.weight"][:, 150])

In [None]:
# Plot column 200 of the wpe matrix along its first axis.
plt.plot(sd_hf["transformer.wpe.weight"][:, 200])

In [None]:
# Plot column 250 of the wpe matrix along its first axis.
plt.plot(sd_hf["transformer.wpe.weight"][:, 250])

In [None]:
# Plot column 0 of the wpe matrix along its first axis.
plt.plot(sd_hf["transformer.wpe.weight"][:, 0])

In [None]:
# Grayscale view of the top-left 300x300 corner of the c_attn weight of layer index 1.
plt.imshow(sd_hf["transformer.h.1.attn.c_attn.weight"][:300, :300], cmap="gray")

In [None]:
# from transformers import pipeline, set_seed

# generator = pipeline("text-generation", model="gpt2")
# set_seed(42)
# generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

In [None]:
# check impact of gradient accumulation
#
# Reference run: gradient of the MSE loss over the full batch of 4 samples, computed
# in a single backward pass.

# Seed *before* building the network so that the random weight initialisation (and
# therefore the printed gradients) is reproducible across kernel restarts.
torch.random.manual_seed(42)

net = nn.Sequential(
    nn.Linear(16, 32),
    nn.GELU(),
    nn.Linear(32, 1),
)

x = torch.randn(4, 16)
y = torch.randn(4, 1)
net.zero_grad()
yhat = net(x)

# mse_loss averages the squared error over the 4 samples by default.
loss = F.mse_loss(yhat, y)
loss.backward()
# First 10 gradient entries of the first Linear layer's weight.
net[0].weight.grad.view(-1)[:10]

In [None]:
# Same gradient obtained by accumulation: one backward pass per sample, each
# per-sample loss scaled by 1/4 so the accumulated gradient corresponds to the mean
# over the 4 samples.
net.zero_grad()
for i in range(4):
    yhat = net(x[i])
    loss = F.mse_loss(yhat, y[i])
    # Without this scaling the per-sample gradients would be summed, not averaged.
    loss /= 4
    loss.backward()

# First 10 gradient entries of the first Linear layer's weight, for comparison with
# the single-pass full-batch result.
net[0].weight.grad.view(-1)[:10]

In [None]:
# analyze the logfile generated after initial training

In [None]:
# calculate validation loss of gpt2 from hf to serve as baseline
from dataloader import DataLoaderLite

# Validation-split loader for a single process (rank 0 of 1).
# Presumably B is the batch size and T the sequence length -- confirm in dataloader.py.
# NOTE(review): T=2048 is larger than the 1024-position context of the pretrained HF
# "gpt2" checkpoint; confirm that every model fed from this loader accepts T tokens.
val_dataloader = DataLoaderLite(
    B=16, T=2048, process_rank=0, num_processes=1, split="val"
)

In [None]:
from tqdm import tqdm

In [None]:
# evaluate gpt-2 baseline model
#
# Mean cross-entropy of the pretrained HF model over `val_loss_steps` validation
# batches, used as the reference value for the locally trained model.
val_dataloader.reset()
model_hf.eval()
# The HF checkpoint only has position embeddings for `n_positions` tokens.
hf_context = model_hf.config.n_positions
with torch.no_grad():
    val_loss_steps = 100
    loss_total = 0.0
    for _ in tqdm(range(val_loss_steps), desc="Evaluating validation loss"):
        x, y = val_dataloader.next_batch()
        x, y = x.to(device), y.to(device)
        # Assumes x and y are (B, T) token-id tensors -- TODO confirm in dataloader.py.
        if x.size(1) > hf_context:
            # Longer rows would index past the position-embedding table. Fold each
            # row into context-sized chunks; x and y are reshaped identically so
            # every input position keeps its own target. reshape() raises if T is
            # not a multiple of the context length.
            x = x.reshape(-1, hf_context)
            y = y.reshape(-1, hf_context)
        logits = model_hf(x)[0]
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.reshape(-1))
        loss_total += loss.detach()

    loss_avg = loss_total / val_loss_steps
    print(loss_avg)

In [None]:
# Mean validation loss of the checkpointed model in `model` over `val_loss_steps`
# batches, computed under bfloat16 autocast.
val_dataloader.reset()
# Evaluate in eval mode so that any train-only behaviour of the model is disabled.
model.eval()
with torch.no_grad():
    val_loss_steps = 100
    loss_total = 0.0
    for _ in tqdm(range(val_loss_steps), desc="Evaluating validation loss"):
        x, y = val_dataloader.next_batch()
        x, y = x.to(device), y.to(device)
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            # The model returns the loss itself when targets are supplied.
            logits, loss = model(x, y)
        loss_total += loss.detach()

    loss_avg = loss_total / val_loss_steps
    print(loss_avg)

In [None]:
import numpy as np

# Model-size label used in the plot legend, and the reference loss for that size.
# NOTE(review): 3.2758 is presumably the HF GPT-2 validation loss measured in this
# notebook -- confirm before reusing the number.
sz = "124M"
loss_baseline = {
    "124M": 3.2758,
}[sz]

# Read the whole log file stored under LOGS_DIR, one record per line.
with open(f"{LOGS_DIR}/log.txt", "r") as f:
    lines = f.readlines()

# Number of log lines read.
len(lines)

In [None]:
# Parse the log into {stream name: {step: value}}.
# Each non-empty line must consist of three whitespace-separated fields:
# "<step> <stream> <value>".
streams = {}
for line in lines:
    fields = line.split()
    if not fields:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
    # `log_step` rather than `step`: the notebook-level `step` holds the integer
    # checkpoint index and must not be overwritten with a string from the log.
    log_step, stream, val = fields
    streams.setdefault(stream, {})[int(log_step)] = float(val)

In [None]:
# Number of distinct stream names found in the log.
len(streams.keys())

In [None]:
# Turn every stream into [steps, values]: two parallel tuples ordered by step.
streams_xy = {}
for k, v in streams.items():
    xy = sorted(v.items())  # (step, value) pairs in ascending step order
    streams_xy[k] = [*zip(*xy)]  # transpose the pairs into two tuples

In [None]:
# Create a 16x6 inch figure.
# NOTE(review): with the inline backend a figure created in its own cell is rendered
# and closed when that cell finishes, so this size likely does not carry over to
# plots drawn in later cells -- confirm.
plt.figure(figsize=(16, 6))

In [None]:
# Steps and values of the "train" stream; report the lowest training loss.
xs, ys = streams_xy["train"]
ys = np.array(ys)
print(f"min train loss {min(ys)}")

# Same for the "val" stream.
xs_val, ys_val = streams_xy["val"]
ys_val = np.array(ys_val)
print(f"min val loss {min(ys_val)}")

In [None]:
# Loss curves of the training run against the reference loss.
# The figure is created in the same cell as the plot calls: with the inline backend
# a figure opened in an earlier cell is already closed, so its size would not apply.
plt.figure(figsize=(16, 6))

plt.plot(xs, ys, label=f"nanogpt {sz} train loss")
plt.plot(xs_val, ys_val, label=f"nanogpt {sz} val loss")

# Horizontal reference line at the baseline loss, when one is defined.
if loss_baseline is not None:
    plt.axhline(
        y=loss_baseline, color="r", linestyle="--", label=f"OpenAI gpt-2 {sz} model"
    )

plt.xlabel("steps")
plt.ylabel("loss")
plt.yscale("log")
# Cap the y-axis at 4.0 so the large losses of the first steps do not flatten the curve.
plt.ylim(top=4.0)
plt.legend()
plt.title("Loss curve")

In [None]:
# Number of parameter tensors (not scalar elements) in the HF model.
len(list(model_hf.parameters()))

In [None]:
# Number of parameter tensors (not scalar elements) in the local model.
len(list(model.parameters()))

In [None]:
# Concrete class of the HF model object.
type(model_hf)

In [None]:
def get_num_parameters(model: nn.Module):
    """Count the scalar elements of all trainable parameters of ``model``.

    Parameters with ``requires_grad`` set to False are left out of the total.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
# Build a fresh, randomly initialised model for parameter counting.
# NOTE(review): this rebinds `model`, so the checkpoint weights loaded earlier in the
# notebook are no longer reachable through this name; cells that expect the trained
# model must not be re-run after this one.
model = GPT(GPTConfig(vocab_size=50304))
model.to(device)
model = torch.compile(model)

In [None]:
# Total number of trainable scalar parameters in `model`.
# 521M for MOE based model, 124M for non-moe based model
get_num_parameters(model)

In [None]:
# 786,432 parameters.
# NOTE(review): presumably 1024 additional positions times an embedding width of 768
# -- confirm against GPTConfig.
1024 * 768  # extra pos embeddings brought by increasing sequence length

#### Scratch pad below

In [None]:
# zip(*pairs) transposes a list of pairs: the rows of `a` become the columns of `b`.
a = [(1, 2), (2, 3), (3, 4)]
b = [*zip(*a)]
b

In [None]:
# zip walks two lists in lockstep; print each aligned pair on its own line.
a = [1, 2, 3, 4]
b = [2, 3, 4, 5]
for i, j in zip(a, b):
    print(f"{i} {j}")