In [31]:
import torch
import cs336_basics
from cs336_basics.model import BasicsTransformerLM
from cs336_basics.data import get_batch
from cs336_basics.optimizer import AdamW
from cs336_basics.nn_utils import cross_entropy

In [29]:
# --- Model architecture (passed to BasicsTransformerLM) ---
VOCAB_SIZE = 10_000
CONTEXT_LENGTH = 256
D_MODEL = 768
D_FF = 4 * D_MODEL  # 3072
NUM_LAYERS = 12
NUM_HEADS = 12
ROPE_THETA = 10_000

# --- Benchmark settings ---
BATCH_SIZE = 4    # sequences per batch requested from get_batch
WARMUP_STEPS = 5  # untimed iterations run before each measurement


In [21]:
# Instantiate the language model from the configuration constants.
model = BasicsTransformerLM(
    vocab_size=VOCAB_SIZE,
    context_length=CONTEXT_LENGTH,
    d_model=D_MODEL,
    num_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    d_ff=D_FF,
    rope_theta=ROPE_THETA,
)
# Move the model to the first CUDA device; the trailing semicolon keeps the
# notebook from echoing the call's return value.
model.to("cuda:0");

In [24]:
# Optimizer over every model parameter. No hyperparameters are passed here,
# so whatever defaults cs336_basics.optimizer.AdamW declares are used.
optimizer = AdamW(model.parameters())

In [None]:
import numpy as np
# Path to the .npy array on disk, relative to the notebook's working directory.
# NOTE(review): presumably a tokenized validation split — confirm.
np_file = "../data/ts_valid.npy"
# Load the array; it is the sampling source handed to get_batch in the
# benchmark cells below.
dataset = np.load(np_file)


In [35]:
import timeit

def train_step():
    """Run one full training iteration and wait for the GPU to finish.

    Fetches a batch with ``get_batch``, clears gradients, runs the forward
    pass, backpropagates the cross-entropy loss, applies an optimizer step,
    and finally synchronizes CUDA so wall-clock timing around this call
    covers all queued device work.
    """
    inputs, targets = get_batch(dataset, BATCH_SIZE, CONTEXT_LENGTH, "cuda")
    optimizer.zero_grad()
    logits = model(inputs)
    cross_entropy(logits, targets).backward()
    optimizer.step()
    # CUDA kernels launch asynchronously; block here so the caller's timer
    # does not stop before the work has actually completed.
    torch.cuda.synchronize()

TRAIN_STEPS = 10

# Untimed warm-up iterations before measuring.
for _warmup_idx in range(WARMUP_STEPS):
    train_step()

# Time TRAIN_STEPS consecutive calls; train_step synchronizes CUDA itself.
step_timer = timeit.Timer(train_step)
elapsed = step_timer.timeit(number=TRAIN_STEPS)
print(f"Time for {TRAIN_STEPS} training step: {elapsed:.6f} seconds")

Time for 10 training step: 1.573941 seconds


In [41]:
import timeit
from functools import partial

def train_step(do_backward=False):
    """Run one benchmark iteration and wait for the GPU to finish.

    Args:
        do_backward: When False (the default) only the forward pass is run.
            When True, gradients are cleared, the cross-entropy loss is
            computed and backpropagated, and the optimizer takes a step.

    In both cases CUDA is synchronized before returning so wall-clock timing
    around this call covers all queued device work.
    """
    inputs, targets = get_batch(dataset, BATCH_SIZE, CONTEXT_LENGTH, "cuda")
    logits = model(inputs)

    if not do_backward:
        # Forward-only: nothing more to queue, just wait for the device.
        torch.cuda.synchronize()
        return

    optimizer.zero_grad()
    cross_entropy(logits, targets).backward()
    optimizer.step()
    torch.cuda.synchronize()

TRAIN_STEPS = 10

# Bind the full forward + backward + optimizer configuration once so that the
# warm-up and the timed run execute exactly the same workload.
train_step_ = partial(train_step, do_backward=True)

# Warm up with the same callable that is timed. Warming up with the default
# forward-only train_step() would leave the backward pass and optimizer step
# cold, so their one-time costs would land inside the measurement.
for _ in range(WARMUP_STEPS):
    train_step_()

# train_step synchronizes CUDA before returning, so elapsed covers all queued
# device work for the TRAIN_STEPS iterations.
elapsed = timeit.timeit(train_step_, number=TRAIN_STEPS)
print(f"Time for {TRAIN_STEPS} training step: {elapsed:.6f} seconds")

Time for 10 training step: 1.569615 seconds
