In [1]:
import torch
import cs336_basics
from cs336_basics.model import BasicsTransformerLM
from cs336_basics.data import get_batch
from cs336_basics.optimizer import AdamW
from cs336_basics.nn_utils import cross_entropy

In [None]:
# --- Model architecture (passed to BasicsTransformerLM) ---
VOCAB_SIZE = 10_000
CONTEXT_LENGTH = 256
D_MODEL = 768
D_FF = 4 * D_MODEL  # 3072
NUM_LAYERS = 12
NUM_HEADS = 12
ROPE_THETA = 10_000

# --- Benchmark settings ---
BATCH_SIZE = 4
WARMUP_STEPS = 5  # intended number of untimed warm-up iterations


In [None]:

# Build the transformer language model under test from the configuration
# constants and move it to the first CUDA device.
model = BasicsTransformerLM(
    vocab_size=VOCAB_SIZE,
    context_length=CONTEXT_LENGTH,
    d_model=D_MODEL,
    num_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    d_ff=D_FF,
    rope_theta=ROPE_THETA,
)
model.to("cuda:0")

# train_step() reads a module-level `optimizer`. Create it here, next to the
# model it optimizes, so the benchmark works after Restart Kernel -> Run All
# instead of depending on an optimizer left over from another cell.
# NOTE(review): assumes cs336_basics' AdamW accepts the parameter iterable as
# its first argument and has defaults for every other hyperparameter - confirm.
optimizer = AdamW(model.parameters())

In [10]:
import numpy as np
# Synthetic stand-in corpus: 1024 random token ids drawn from [0, VOCAB_SIZE).
# To benchmark on a saved token array instead, swap in:
#   np_file = "../data/ts_valid.npy"
#   dataset = np.load(np_file)
dataset = np.random.randint(low=0, high=VOCAB_SIZE, size=1024)


In [12]:
import timeit
from functools import partial

def train_step(do_backward=False):
    """Run one benchmark iteration on a freshly sampled batch.

    The forward pass always runs. When ``do_backward`` is true, gradients are
    cleared, the cross-entropy loss is computed and backpropagated, and the
    optimizer takes a step.

    Reads the module-level ``dataset``, ``model``, ``optimizer``,
    ``BATCH_SIZE`` and ``CONTEXT_LENGTH``.

    Args:
        do_backward: whether to include loss, backward pass and optimizer step.
    """
    inputs, targets = get_batch(dataset, BATCH_SIZE, CONTEXT_LENGTH, "cuda")
    logits = model(inputs)
    if do_backward:
        optimizer.zero_grad()
        cross_entropy(logits, targets).backward()
        optimizer.step()
    # Block until queued CUDA work has finished so callers time the real work.
    torch.cuda.synchronize()

def run_test(warmup_steps, train_steps, do_backward):
    """Time ``train_steps`` calls of ``train_step`` after an untimed warm-up.

    The warm-up runs the same configuration that is timed. If the backward
    pass is being measured, it is also exercised during warm-up, so one-time
    costs of the backward pass and optimizer step do not land in the timing.

    Args:
        warmup_steps: number of untimed iterations run first.
        train_steps: number of timed iterations.
        do_backward: forwarded to ``train_step`` for warm-up and timed runs.

    Prints the total elapsed wall-clock time for the timed iterations.
    """
    step = partial(train_step, do_backward=do_backward)

    for _ in range(warmup_steps):
        step()

    elapsed = timeit.timeit(step, number=train_steps)
    print(f"Time for {train_steps} training step: {elapsed:.6f} seconds")

In [14]:
# Benchmark forward + backward + optimizer step; the warm-up count comes from
# the configuration constant rather than a repeated literal.
run_test(warmup_steps=WARMUP_STEPS, train_steps=10, do_backward=True)

Time for 10 training step: 1.697123 seconds


In [76]:
import torch
from torch import nn

class ToyModel(nn.Module):
    """Small bias-free MLP with a LayerNorm between its two linear layers.

    Computes ``fc2(LayerNorm(ReLU(fc1(x))))``.

    Args:
        in_features: size of the last dimension of the input.
        out_features: size of the last dimension of the output.
        hidden_features: width of the hidden representation. Defaults to 10,
            the value that was previously hard-coded.
    """

    def __init__(self, in_features, out_features, hidden_features=10):
        super().__init__()
        # Registration order (fc1, ln, fc2, relu) is kept so that
        # named_parameters() / state_dict() ordering is unchanged.
        self.fc1 = nn.Linear(in_features, hidden_features, bias=False)
        self.ln = nn.LayerNorm(hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features, bias=False)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply linear -> ReLU -> LayerNorm -> linear to ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(self.ln(hidden))

# Toy mixed-precision experiment: inspect activation, parameter and gradient
# dtypes when the forward pass runs under torch.autocast.
INPUT_SIZE = 3
OUTPUT_SIZE = 4
BATCH_SIZE = 4

# NOTE: this rebinds `model`, `optimizer` and `BATCH_SIZE`, which train_step()
# reads as globals. Re-run the transformer setup cells before benchmarking again.
model = ToyModel(INPUT_SIZE, OUTPUT_SIZE).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

x = torch.randn(BATCH_SIZE, INPUT_SIZE).cuda()
y = torch.randn(BATCH_SIZE, OUTPUT_SIZE).cuda()

DTYPE = torch.float16
with torch.autocast(device_type="cuda", dtype=DTYPE):
    x_after_fc1 = model.fc1(x)
    print(x_after_fc1.dtype)
    # Full forward pass and loss, so the cells below have gradients to inspect.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)  # nn.MSELoss takes (input, target)

# Backward pass and optimizer step run outside the autocast region.
optimizer.zero_grad()
loss.backward()
optimizer.step()


torch.float16


In [70]:
# Report each parameter's dtype and its gradient's dtype.
# `param.grad` is None until a backward pass has populated it, so guard the
# attribute access instead of raising AttributeError.
for name, param in model.named_parameters():
    grad_dtype = None if param.grad is None else param.grad.dtype
    print(name, param.dtype, grad_dtype)

fc1.weight torch.float32 torch.float32
ln.weight torch.float32 torch.float32
ln.bias torch.float32 torch.float32
fc2.weight torch.float32 torch.float32


In [None]:
# Report each parameter's dtype and its gradient's dtype.
# `param.grad` is None until a backward pass has populated it, so guard the
# attribute access instead of raising AttributeError.
for name, param in model.named_parameters():
    grad_dtype = None if param.grad is None else param.grad.dtype
    print(name, param.dtype, grad_dtype)

fc1.weight torch.float32 torch.float32
ln.weight torch.float32 torch.float32
ln.bias torch.float32 torch.float32
fc2.weight torch.float32 torch.float32
