In [None]:
import csv
import time
import torch
import random
import numpy as np
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import os
import shlex
import subprocess
from torchvision import datasets
from torch.profiler import profile, ProfilerActivity
from torch.utils.data import DataLoader, Subset
from torchvision.models.vision_transformer import vit_b_16
from torchvision.models.vision_transformer import VisionTransformer

In [2]:
# Record the software stack this run used: the PyTorch build, the CUDA
# toolkit it was compiled against, the cuDNN version (None when cuDNN is
# unavailable), and whether a CUDA device is usable.
print(
    torch.__version__,
    torch.version.cuda,
    torch.backends.cudnn.version(),
    torch.cuda.is_available(),
    sep="\n",
)

2.9.0a0+gitcbe1a35
12.4
None
True


In [3]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# --- Training hyperparameters -------------------------------------------
BATCH_SIZE = 64          # batch size for the train/val/test DataLoaders
NUM_EPOCHS = 200         # upper bound on epochs; also the cosine schedule length
LEARNING_RATE = 3e-4     # initial learning rate shared by both optimizers
NUM_WORKERS = 4          # DataLoader worker processes

# --- Model / input geometry ---------------------------------------------
IMAGE_SIZE = 64          # side length images are resized/cropped to
PATCH_SIZE = 8           # ViT patch side length
HEAD_SIZE = 100          # output features of the classification head

# --- Dataset sizes -------------------------------------------------------
# The training and validation counts are the two lengths handed to
# random_split, so together they must equal the size of the training set.
NUM_TRAINING_IMAGES = 40000
NUM_VAL_IMAGES = 10000
NUM_TESTING_IMAGES = 10000

# --- Experiment switches -------------------------------------------------
VIT_MODEL = "CUSTOM"     # alternative: "BUILT-IN"
OPTIMIZER = "ADAMW"      # alternative: "MUON"

# --- Output locations ----------------------------------------------------
BENCHMARKING_DIRECTORY = "./results/convergence/original/large/"
MODEL_DIRECTORY = "./models/original/large/"
RESULT_FILENAME = "1"    # base name (without extension) for this run's files

cuda


In [None]:
# Seed torch before building the model. This cell sits above the consolidated
# seeding cell, so without this the freshly initialised weights would differ
# from run to run. The seeding cell re-seeds afterwards, so later cells see the
# same RNG state as before.
MODEL_INIT_SEED = 42
torch.manual_seed(MODEL_INIT_SEED)

if VIT_MODEL == "CUSTOM":
    # Alternative configuration (8 layers), kept for reference:
    # model = VisionTransformer(
    #     image_size=IMAGE_SIZE,
    #     patch_size=PATCH_SIZE,
    #     num_layers=8,
    #     hidden_dim=384,
    #     num_heads=6,
    #     mlp_dim=1536,
    #     num_classes=100,
    # )

    # Active configuration: 12 layers, 6 heads, hidden width 384.
    model = VisionTransformer(
        image_size=IMAGE_SIZE,
        patch_size=PATCH_SIZE,
        num_layers=12,
        num_heads=6,
        hidden_dim=384,
        mlp_dim=1536,
        dropout=0.1,
        attention_dropout=0.1,
        num_classes=HEAD_SIZE,  # same value as the head installed later
    )

    # Alternative configuration (24 layers, 16 heads), kept for reference:
    # model = VisionTransformer(
    #     image_size=IMAGE_SIZE,
    #     patch_size=PATCH_SIZE,
    #     num_layers=24,
    #     num_heads=16,
    #     hidden_dim=1024,
    #     mlp_dim=4096,
    #     dropout=0.1,
    #     attention_dropout=0.1,
    #     num_classes=100
    # )
elif VIT_MODEL == "BUILT-IN":
    # NOTE(review): vit_b_16 is not given IMAGE_SIZE here, while the data
    # pipeline resizes images to IMAGE_SIZE -- confirm the pretrained model
    # accepts that input size before switching to this branch.
    # model = vit_b_16(pretrained=False)
    model = vit_b_16(weights="IMAGENET1K_V1")
else:
    # Fail loudly on a typo instead of silently loading the pretrained model.
    raise ValueError(
        f"Unknown VIT_MODEL {VIT_MODEL!r}; expected 'CUSTOM' or 'BUILT-IN'"
    )

In [5]:
# Consolidated seeding: one seed value drives Python's `random`, NumPy's
# legacy global RNG and torch's global RNG.
seed = 42
for reseed in (random.seed, np.random.seed, torch.manual_seed):
    reseed(seed)

# Dedicated generators so the train/val split and the training shuffle each
# draw from their own stream rather than from torch's global RNG.
split_generator = torch.Generator()
split_generator.manual_seed(seed)
trainloader_generator = torch.Generator()
trainloader_generator.manual_seed(seed)

def seed_worker(worker_id):
    """Re-seed NumPy, `random` and torch from the current torch seed.

    Passed as `worker_init_fn` to the training DataLoader. The seed is
    torch's current initial seed reduced modulo 2**32.

    Args:
        worker_id: Index supplied by the DataLoader; not used here.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed, torch.manual_seed):
        reseed(derived_seed)

In [6]:
# Per-channel mean / std handed to Normalize (presumably CIFAR-100
# training-set statistics -- verify against the source of these numbers).
cifar100_mean = [0.5071, 0.4867, 0.4408]
cifar100_std  = [0.2675, 0.2565, 0.2761]


def _to_normalized_tensor():
    """Return fresh ToTensor + Normalize steps shared by every pipeline."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=cifar100_mean, std=cifar100_std),
    ]


# Training: random crop (80-100% of the image area) resized to IMAGE_SIZE,
# plus a random horizontal flip.
train_transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(IMAGE_SIZE, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(),
    ]
    + _to_normalized_tensor()
)

# Validation and test: plain resize, no randomness.
val_transform = transforms.Compose(
    [transforms.Resize(IMAGE_SIZE)] + _to_normalized_tensor()
)

test_transform = transforms.Compose(
    [transforms.Resize(IMAGE_SIZE)] + _to_normalized_tensor()
)

In [7]:
# Fetch CIFAR-100 (downloading if necessary). This untransformed copy is only
# used for its length when building the split below.
trainset = datasets.CIFAR100(root='./data', train=True, download=True, transform=None)

# Split the identity list [0, 1, ..., len(trainset) - 1]. Element i of that
# list is i, so each resulting Subset's `.indices` are dataset indices.
train_indices, val_indices = torch.utils.data.random_split(
    list(range(len(trainset))),
    [NUM_TRAINING_IMAGES, NUM_VAL_IMAGES],
    generator=split_generator,
)

# Two views of the same training files, differing only in their transform,
# so the validation images are not augmented.
train_source = datasets.CIFAR100(root="./data", train=True, transform=train_transform)
train_subset = Subset(train_source, train_indices.indices)

val_source = datasets.CIFAR100(root="./data", train=True, transform=val_transform)
val_subset = Subset(val_source, val_indices.indices)

# Settings shared by the two evaluation loaders (fixed order).
eval_loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

# The training loader shuffles with its own generator and re-seeds each
# worker through seed_worker.
trainloader = DataLoader(
    train_subset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    generator=trainloader_generator,
    worker_init_fn=seed_worker,
)
valloader = DataLoader(val_subset, **eval_loader_kwargs)

# Held-out test split, limited to the first NUM_TESTING_IMAGES samples.
testset = datasets.CIFAR100(root='./data', train=False, download=True, transform=test_transform)
test_subset = Subset(testset, range(NUM_TESTING_IMAGES))
testloader = DataLoader(test_subset, **eval_loader_kwargs)

# Sanity check: pull one training batch and show its shape and a few labels.
dataiter = iter(trainloader)
images, labels = next(dataiter)
print(images.shape)
print(labels[:5])

torch.Size([64, 3, 64, 64])
tensor([33, 25, 59, 34, 25])


In [8]:
# Install a fresh linear classification head with HEAD_SIZE outputs, reusing
# the input width of the head the model was built with, then move the whole
# model to the target device.
head_in_features = model.heads.head.in_features
model.heads.head = nn.Linear(head_in_features, HEAD_SIZE)
model = model.to(device)

criterion = nn.CrossEntropyLoss()

# Single-optimizer variant from the initial code. Can be uncommented if needed.
# match OPTIMIZER:
#   case "MUON":
#     optimizer = optim.Muon(model.parameters(), lr=LEARNING_RATE)
#   case _:
#     optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [9]:
# Partition the trainable parameters by rank: 2-D tensors go to Muon,
# everything else goes to AdamW.
# NOTE(review): the ndim == 2 rule also sends the classification head's weight
# matrix to Muon -- confirm that is intended.
trainable_params = [p for p in model.parameters() if p.requires_grad]
params_2d = [p for p in trainable_params if p.ndim == 2]
params_other = [p for p in trainable_params if p.ndim != 2]

# Same starting learning rate for both; weight decay only on the AdamW group.
optimizer_muon = optim.Muon(params_2d, lr=LEARNING_RATE, weight_decay=0.00)
optimizer_adamw = optim.AdamW(params_other, lr=LEARNING_RATE, weight_decay=0.05)

In [None]:
# pid = os.getpid()
# log_path = os.path.join(BENCHMARKING_DIRECTORY, f"{RESULT_FILENAME}.log")
# quoted_log_path = shlex.quote(log_path)

# print(f"PID: {pid}")
# print(f"Logging GPU utilization to: {quoted_log_path}")

In [None]:
# !nohup nvidia-smi dmon -s um -d 1 > {quoted_log_path} 2>&1 &

In [None]:
# csv_file = open(BENCHMARKING_DIRECTORY + RESULT_FILENAME + ".csv", "w", newline="")
# writer = csv.writer(csv_file)
# writer.writerow(["Epoch", "Kernel", "CUDA time (us)", "CPU time (us)", "Memory Usage (bytes)", "Launch Count"])

# benchmarking_csv = open(BENCHMARKING_DIRECTORY + RESULT_FILENAME + ".csv", "w", newline="")
# benchmarking_writer = csv.writer(benchmarking_csv)
# benchmarking_writer.writerow(["Epoch", "Step Avg Time", "Overall Step Time", "# Calls", "Epoch Time", "Training Loss", "Validation Loss", "Validation Accuracy"])

# One cosine schedule per optimizer, both annealed over the full epoch budget.
scheduler_muon = optim.lr_scheduler.CosineAnnealingLR(optimizer_muon, T_max=NUM_EPOCHS)
scheduler_adamw = optim.lr_scheduler.CosineAnnealingLR(optimizer_adamw, T_max=NUM_EPOCHS)

# Create the checkpoint directory up front: torch.save does not create missing
# directories, and discovering that after a full epoch wastes the epoch.
# The path is built by plain concatenation so it matches the evaluation cell.
checkpoint_path = MODEL_DIRECTORY + RESULT_FILENAME + ".pt"
os.makedirs(os.path.dirname(checkpoint_path) or ".", exist_ok=True)

# CUDA events are only usable on a GPU; `device` may have fallen back to CPU,
# in which case the Muon step is timed with a wall clock instead.
use_cuda_timing = device.type == "cuda"

# Early stopping on validation accuracy: stop after `patience` consecutive
# epochs without a new best.
best_val_acc = 0
patience = 5
trigger_times = 0

start_time = time.time()
for epoch in range(NUM_EPOCHS):
    epoch_start_time = time.time()
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")

    # Accumulated Muon step time for this epoch, in milliseconds.
    muon_step_total_time = 0.0
    muon_step_calls = 0

    # ---- training pass --------------------------------------------------
    model.train()
    running_loss = 0
    for images, labels in trainloader:
        images, labels = images.to(device), labels.to(device)

        optimizer_muon.zero_grad()
        optimizer_adamw.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()

        # # NOTE: UPDATED THIS
        # with profile(
        #     activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        #     record_shapes=False,
        #     profile_memory=True,
        #     with_stack=False
        # ) as prof:
        if use_cuda_timing:
            # Bracket only the Muon step with CUDA events; synchronize before
            # reading the elapsed time so both events have completed.
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            start_event.record()
            optimizer_muon.step()
            end_event.record()
            torch.cuda.synchronize()
            muon_step_total_time += start_event.elapsed_time(end_event)
        else:
            # CPU fallback: wall-clock timing, converted to milliseconds to
            # match the unit reported by the CUDA event path.
            t0 = time.perf_counter()
            optimizer_muon.step()
            muon_step_total_time += (time.perf_counter() - t0) * 1000.0
        muon_step_calls += 1

        optimizer_adamw.step()

        # for item in prof.key_averages():
        #   writer.writerow([
        #       epoch + 1,
        #       item.key,
        #       item.self_device_time_total,
        #       item.self_cpu_time_total,
        #       item.self_device_memory_usage,
        #       item.count
        #     ])
        # csv_file.flush()

        running_loss += loss.item()

    # Epoch time covers the training pass only (validation is excluded).
    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_start_time
    avg_train_loss = running_loss / len(trainloader)
    avg_muon_step = muon_step_total_time / muon_step_calls
    print(f"[Epoch {epoch+1}] Muon optimizer step avg time: {avg_muon_step:.4f} ms "
          f"(calls: {muon_step_calls} | Muon step time: {muon_step_total_time:.3f} ms)")

    # ---- validation pass ------------------------------------------------
    model.eval()
    val_loss = 0
    total_val = 0
    correct_val = 0
    with torch.no_grad():
        for images, labels in valloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct_val += (preds == labels).sum().item()
            total_val += labels.size(0)

    avg_val_loss = val_loss / len(valloader)
    val_acc = 100 * correct_val / total_val

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f} | Epoch Time: {epoch_time:.6f}")

    # benchmarking_writer.writerow([
    #     epoch + 1,
    #     avg_muon_step,
    #     muon_step_total_time,
    #     muon_step_calls,
    #     epoch_time,
    #     avg_train_loss,
    #     avg_val_loss,
    #     val_acc])
    # benchmarking_csv.flush()

    scheduler_muon.step()
    scheduler_adamw.step()

    # Checkpoint on every new best validation accuracy; otherwise count the
    # epoch towards early stopping.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        trigger_times = 0
        torch.save(model.state_dict(), checkpoint_path)
    else:
        print(f"No improvement for {trigger_times + 1} epoch(s)")
        trigger_times += 1

    if trigger_times >= patience:
        print(f"Early stopping at epoch {epoch+1}. Best Val Accuracy: {best_val_acc:.4f}")
        break

end_time = time.time()
runtime = end_time - start_time
print(f"Start Time: {start_time}")
print(f"End Time: {end_time}")
print(f"Runtime: {runtime:.6f} seconds")

# benchmarking_writer.writerow([
#         "Total time:",
#         f"{runtime:.6f}",
#         None,
#         None,
#         None,
#         None,
#         None,
#         None])
# benchmarking_csv.flush()

# csv_file.close()

Epoch 1/200
[Epoch 1] Muon optimizer step avg time: 73.9566 ms (calls: 625 | Muon step time: 46222.862
Epoch [1/200] Train Loss: 3.8128 | Val Loss: 3.3938 | Val Acc: 19.1200 | Epoch Time: 365.206164
Epoch 2/200
[Epoch 2] Muon optimizer step avg time: 73.7513 ms (calls: 625 | Muon step time: 46094.561
Epoch [2/200] Train Loss: 3.2115 | Val Loss: 2.9908 | Val Acc: 26.1700 | Epoch Time: 365.540711
Epoch 3/200
[Epoch 3] Muon optimizer step avg time: 73.6945 ms (calls: 625 | Muon step time: 46059.079
Epoch [3/200] Train Loss: 2.8449 | Val Loss: 2.7251 | Val Acc: 31.3800 | Epoch Time: 365.406705
Epoch 4/200
[Epoch 4] Muon optimizer step avg time: 73.6864 ms (calls: 625 | Muon step time: 46054.029
Epoch [4/200] Train Loss: 2.5787 | Val Loss: 2.5506 | Val Acc: 34.5800 | Epoch Time: 365.900679
Epoch 5/200
[Epoch 5] Muon optimizer step avg time: 73.6861 ms (calls: 625 | Muon step time: 46053.824


In [None]:
# !pkill -f "nvidia-smi dmon"

In [None]:
# Restore the checkpoint written during training and score it on the test
# loader.
checkpoint_file = MODEL_DIRECTORY + RESULT_FILENAME + ".pt"
best_state = torch.load(checkpoint_file, map_location=device)
model.load_state_dict(best_state)
model.eval()

num_correct = 0
num_seen = 0
with torch.no_grad():
    for images, labels in testloader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest logit along dim 1.
        preds = outputs.max(dim=1).indices
        num_correct += (preds == labels).sum().item()
        num_seen += labels.size(0)

acc = 100 * num_correct / num_seen
print(f"Test Accuracy: {acc:.2f}%")

In [None]:
# benchmarking_writer.writerow([
#         "Total Accuracy:",
#         f"{acc:.2f}",
#         None,
#         None,
#         None,
#         None,
#         None,
#         None])
# benchmarking_csv.flush()

# benchmarking_csv.close()