> Load the CIFAR-10 dataset into PyTorch, using constants for BATCH_SIZE and the number of workers.

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms

# Loader settings shared by the train and test pipelines
BATCH_SIZE = 16
NUM_WORKERS = 2

# Preprocessing applied to every image: convert to a tensor, then normalize
# each of the three channels with mean 0.5 and std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Training split, stored under ./data and served in shuffled batches
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

# Test split, same preprocessing, served in a fixed (unshuffled) order
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

# Short display names for the ten classes
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# This cell defines: trainset, trainloader, testset, testloader, classes
print("CIFAR10 train:", len(trainset), "test:", len(testset))

In [None]:
import torch.nn as nn
import torch.nn.functional as F

# Define a simple CNN model
class Net(nn.Module):
    """Small convolutional classifier: two conv/pool stages, then three linear layers.

    The first linear layer takes ``16 * 5 * 5`` features per sample, which is
    what the two 5x5 convolutions and 2x2 max-poolings produce for a
    3-channel 32x32 image. The network returns 10 raw scores per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

        # Re-initialize only the convolution weights (Kaiming uniform, ReLU gain);
        # the linear layers keep their default initialization.
        nn.init.kaiming_uniform_(self.conv1.weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.conv2.weight, nonlinearity='relu')

    def forward(self, x):
        """Compute class scores for a batch of images.

        Args:
            x: image tensor of shape (N, 3, 32, 32); a single unbatched
                (3, 32, 32) image is also accepted and treated as a batch of one.

        Returns:
            Tensor of shape (N, 10) holding one raw score per class.

        Raises:
            RuntimeError: if the spatial size of ``x`` does not reduce to the
                16 * 5 * 5 features expected by ``fc1``.
        """
        # Promote a lone image to a batch of one so the flatten below always
        # sees an explicit batch axis.
        if x.dim() == 3:
            x = x.unsqueeze(0)

        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch axis. Unlike view(-1, 16 * 5 * 5),
        # this can never regroup elements across samples: an input of the wrong
        # spatial size fails in fc1 with a shape error instead of silently
        # yielding a different number of rows than there were images.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

> Create a training loop with Ignite, using the AdamW optimizer and a learning rate of 1e-4.

In [None]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.handlers import ModelCheckpoint
from torch.optim import AdamW
import time

# Learning rate requested for this training loop: 1e-4. The optimizer was
# previously built with 1e-3, contradicting that request.
LEARNING_RATE = 1e-4

# Instantiate the model, loss function, and optimizer
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Create the Ignite trainer and evaluator, both on the CPU. The evaluator
# reports accuracy and the loss computed with `criterion`.
trainer = create_supervised_trainer(model, optimizer, criterion, device='cpu')
evaluator = create_supervised_evaluator(
    model,
    metrics={'accuracy': Accuracy(), 'loss': Loss(criterion)},
    device='cpu',
)

# Attach event handlers to log training progress
@trainer.on(Events.ITERATION_COMPLETED(every=2000))
def log_training_loss(engine):
    """Every 2000 iterations, print the epoch, the iteration and the trainer's latest output as the loss."""
    progress_line = f"Epoch[{engine.state.epoch}] Iteration[{engine.state.iteration}] Loss: {engine.state.output:.2f}"
    print(progress_line)

@trainer.on(Events.EPOCH_STARTED)
def start_epoch_timer(engine):
    """Record on the engine state the wall-clock time at which the epoch started."""
    started_at = time.time()
    engine.state.epoch_start_time = started_at

@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    """Print training-set accuracy and loss, then the duration of the finished epoch.

    Args:
        engine: the trainer engine; its state supplies the epoch number and
            the ``epoch_start_time`` stamp set when the epoch started.
    """
    # Read the clock before evaluating. Taking it after evaluator.run() would
    # add a full evaluation pass over the training set to the reported epoch time.
    epoch_duration = time.time() - engine.state.epoch_start_time

    evaluator.run(trainloader)
    metrics = evaluator.state.metrics
    print(f"Training Results - Epoch: {engine.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}")
    # Log time elapsed
    print(f"Time elapsed for epoch {engine.state.epoch}: {epoch_duration:.1f} seconds")

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    """After each epoch, evaluate on the test loader and print accuracy and loss."""
    evaluator.run(testloader)
    metrics = evaluator.state.metrics
    summary_line = f"Validation Results - Epoch: {engine.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}"
    print(summary_line)

# Build the checkpoint handler once, outside the event callback. Its n_saved=2
# retention only covers files written by this same instance, so constructing a
# fresh handler on every epoch (as before) left every checkpoint on disk and
# made include_self=True save an empty history each time.
checkpoint_handler = ModelCheckpoint(
    dirname='./checkpoints',
    filename_prefix='cifar10',
    n_saved=2,
    create_dir=True,
    require_empty=False,
    atomic=True,
    include_self=True,
)


@trainer.on(Events.EPOCH_COMPLETED)
def save_checkpoint(engine):
    """Checkpoint the model, optimizer and trainer at the end of every epoch.

    Delegates to the single shared ``checkpoint_handler`` so that it can prune
    older files and keep only the most recent two under ./checkpoints.

    Args:
        engine: the trainer engine that fired the event.
    """
    checkpoint_handler(engine, {'model': model, 'optimizer': optimizer, 'trainer': trainer})

@trainer.on(Events.COMPLETED)
def save_final_model(engine):
    """When the run completes, write the model's state dict to 'cifar10.pth'."""
    final_weights = model.state_dict()
    torch.save(final_weights, 'cifar10.pth')
    
# Launch training for a fixed number of epochs. The call stays the cell's last
# expression so the notebook still displays the state it returns.
MAX_EPOCHS = 10
trainer.run(trainloader, max_epochs=MAX_EPOCHS)

Epoch[1] Iteration[2000] Loss: 1.56
Training Results - Epoch: 1  Avg accuracy: 0.53 Avg loss: 1.31
Time elapsed for epoch 1: 98.0 seconds
Validation Results - Epoch: 1  Avg accuracy: 0.52 Avg loss: 1.35
Epoch[2] Iteration[4000] Loss: 1.01
Epoch[2] Iteration[6000] Loss: 1.25
Training Results - Epoch: 2  Avg accuracy: 0.60 Avg loss: 1.12
Time elapsed for epoch 2: 93.1 seconds
Validation Results - Epoch: 2  Avg accuracy: 0.57 Avg loss: 1.21
Epoch[3] Iteration[8000] Loss: 1.45
Training Results - Epoch: 3  Avg accuracy: 0.61 Avg loss: 1.11
Time elapsed for epoch 3: 94.2 seconds
Validation Results - Epoch: 3  Avg accuracy: 0.57 Avg loss: 1.22
Epoch[4] Iteration[10000] Loss: 0.62
Epoch[4] Iteration[12000] Loss: 0.92
Training Results - Epoch: 4  Avg accuracy: 0.65 Avg loss: 0.99
Time elapsed for epoch 4: 96.5 seconds
Validation Results - Epoch: 4  Avg accuracy: 0.60 Avg loss: 1.14
Epoch[5] Iteration[14000] Loss: 1.00
Training Results - Epoch: 5  Avg accuracy: 0.67 Avg loss: 0.93
Time elapsed f

State:
	iteration: 31250
	epoch: 10
	epoch_length: 3125
	max_epochs: 10
	output: 0.7039793729782104
	batch: <class 'list'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>
	epoch_start_time: 1725036022.1734416