In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split
import lightning as L
import tqdm

torch.manual_seed(10)

<torch._C.Generator at 0x1e2e0856bf0>

In [7]:
# Define the model
class SmallResNet(nn.Module):
    def __init__(self, num_classes=10):
        super(SmallResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(16 * 32 * 32, num_classes)

    def forward(self, x):
        out = self.conv1(x)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.relu(out)
        out = out.flatten(1)
        out = self.fc(out)
        return out

In [9]:
def get_dataloaders():
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    dataset = torchvision.datasets.CIFAR10(
        root="./data", train=True, download=True, transform=transform
    )
    test_set = torchvision.datasets.CIFAR10(
        root="./data", train=False, download=True, transform=transform
    )

    val_size = 5000
    train_size = len(dataset) - val_size
    train_set, val_set = random_split(dataset, [train_size, val_size])

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=128, shuffle=True, num_workers=0
    )
    val_loader = torch.utils.data.DataLoader(
        val_set, batch_size=128, shuffle=False, num_workers=0
    )
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=128, shuffle=False, num_workers=0
    )
    
    return train_loader, val_loader, test_loader

Files already downloaded and verified
Files already downloaded and verified


In [11]:
fabric = L.Fabric(accelerator="auto", devices="auto", num_nodes=2, strategy="ddp_notebook")
fabric.launch()

# Initialize the model and optimizer
model = SmallResNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

model, optimizer = fabric.setup(model, optimizer)
train_loader, val_loader = fabric.setup_dataloaders(train_loader)

# Training loop
for epoch in range(10):
    running_loss = 0.0
    for i, data in tqdm(enumerate(train_loader)):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        fabric.backward(loss)
        optimizer.step()

        running_loss += loss.item()
        if i % 200 == 199:
            print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0

print("Finished training")

ValueError: You selected `Fabric(strategy='ddp_notebook')` but process forking is not supported on this platform. We recommed `Fabric(strategy='ddp_spawn')` instead.