<a href="https://colab.research.google.com/github/jamesbaskerville/colabs/blob/main/pytorchLightning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.3.3-py3-none-any.whl (808 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m808.5/808.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.5-py3-none-any.whl (26 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.3.3-py3-none-any.whl (812 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.3/812.3 kB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch<4.0,>=2.0.0->lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-

In [4]:
import lightning as L
import torch

# Report the library versions in use and whether a CUDA device is visible.
print(f"Lightning version: {L.__version__}")
print(f"Torch version: {torch.__version__}")
print(f"CUDA is available: {torch.cuda.is_available()}")

Lightning version: 2.3.3
Torch version: 2.3.0+cu121
CUDA is available: True


In [5]:
import os

import lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers.tensorboard import TensorBoardLogger
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [6]:
# Fix the global random seed; the call is left as the last expression so the
# cell still echoes the seed value it returns.
L.seed_everything(seed=1121218)

INFO: Seed set to 1121218
INFO:lightning.fabric.utilities.seed:Seed set to 1121218


1121218

In [7]:
# Training hyperparameters used by the data loaders, the optimizer and the loop.
learning_rate = 1e-3  # step size handed to the optimizer
batch_size = 64       # samples per mini-batch
num_epochs = 10       # full passes over the training set

In [8]:
from torchvision import datasets, transforms

# Per-channel statistics passed to Normalize, i.e. (x - 0.5) / 0.5 on every channel.
_channel_mean = (0.5, 0.5, 0.5)
_channel_std = (0.5, 0.5, 0.5)

# Training pipeline: random 32x32 crop (after 4px padding) and random horizontal
# flip for augmentation, followed by tensor conversion and normalization.
_train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_channel_mean, _channel_std),
]
transform_train = transforms.Compose(_train_steps)

# Evaluation pipeline: no augmentation, only tensor conversion and normalization.
_test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(_channel_mean, _channel_std),
]
transform_test = transforms.Compose(_test_steps)

# CIFAR-10 splits, downloaded into ./data when not already present.
_cifar_common = {"root": "./data", "download": True}
train_dataset = datasets.CIFAR10(train=True, transform=transform_train, **_cifar_common)
val_dataset = datasets.CIFAR10(train=False, transform=transform_test, **_cifar_common)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:10<00:00, 16309790.95it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [10]:
# Never request more loader workers than the runtime has CPUs: a hard-coded 8
# oversubscribes small machines (e.g. 2-core hosted runtimes) and triggers
# DataLoader worker-count warnings. `os.cpu_count()` may return None, hence `or 1`.
num_workers = min(8, os.cpu_count() or 1)

# Shuffled batches for training.
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
)
# Fixed-order batches over `val_dataset` for evaluation.
test_loader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
)

In [11]:
# Training a CIFAR-10 classifier with Classic PyTorch
class CIFAR10CNN(nn.Module):
    """Small convolutional classifier producing 10 class scores per image.

    Three 3x3 convolutions (padding 1, so height and width are preserved) are
    each followed by ReLU and a 2x2 max pool that halves height and width. For
    32x32 inputs this yields a 64 x 4 x 4 feature map, which is flattened and
    passed through two linear layers (1024 -> 512 -> 10).
    """

    def __init__(self):
        super().__init__()
        # 3 conv layers, max pooling, 2 linears
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        """Compute unnormalized class scores (logits).

        Args:
            x: image tensor with 3 channels; 32x32 images give the 64 x 4 x 4
                feature map the linear head expects.

        Returns:
            Tensor with one row of 10 logits per input image.

        Raises:
            ValueError: if the pooled feature map is not 64 x 4 x 4.
        """
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))

        # Guard the reshape below: with a larger feature map, view(-1, 1024)
        # would still succeed by folding spatial positions into the batch
        # dimension, silently returning more rows than there were images.
        feature_shape = tuple(x.shape[-3:])
        if feature_shape != (64, 4, 4):
            raise ValueError(
                f"unexpected feature map shape {feature_shape} after pooling; "
                "expected (64, 4, 4), which 32x32 input images produce"
            )

        x = x.view(-1, 64 * 4 * 4)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [17]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [20]:
from torch.utils.tensorboard import SummaryWriter

# Build the network and move its parameters to the selected device.
model = CIFAR10CNN()
model = model.to(device)

# Cross-entropy objective optimized with Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Plateau scheduler in 'min' mode: it is stepped with a value that should
# decrease, and scales the learning rate by `factor` according to `patience`.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5,
)

# TensorBoard event files for this run are written under this directory.
writer = SummaryWriter(log_dir='runs/cifar10_cnn_experiment')

In [21]:
def _evaluate_classifier(model, loader, criterion, device):
    """Evaluate ``model`` on every batch of ``loader`` without tracking gradients.

    Puts the model in eval mode; callers that continue training must switch it
    back with ``model.train()``.

    Args:
        model: classifier returning one row of class scores per sample.
        loader: iterable of ``(images, labels)`` batches.
        criterion: loss returning the mean over the batch (as the
            ``nn.CrossEntropyLoss()`` used in this notebook does).
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy_percent)`` where the loss is averaged over
        samples and accuracy is the percentage of arg-max predictions that
        match the labels.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            n_samples = labels.size(0)
            # Weight the batch-mean loss by batch size so a short final batch
            # does not count as much as a full one.
            loss_sum += criterion(outputs, labels).item() * n_samples
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += n_samples
    return loss_sum / total, 100 * correct / total


# Training loop
total_step = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    train_loss_sum = 0.0
    train_samples = 0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a per-sample sum so the epoch average is exact even when
        # the last batch is smaller than batch_size.
        train_loss_sum += loss.item() * labels.size(0)
        train_samples += labels.size(0)

        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}')

    # Average training loss for the epoch
    avg_train_loss = train_loss_sum / train_samples
    writer.add_scalar('training loss', avg_train_loss, epoch)

    # Validation
    avg_val_loss, accuracy = _evaluate_classifier(model, test_loader, criterion, device)
    print(f'Validation Accuracy: {accuracy:.2f}%')
    writer.add_scalar('validation loss', avg_val_loss, epoch)
    writer.add_scalar('validation accuracy', accuracy, epoch)

    # Learning rate scheduling driven by the validation loss
    scheduler.step(avg_val_loss)

# Final test
# NOTE(review): this re-evaluates on the same `test_loader` used for validation
# and LR scheduling above, so it is not an independent held-out estimate.
final_loss, final_accuracy = _evaluate_classifier(model, test_loader, criterion, device)
print(f'Test Accuracy: {final_accuracy:.2f}%')

writer.close()

# Save the model
torch.save(model.state_dict(), 'cifar10_cnn.pth')

  self.pid = os.fork()


Epoch [1/10], Step [100/782], Loss: 2.0440
Epoch [1/10], Step [200/782], Loss: 1.5101
Epoch [1/10], Step [300/782], Loss: 1.5574
Epoch [1/10], Step [400/782], Loss: 1.5169
Epoch [1/10], Step [500/782], Loss: 1.3750
Epoch [1/10], Step [600/782], Loss: 1.3035
Epoch [1/10], Step [700/782], Loss: 1.3609
Validation Accuracy: 51.35%
Epoch [2/10], Step [100/782], Loss: 0.9979
Epoch [2/10], Step [200/782], Loss: 1.3246
Epoch [2/10], Step [300/782], Loss: 1.2994
Epoch [2/10], Step [400/782], Loss: 1.2299
Epoch [2/10], Step [500/782], Loss: 1.1591
Epoch [2/10], Step [600/782], Loss: 0.9243
Epoch [2/10], Step [700/782], Loss: 1.1823
Validation Accuracy: 66.11%
Epoch [3/10], Step [100/782], Loss: 0.9658
Epoch [3/10], Step [200/782], Loss: 0.9708
Epoch [3/10], Step [300/782], Loss: 0.9236
Epoch [3/10], Step [400/782], Loss: 1.1136
Epoch [3/10], Step [500/782], Loss: 0.8570
Epoch [3/10], Step [600/782], Loss: 0.7015
Epoch [3/10], Step [700/782], Loss: 0.7606
Validation Accuracy: 70.02%
Epoch [4/10],