# CIFAR-10 Demo

ACM Cyber Special Topics Track Week 3 (ft. ACM AI)

In [1]:
# numpy: the library that deals with linear algebra calculations
import numpy as np
# (py)torch: our machine learning library!
import torch
import torch.nn.functional as F
from torch import nn, optim, utils
from torchvision import datasets, transforms
# matplotlib: making plots & displaying images
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")  # gpu training

Device: cuda


## Data

We are using CIFAR-10.

In [3]:
# pre-process images
transform = transforms.Compose([transforms.ToTensor()])  # convert image to pytorch tensor

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)

test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_loader = utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
data_augmentation = nn.Sequential(transforms.RandomHorizontalFlip(),
                                  transforms.RandomCrop((32, 32), padding=(4, 4)))
data_augmentation = torch.jit.script(data_augmentation)

## Network

In [5]:
class CNNLarge(nn.Module):

    def __init__(self):
        super().__init__()

        self.network = nn.Sequential(
            # layer 1: 3 x 32 x 32 -> 32 x 32 x 32 (format: channel x width x height)
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            # layer 2: 32 x 32 x 32 -> 32 x 16 x 16
            nn.Conv2d(32, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            # layer 3: 32 x 16 x 16 -> 64 x 16 x 16
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            # layer 4: 64 x 16 x 16 -> 64 x 8 x 8
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            # layer 5: 64 x 8 x 8 -> 64 x 8 x 8
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            # layer 6: 64 x 8 x 8 -> 128 x 4 x 4
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(128),
            # flatten and output fully-connected layer
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 10))
    
    def forward(self, x):
        x = self.network(x)
        return x

    def size(self):
        parameters = self.parameters()
        size = 0
        for parameter in parameters:
            size += np.prod(parameter.shape)
        return size

In [6]:
class CNNSmall(nn.Module):

    def __init__(self):
        super().__init__()

        self.network = nn.Sequential(
            # layer 1: 3 x 32 x 32 -> 32 x 16 x 16 (format: channel x width x height)
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            # layer 2: 32 x 16 x 16 -> 64 x 8 x 8
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            # layer 3: 64 x 8 x 8 -> 64 x 4 x 4
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            # flatten and output fully-connected layer
            nn.Flatten(),
            nn.Linear(64 * 4 * 4, 10))
    
    def forward(self, x):
        x = self.network(x)
        return x

    def size(self):
        parameters = self.parameters()
        size = 0
        for parameter in parameters:
            size += np.prod(parameter.shape)
        return size

In [7]:
network = CNNLarge()  # let's use the large one :D
network.to(device)  # put network (i.e., network parameters) on gpu
print(f"Network size: {network.size()}")

Network size: 197610


## Criterion and Optimizer

In [8]:
i_max = 64000

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(network.parameters(), lr=0.1, momentum=0.9, nesterov=True)
# learning rate scheduler follows the original ResNet paper: https://arxiv.org/abs/1512.03385
# I have found that this scheduler works quite well for batch normalization deep models
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[i_max // 2, i_max // 4 * 3],
                                           gamma=0.1)

## Training

I know it is bad practice to use the test set for both validation and testing, but splitting the train set to create a validation set just needlessly complicates what we are trying to do here (that is, to train some base model for the FGSM demo).

In [9]:
@torch.no_grad()
def evaluate(loader, network, criterion):  # not the best implementation :P
    losses = []
    accuracies = []
    for inputs, labels in loader:
        inputs = inputs.to(device)  # put inputs and labels on gpu
        labels = labels.to(device)
        outputs = network(inputs)  # pass inputs through network to get outputs
        loss = criterion(outputs, labels)  # evaluate outputs with criterion to get loss
        accuracy = (torch.max(outputs, dim=1)[1] == labels).to(torch.float32).mean()  # accuracy
        losses.append(loss.cpu().numpy())
        accuracies.append(accuracy.cpu().numpy())
    return np.mean(losses), np.mean(accuracies)

In [10]:
progress = tqdm(total=i_max)

i = 0
while i < i_max:  # jank way to do exactly i_max iterations
    for inputs, labels in train_loader:
        network.train()
        inputs = inputs.to(device)  # put inputs and labels on gpu
        labels = labels.to(device)
        inputs = data_augmentation(inputs)
        optimizer.zero_grad()  # zero-out gradients
        outputs = network(inputs)  # pass inputs through network to get outputs
        loss = criterion(outputs, labels)  # evaluate outputs bwith criterion to get loss
        loss.backward()  # backpropagate through loss to compute gradients
        optimizer.step()  # use gradients to perform SGD
        scheduler.step()  # update learning rate
        if (i + 1) % (i_max // 50) == 0:  # evaluate on train and test datasets, print results
            network.eval()
            train_loss, train_accuracy = evaluate(train_loader, network, criterion)
            test_loss, test_accuracy = evaluate(test_loader, network, criterion)
            result = f"{('[' + str(i + 1) + ']'):8s}   "\
                     f"Train: {str(train_accuracy * 100):.6}% ({str(train_loss):.6})   "\
                     f"Test: {str(test_accuracy * 100):.6}% ({str(test_loss):.6})"
            progress.write(result)
        i += 1  # update iterations
        progress.update(1)
        if i >= i_max:  # stop at maximum iterations
            break

  0%|          | 0/64000 [00:00<?, ?it/s]

 does not have profile information (Triggered internally at /opt/conda/conda-bld/pytorch_1670525539683/work/torch/csrc/jit/codegen/cuda/graph_fuser.cpp:105.)
  return forward_call(*input, **kwargs)


[1280]     Train: 56.162% (1.2386)   Test: 55.814% (1.2574)
[2560]     Train: 65.125% (0.9957)   Test: 64.369% (1.0239)
[3840]     Train: 71.697% (0.8154)   Test: 69.580% (0.8718)
[5120]     Train: 74.761% (0.7215)   Test: 73.071% (0.7886)
[6400]     Train: 78.836% (0.6014)   Test: 75.919% (0.6930)
[7680]     Train: 80.143% (0.5710)   Test: 77.689% (0.6544)
[8960]     Train: 81.309% (0.5390)   Test: 78.411% (0.6385)
[10240]    Train: 80.572% (0.5742)   Test: 77.403% (0.6903)
[11520]    Train: 82.977% (0.4912)   Test: 79.657% (0.6114)
[12800]    Train: 83.845% (0.4553)   Test: 80.528% (0.5772)
[14080]    Train: 85.220% (0.4290)   Test: 81.685% (0.5609)
[15360]    Train: 85.676% (0.4084)   Test: 81.635% (0.5478)
[16640]    Train: 85.875% (0.4072)   Test: 81.971% (0.5494)
[17920]    Train: 86.492% (0.3927)   Test: 82.288% (0.5439)
[19200]    Train: 86.423% (0.3855)   Test: 82.179% (0.5575)
[20480]    Train: 87.037% (0.3760)   Test: 82.446% (0.5637)
[21760]    Train: 87.197% (0.3661)   Tes

## Saving Weights

In [11]:
torch.save(network.state_dict(), "cifar10_model.pth")

## Loading Weights & Evaluate

In [12]:
network_load = CNNLarge()
network_load.load_state_dict(torch.load("cifar10_model.pth"))
network_load.to(device)
network.eval()

CNNLarge(
  (network): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (4): ReLU()
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (10): ReLU()
    (11): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU()
    (14): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (15): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), 

In [13]:
test_loss, test_accuracy = evaluate(test_loader, network_load, criterion)
print(f"Test accuracy: {str(test_accuracy * 100):.6}%     Test loss: {str(test_loss):.6}")

Test accuracy: 86.570%     Test loss: 0.4508
