In [None]:
!pip install wandb

# import libraries
import warnings

import sklearn
import torch, torch.nn as nn
import torchvision
import torchvision as tv
import wandb
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from torchvision import transforms

In [None]:
# Log in to Weights & Biases from the shell so the wandb.init(...) calls in the
# experiment cells below can record their runs.
!wandb login

In [3]:
class FashionMNIST(nn.Module):
  """Fashion-MNIST train/validation datasets plus their mini-batch loaders.

  Both torchvision splits are created (with download=True) when the object is
  constructed.

  Args:
    batch_size: mini-batch size handed to every DataLoader.
    resize: size passed to transforms.Resize for both splits.
    root: dataset directory passed to torchvision.

  Note: the `train` attribute stores the training dataset, so on instances it
  shadows nn.Module.train().
  """

  def __init__(self, batch_size = 64, resize = (28, 28), root = './data'):
    super().__init__()
    self.batch_size, self.resize, self.root = batch_size, resize, root

    # training pipeline: resize -> random horizontal flip -> color jitter -> tensor
    jitter = tv.transforms.ColorJitter(brightness = 0.25, contrast = 0.25, saturation = 0.25, hue = 0.25)
    augmented = transforms.Compose([
        transforms.Resize(resize),
        transforms.RandomHorizontalFlip(),
        jitter,
        transforms.ToTensor(),
    ])

    # validation pipeline: no augmentation, only resize -> tensor
    plain = transforms.Compose([
        transforms.Resize(resize),
        transforms.ToTensor(),
    ])

    # both splits come straight from torchvision
    self.train = tv.datasets.FashionMNIST(root=self.root, train=True, transform=augmented, download=True)
    self.val = tv.datasets.FashionMNIST(root=self.root, train=False, transform=plain, download=True)

  def text_labels(self, indices):
    """Translate numeric class indices into their clothing-category names."""
    names = ['T-shirt', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
    return list(map(names.__getitem__, indices))

  def get_dataloader(self, train):
    """Return a DataLoader over the training split if `train`, else the validation split.

    `train` is also forwarded as the `shuffle` flag, so only the training
    loader is shuffled.
    """
    if train:
      split = self.train
    else:
      split = self.val

    # the loader serves the split in mini-batches of self.batch_size
    return torch.utils.data.DataLoader(split, self.batch_size, shuffle = train)

  def train_dataloader(self):
    """Shortcut for get_dataloader(train=True)."""
    return self.get_dataloader(train = True)

In [3]:
class CIFAR100(nn.Module):
    """CIFAR-100 train/validation datasets plus their mini-batch loaders.

    Both torchvision splits are created (with download=True) when the object
    is constructed.

    Args:
        batch_size: mini-batch size handed to every DataLoader.
        resize: size passed to transforms.Resize for both splits.
        root: dataset directory passed to torchvision.

    Attributes:
        train_transform / val_transform: the transform pipelines of each split.
        train / val: the torchvision datasets. `train` shadows
            nn.Module.train() on instances.
        classes: class names, taken from the training dataset.
    """

    def __init__(self, batch_size=64, resize=(32, 32), root="./data"):
        # nn.Module must be initialised before use; without this call
        # repr(), parameters(), state_dict() etc. raise AttributeError.
        super().__init__()
        self.batch_size = batch_size
        self.resize = resize
        self.root = root

        # CIFAR-100 normalization constants
        mean = (0.5071, 0.4867, 0.4408)
        std = (0.2675, 0.2565, 0.2761)

        color_aug = tv.transforms.ColorJitter(brightness = 0.25, contrast = 0.25, saturation = 0.25, hue = 0.25)

        # train transforms: resize -> random flip -> color jitter -> tensor -> normalize
        self.train_transform = transforms.Compose([
            transforms.Resize(resize),
            transforms.RandomHorizontalFlip(),
            color_aug,
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ])
        # val transforms: only resize + normalize
        self.val_transform = transforms.Compose([
            transforms.Resize(resize),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ])

        self.train = torchvision.datasets.CIFAR100(root=self.root, train=True , transform=self.train_transform, download=True)
        self.val   = torchvision.datasets.CIFAR100(root=self.root, train=False, transform=self.val_transform  , download=True)

        self.classes = self.train.classes

    def get_dataloader(self, train=True):
        """Return a DataLoader over the training split if `train`, else the validation split.

        `train` is also forwarded as the `shuffle` flag, so only the training
        loader is shuffled.
        """
        data = self.train if train else self.val

        return torch.utils.data.DataLoader(data, self.batch_size, shuffle = train)

    def train_dataloader(self):
        """Shortcut for get_dataloader(train=True)."""
        return self.get_dataloader(train = True)

In [4]:
# initialize weights properly
def init_cnn(module):
  """Xavier-uniform initialise the weight of a Linear or Conv2d layer.

  Only modules whose exact type is nn.Linear or nn.Conv2d are touched; every
  other module is left unchanged. Meant to be passed to nn.Module.apply.
  """
  if type(module) not in (nn.Linear, nn.Conv2d):
    return
  nn.init.xavier_uniform_(module.weight)

In [6]:
class LeNet(nn.Module):
  """LeNet-style CNN with a configurable activation and pooling layer.

  `num_classes` selects one of two layer stacks stored in `self.net`:
    * 10  -> two 5x5 convolutions (6 and 16 channels) followed by linear
             layers of width 120, 84 and num_classes;
    * 100 -> three convolutions (32, 64, 128 channels) followed by linear
             layers of width 256, 128 and num_classes.

  Args:
    lr: initial learning rate used by train_model.
    num_classes: 10 or 100.
    activation: 'LeakyReLU', 'Tanh' or 'GELU'.
    pooling: 'AvgPool2d' or 'MaxPool2d'.

  Raises:
    ValueError: if activation, pooling or num_classes is not supported.
  """

  def __init__(self, lr = 0.1, num_classes = 10, activation = 'LeakyReLU', pooling = 'AvgPool2d'):
    super().__init__()
    self.lr = lr
    self.num_classes = num_classes

    # name -> factory tables for the two architecture hyperparameters
    activation_factories = {
      'LeakyReLU': lambda: nn.LeakyReLU(0.1),
      'Tanh': nn.Tanh,
      'GELU': nn.GELU,
    }
    pooling_factories = {
      'AvgPool2d': nn.AvgPool2d,
      'MaxPool2d': nn.MaxPool2d,
    }

    # fail early and clearly instead of with an AttributeError further down
    if activation not in activation_factories:
      raise ValueError(f"unsupported activation {activation!r}; expected one of {sorted(activation_factories)}")
    if pooling not in pooling_factories:
      raise ValueError(f"unsupported pooling {pooling!r}; expected one of {sorted(pooling_factories)}")
    if num_classes not in (10, 100):
      raise ValueError(f"unsupported num_classes {num_classes!r}; expected 10 or 100")

    # a single activation / pooling instance is reused at every position of the stack
    self.activation = activation_factories[activation]()
    self.pooling = pooling_factories[pooling](kernel_size = 2, stride = 2)

    if num_classes == 10:
      self.net = nn.Sequential(
        nn.LazyConv2d(6, kernel_size = 5, padding = 2),
        self.activation,
        self.pooling,
        nn.LazyConv2d(16, kernel_size = 5),
        self.activation,
        self.pooling,
        nn.Flatten(),
        nn.LazyLinear(120),
        self.activation,
        nn.LazyLinear(84),
        self.activation,
        nn.LazyLinear(num_classes)
      )
    else:
      self.net = nn.Sequential(
        nn.LazyConv2d(32, kernel_size=5, padding=2),
        self.activation,
        self.pooling,
        nn.Conv2d(32, 64, kernel_size=5, padding=2),
        self.activation,
        self.pooling,
        nn.Conv2d(64, 128, kernel_size=3, padding=1),
        self.activation,
        self.pooling,
        nn.Flatten(),
        nn.LazyLinear(256),
        self.activation,
        nn.LazyLinear(128),
        self.activation,
        nn.LazyLinear(num_classes)
      )

  def forward(self, X):
    """Run the layer stack on a batch and return the class logits."""
    return self.net(X)

  # apply initialization to weights
  def apply_init(self, inputs, init_fn):
    """Run one dummy batch of shape `inputs` through the net, then apply `init_fn`.

    The dummy forward pass lets the lazy layers infer their input sizes before
    `init_fn` is applied to every submodule of `self.net`.
    """
    dummy = torch.randn(*inputs, dtype=next(self.parameters()).dtype)
    with torch.no_grad():  # shape inference only, no autograd graph needed
      self.net(dummy)
    self.net.apply(init_fn)

  # print shape of each layer
  def layer_summary(self, X_shape):
    """Print the output shape of every layer for a random input of shape `X_shape`."""
    X = torch.randn(*X_shape)
    with torch.no_grad():
      for layer in self.net:
        X = layer(X)
        print(f"{layer.__class__.__name__} output shape: \t{X.shape}")

  def train_model(self, data, max_epochs):
    """Train with SGD for `max_epochs` epochs and validate after each epoch.

    Args:
      data: object providing train_dataloader() and get_dataloader(train=False).
      max_epochs: number of passes over the training loader.

    Uses label-smoothed cross-entropy, momentum 0.9, weight decay 5e-4 and a
    10x learning-rate drop after epochs 10 and 20. Per-epoch metrics are sent
    to wandb only when a run is active.
    """
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.SGD(self.parameters(), lr=self.lr, momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20], gamma=0.1)

    for epoch in range(max_epochs):
      self.train()  # evaluate_model leaves the model in eval mode
      train_loss, train_acc, num_examples = 0.0, 0.0, 0

      # back prop and optimizer step
      for X, y in data.train_dataloader():
        y_hat = self(X)
        loss = loss_fn(y_hat, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # weight by batch size so the epoch average is per example
        train_loss += loss.item() * y.shape[0]
        train_acc += (y_hat.argmax(dim = 1) == y).sum().item()
        num_examples += y.shape[0]

      train_loss /= num_examples
      train_acc /= num_examples

      val_loss, val_acc, precision, recall, f1 = self.evaluate_model(data.get_dataloader(train=False))
      scheduler.step()

      # log metrics only at the end of each epoch
      if wandb.run is not None:
        wandb.log({
            "epoch":      epoch + 1,
            "train_loss": train_loss,
            "train_acc":  train_acc,
            "val_loss":   val_loss,
            "val_acc":    val_acc,
            "precision":  precision,
            "recall":     recall,
            "f1":         f1,
        }, step=epoch+1)

  def evaluate_model(self, dataloader, epoch=None):
    """Evaluate on `dataloader` and return (loss, accuracy, precision, recall, f1).

    Loss is plain cross-entropy averaged per example; precision, recall and f1
    are class-weighted averages. `epoch` is accepted but not used. The model
    is left in eval mode.
    """
    self.eval()
    total_correct, total_samples = 0, 0
    total_loss = 0.0
    loss_fn = nn.CrossEntropyLoss()
    all_preds = []
    all_labels = []

    # no_grad: evaluation needs no autograd graph
    with torch.no_grad():
      for X, y in dataloader:
        outputs = self(X)
        loss = loss_fn(outputs, y)
        preds = outputs.argmax(dim=1)

        total_correct += (preds == y).sum().item()
        total_loss += loss.item() * y.size(0)
        total_samples += y.size(0)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(y.cpu().tolist())

    val_acc = total_correct / total_samples
    val_loss = total_loss / total_samples

    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return val_loss, val_acc, precision, recall, f1

  def total_parameters(self):
    """Return the number of trainable parameter elements."""
    return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [5]:
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm stages and a skip path.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by both convolutions.
        stride: stride of the first convolution (and of the projection, if any).

    The skip path is an empty nn.Sequential unless the stride or the channel
    count changes, in which case it is a strided 1x1 conv followed by batch-norm.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.relu = nn.ReLU(inplace = True)

        # main branch
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # skip branch: project only when the output shape differs from the input shape
        keeps_shape = stride == 1 and in_channels == out_channels
        if keeps_shape:
            projection = []
        else:
            projection = [
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        """Return relu(main_branch(x) + shortcut(x))."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        main += self.shortcut(x)
        return self.relu(main)

class ResNet18(nn.Module):
    """ResNet-18 built from BasicBlock stages, with in-class training and evaluation.

    Args:
        lr: initial learning rate used by train_model.
        num_classes: width of the final linear layer. The value 100 also
            selects the 3-channel stem; any other value uses the 1-channel stem.
        activation: accepted for signature parity with LeNet but NOT used;
            the network always uses ReLU.
        pooling: accepted for signature parity with LeNet but NOT used;
            the network always ends in adaptive average pooling.

    A warning is emitted when `activation` or `pooling` differs from its
    default, because the requested value has no effect on the model.
    """

    def __init__(self, lr=0.1, num_classes=10, activation='ReLU', pooling='AvgPool2d'):
        super().__init__()
        self.lr = lr
        self.num_classes = num_classes

        # Make the ignored hyperparameters visible instead of silently training
        # the same architecture under different run names.
        if activation != 'ReLU' or pooling != 'AvgPool2d':
            warnings.warn(
                f"ResNet18 ignores activation={activation!r} and pooling={pooling!r}; "
                "it always uses ReLU and adaptive average pooling.",
                stacklevel=2,
            )
        self.activation = nn.ReLU(inplace = True)

        # modified for CIFAR-100: use 3 input channels instead of 1
        # and smaller initial kernel and stride for CIFAR's 32x32 images
        if num_classes == 100:
            self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        # default for MNIST/FashionMNIST
        else:
            # NOTE(review): padding=3 with a 3x3 kernel is unusual (28x28 -> 16x16);
            # kept as-is so existing results stay comparable.
            self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=3, bias=False)

        self.bn1 = nn.BatchNorm2d(64)

        self.layer1 = self._make_layer(64, 64, 2, stride=1)
        self.layer2 = self._make_layer(64, 128, 2, stride=2)
        self.layer3 = self._make_layer(128, 256, 2, stride=2)
        self.layer4 = self._make_layer(256, 512, 2, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # the classifier head works for any number of classes
        self.fc = nn.Linear(512, num_classes)

        # The same modules are also exposed as one sequential pipeline, which
        # the methods below use for the forward pass.
        self.net = nn.Sequential(
            self.conv1,
            self.bn1,
            self.activation,
            self.layer1,
            self.layer2,
            self.layer3,
            self.layer4,
            self.avgpool,
            nn.Flatten(),
            self.fc
        )

    def _make_layer(self, in_channels, out_channels, num_blocks, stride):
        """Stack `num_blocks` BasicBlocks; only the first one changes stride/channels."""
        layers = []
        layers.append(BasicBlock(in_channels, out_channels, stride))

        for _ in range(1, num_blocks):
            layers.append(BasicBlock(out_channels, out_channels, 1))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Run the full pipeline on a batch and return the class logits."""
        return self.net(x)

    def apply_init(self, inputs, init_fn):
        """Run one dummy batch of shape `inputs` through the net, then apply `init_fn`.

        The dummy pass runs in eval mode without gradients so the random batch
        does not update the BatchNorm running statistics; the previous
        train/eval mode is restored afterwards.
        """
        dummy = torch.randn(*inputs, dtype=next(self.parameters()).dtype)
        was_training = self.training
        self.eval()
        with torch.no_grad():
            self.net(dummy)
        self.train(was_training)
        self.net.apply(init_fn)

    def layer_summary(self, X_shape):
        """Print the output shape of every top-level stage for a random input."""
        X = torch.randn(*X_shape)
        with torch.no_grad():
            for layer in self.net:
                X = layer(X)
                print(f"{layer.__class__.__name__} output shape: \t{X.shape}")

    def train_model(self, data, max_epochs):
        """Train with SGD for `max_epochs` epochs and validate after each epoch.

        Args:
            data: object providing train_dataloader() and get_dataloader(train=False).
            max_epochs: number of passes over the training loader.

        Uses label-smoothed cross-entropy, momentum 0.9, weight decay 5e-4 and
        a 10x learning-rate drop after epochs 100 and 150. Per-epoch metrics
        are sent to wandb only when a run is active.
        """
        loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
        optimizer = torch.optim.SGD(self.parameters(), lr=self.lr,momentum=0.9, weight_decay=5e-4)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)

        for epoch in range(max_epochs):
            self.train()  # evaluate_model leaves the model in eval mode
            train_loss, train_acc, num_examples = 0.0, 0.0, 0

            for X, y in data.train_dataloader():
                y_hat = self(X)
                loss = loss_fn(y_hat, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # weight by batch size so the epoch average is per example
                train_loss += loss.item() * y.shape[0]
                train_acc += (y_hat.argmax(dim=1) == y).sum().item()
                num_examples += y.shape[0]

            scheduler.step()

            train_loss /= num_examples
            train_acc /= num_examples

            val_loss, val_acc, precision, recall, f1 = self.evaluate_model(data.get_dataloader(train=False))

            if wandb.run is not None:
                wandb.log({
                    "epoch":      epoch + 1,
                    "train_loss": train_loss,
                    "train_acc":  train_acc,
                    "val_loss":   val_loss,
                    "val_acc":    val_acc,
                    "precision":  precision,
                    "recall":     recall,
                    "f1":         f1,
                }, step=epoch+1)

    def evaluate_model(self, dataloader, epoch=None):
        """Evaluate on `dataloader` and return (loss, accuracy, precision, recall, f1).

        Loss is plain cross-entropy averaged per example; precision, recall
        and f1 are class-weighted averages. `epoch` is accepted but not used.
        The model is left in eval mode.
        """
        self.eval()
        total_correct, total_samples = 0, 0
        total_loss = 0.0
        loss_fn = nn.CrossEntropyLoss()
        all_preds = []
        all_labels = []

        # no_grad: evaluation needs no autograd graph
        with torch.no_grad():
            for X, y in dataloader:
                outputs = self(X)
                loss = loss_fn(outputs, y)
                preds = outputs.argmax(dim=1)

                total_correct += (preds == y).sum().item()
                total_loss += loss.item() * y.size(0)
                total_samples += y.size(0)

                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(y.cpu().tolist())

        val_acc = total_correct / total_samples
        val_loss = total_loss / total_samples

        # zero_division=0 matches LeNet.evaluate_model: classes that are never
        # predicted score 0 without a warning per metric per epoch
        precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
        recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
        f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

        return val_loss, val_acc, precision, recall, f1

    def total_parameters(self):
        """Return the number of trainable parameter elements."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [14]:
# Build the default LeNet, print each layer's output shape for one
# single-channel 28x28 input, then report the trainable-parameter count.
model = LeNet()
# input shape is (batch_size, num_channels, height, width)
model.layer_summary((1, 1, 28, 28))
print(f'Total number of parameters: {model.total_parameters()}')

Conv2d output shape: 	torch.Size([1, 6, 28, 28])
LeakyReLU output shape: 	torch.Size([1, 6, 28, 28])
AvgPool2d output shape: 	torch.Size([1, 6, 14, 14])
Conv2d output shape: 	torch.Size([1, 16, 10, 10])
LeakyReLU output shape: 	torch.Size([1, 16, 10, 10])
AvgPool2d output shape: 	torch.Size([1, 16, 5, 5])
Flatten output shape: 	torch.Size([1, 400])
Linear output shape: 	torch.Size([1, 120])
LeakyReLU output shape: 	torch.Size([1, 120])
Linear output shape: 	torch.Size([1, 84])
LeakyReLU output shape: 	torch.Size([1, 84])
Linear output shape: 	torch.Size([1, 10])
Total number of parameters: 61706


In [6]:
# Build the default ResNet18 (1-channel stem, 10 outputs), print the output
# shape of each top-level stage for one 28x28 input, then report the
# trainable-parameter count.
model = ResNet18()
# input shape is (batch_size, num_channels, height, width)
model.layer_summary((1, 1, 28, 28))
print(f'Total number of parameters: {model.total_parameters()}')

Conv2d output shape: 	torch.Size([1, 64, 16, 16])
BatchNorm2d output shape: 	torch.Size([1, 64, 16, 16])
LeakyReLU output shape: 	torch.Size([1, 64, 16, 16])
Sequential output shape: 	torch.Size([1, 64, 16, 16])
Sequential output shape: 	torch.Size([1, 128, 8, 8])
Sequential output shape: 	torch.Size([1, 256, 4, 4])
Sequential output shape: 	torch.Size([1, 512, 2, 2])
AdaptiveAvgPool2d output shape: 	torch.Size([1, 512, 1, 1])
Flatten output shape: 	torch.Size([1, 512])
Linear output shape: 	torch.Size([1, 10])
Total number of parameters: 11172810


## LeNet on FashionMNIST

In [None]:
# TODO: generalize this helper so the model and the dataset are parameters instead of one copy per experiment
def run_experiment_leNet(activation, pooling, max_epochs = 25, lr = 0.1, batch_size = 128):
    """Train LeNet on Fashion-MNIST for one (activation, pooling) pair, logged to W&B.

    Args:
        activation: activation name forwarded to LeNet.
        pooling: pooling name forwarded to LeNet.
        max_epochs: number of training epochs.
        lr: initial learning rate forwarded to LeNet.
        batch_size: mini-batch size of the Fashion-MNIST loaders.

    The W&B run is opened as a context manager so it is always finished, even
    when data loading or training raises; otherwise a failed run would stay
    open and leak into the next wandb.init call of a sweep.
    """
    run_config = {
        "activation": activation,
        "pooling": pooling,
        "epochs": max_epochs,
        "lr": lr,
        "batch_size": batch_size,
    }

    with wandb.init(
        project="CNNs-FashionMNIST-LeNet",
        name=f"{activation}, {pooling}",
        config=run_config,
    ) as run:
        config = run.config
        data = FashionMNIST(batch_size=config.batch_size)
        model = LeNet(
            activation=config.activation,
            pooling=config.pooling,
            lr=config.lr
        )
        # the shape of one real batch drives the dummy forward pass in apply_init
        sample_shape = next(iter(data.get_dataloader(True)))[0].shape
        model.apply_init(sample_shape, init_cnn)
        model.train_model(data, max_epochs=config.epochs)


# Sweep every activation/pooling pairing: one W&B run per pair,
# pooling varying fastest.
activations = ['LeakyReLU', 'Tanh', 'GELU']
pooling = ['AvgPool2d', 'MaxPool2d']

for act, pool in [(a, p) for a in activations for p in pooling]:
    run_experiment_leNet(activation = act, pooling = pool)

## ResNet-18 on FashionMNIST

In [None]:
def run_experiment_ResNet(activation, pooling, max_epochs = 25, lr = 0.1, batch_size = 128):
    """Train ResNet18 on Fashion-MNIST for one (activation, pooling) pair, logged to W&B.

    Args:
        activation: activation name forwarded to ResNet18.
        pooling: pooling name forwarded to ResNet18.
        max_epochs: number of training epochs.
        lr: initial learning rate forwarded to ResNet18.
        batch_size: mini-batch size of the Fashion-MNIST loaders.

    The W&B run is opened as a context manager so it is always finished, even
    when data loading or training raises; otherwise a failed run would stay
    open and leak into the next wandb.init call of a sweep.
    """
    run_config = {
        "activation": activation,
        "pooling": pooling,
        "epochs": max_epochs,
        "lr": lr,
        "batch_size": batch_size,
    }

    with wandb.init(
        project="CNNs-FashionMNIST-ResNet18",
        name=f"{activation}, {pooling}",
        config=run_config,
    ) as run:
        config = run.config
        data = FashionMNIST(batch_size=config.batch_size)
        model = ResNet18(
            activation=config.activation,
            pooling=config.pooling,
            lr=config.lr
        )
        # the shape of one real batch drives the dummy forward pass in apply_init
        sample_shape = next(iter(data.get_dataloader(True)))[0].shape
        model.apply_init(sample_shape, init_cnn)
        model.train_model(data, max_epochs=config.epochs)


# Sweep every activation/pooling pairing: one W&B run per pair,
# pooling varying fastest.
# NOTE(review): ResNet18.__init__ in this notebook never reads its `activation`
# or `pooling` arguments, so these six runs train the same architecture and
# differ only in their run name and random initialisation.
activations = ['LeakyReLU', 'Tanh', 'GELU']
pooling = ['AvgPool2d', 'MaxPool2d']

for act, pool in [(a, p) for a in activations for p in pooling]:
    run_experiment_ResNet(activation = act, pooling = pool)

## LeNet on CIFAR100

In [10]:
# Build the 100-class LeNet variant, print each layer's output shape for one
# input, then report the trainable-parameter count.
model = LeNet(num_classes = 100)
# input shape is (batch_size, num_channels, height, width)
# CIFAR100 images are 32x32 with 3 color channels
model.layer_summary((1, 3, 32, 32))
print(f'Total number of parameters: {model.total_parameters()}')

Conv2d output shape: 	torch.Size([1, 32, 32, 32])
LeakyReLU output shape: 	torch.Size([1, 32, 32, 32])
AvgPool2d output shape: 	torch.Size([1, 32, 16, 16])
Conv2d output shape: 	torch.Size([1, 64, 16, 16])
LeakyReLU output shape: 	torch.Size([1, 64, 16, 16])
AvgPool2d output shape: 	torch.Size([1, 64, 8, 8])
Conv2d output shape: 	torch.Size([1, 128, 8, 8])
LeakyReLU output shape: 	torch.Size([1, 128, 8, 8])
AvgPool2d output shape: 	torch.Size([1, 128, 4, 4])
Flatten output shape: 	torch.Size([1, 2048])
Linear output shape: 	torch.Size([1, 256])
LeakyReLU output shape: 	torch.Size([1, 256])
Linear output shape: 	torch.Size([1, 128])
LeakyReLU output shape: 	torch.Size([1, 128])
Linear output shape: 	torch.Size([1, 100])
Total number of parameters: 697892


In [None]:
# TODO: generalize this helper so the model and the dataset are parameters instead of one copy per experiment
def run_experiment_leNet(activation, pooling, max_epochs = 30, lr = 0.1, batch_size = 128):
    """Train the 100-class LeNet on CIFAR-100 for one (activation, pooling) pair, logged to W&B.

    Note: this redefines the Fashion-MNIST helper of the same name from
    earlier in the notebook; after this cell runs, the name refers to the
    CIFAR-100 version.

    Args:
        activation: activation name forwarded to LeNet.
        pooling: pooling name forwarded to LeNet.
        max_epochs: number of training epochs.
        lr: initial learning rate forwarded to LeNet.
        batch_size: mini-batch size of the CIFAR-100 loaders.

    The W&B run is opened as a context manager so it is always finished, even
    when data loading or training raises; otherwise a failed run would stay
    open and leak into the next wandb.init call of a sweep.
    """
    run_config = {
        "activation": activation,
        "pooling": pooling,
        "epochs": max_epochs,
        "lr": lr,
        "batch_size": batch_size,
    }

    with wandb.init(
        project="CNNs-CIFAR100-LeNet-v1",
        name=f"{activation}, {pooling}",
        config=run_config,
    ) as run:
        config = run.config
        data = CIFAR100(batch_size=config.batch_size)
        model = LeNet(
            activation=config.activation,
            pooling=config.pooling,
            num_classes = 100,
            lr=config.lr
        )
        # the shape of one real batch drives the dummy forward pass in apply_init
        sample_shape = next(iter(data.get_dataloader(True)))[0].shape
        model.apply_init(sample_shape, init_cnn)
        model.train_model(data, max_epochs=config.epochs)


# Sweep the selected activation/pooling pairings: one W&B run per pair.
activations = ['LeakyReLU', 'GELU']
pooling = ['MaxPool2d']

for act, pool in [(a, p) for a in activations for p in pooling]:
    run_experiment_leNet(activation = act, pooling = pool)

## ResNet-18 on CIFAR100

In [7]:
# Build the 100-class ResNet18 (3-channel stem), print the output shape of each
# top-level stage for one 32x32 input, then report the trainable-parameter count.
model = ResNet18(num_classes = 100)
# input shape is (batch_size, num_channels, height, width)
model.layer_summary((1, 3, 32, 32))
print(f'Total number of parameters: {model.total_parameters()}')

Conv2d output shape: 	torch.Size([1, 64, 32, 32])
BatchNorm2d output shape: 	torch.Size([1, 64, 32, 32])
LeakyReLU output shape: 	torch.Size([1, 64, 32, 32])
Sequential output shape: 	torch.Size([1, 64, 32, 32])
Sequential output shape: 	torch.Size([1, 128, 16, 16])
Sequential output shape: 	torch.Size([1, 256, 8, 8])
Sequential output shape: 	torch.Size([1, 512, 4, 4])
AdaptiveAvgPool2d output shape: 	torch.Size([1, 512, 1, 1])
Flatten output shape: 	torch.Size([1, 512])
Linear output shape: 	torch.Size([1, 100])
Total number of parameters: 11220132


In [None]:
def run_experiment_ResNet(max_epochs = 200, lr = 0.1, batch_size = 128, save_path = 'resnet18_cifar100.pth'):
    """Train ResNet18 on CIFAR-100, log the run to W&B and save a checkpoint.

    Note: this redefines the Fashion-MNIST helper of the same name from
    earlier in the notebook; after this cell runs, the name refers to the
    CIFAR-100 version.

    Args:
        max_epochs: number of training epochs.
        lr: initial learning rate forwarded to ResNet18.
        batch_size: mini-batch size of the CIFAR-100 loaders.
        save_path: file the checkpoint is written to.

    The checkpoint holds the model weights, the epoch count and the model
    configuration. It deliberately contains no optimizer state: the optimizer
    that trains the model lives inside ResNet18.train_model and is not
    exposed, and saving the state of a separate, never-stepped optimizer (as
    this function used to do) would misrepresent the training state.
    """
    # The context manager finishes the W&B run even if training or saving raises.
    with wandb.init(
        project="CNNs-CIFAR100-ResNet18",
        name="ResNet18-updated",
        config={
            "epochs": max_epochs,
            "lr": lr,
            "batch_size": batch_size,
        },
    ) as run:
        config = run.config
        data = CIFAR100(batch_size=config.batch_size)
        model = ResNet18(
            num_classes = 100,
            lr=config.lr
        )
        # the shape of one real batch drives the dummy forward pass in apply_init
        sample_shape = next(iter(data.get_dataloader(True)))[0].shape
        model.apply_init(sample_shape, init_cnn)
        model.train_model(data, max_epochs=config.epochs)

        # Save the model with enough information to rebuild it
        torch.save({
            'model_state_dict': model.state_dict(),
            'epochs': config.epochs,
            'model_config': {
                'num_classes': 100,
                'lr': config.lr
            }
        }, save_path)

        # Verify the save was successful (best effort: a failure is reported, not raised)
        try:
            checkpoint = torch.load(save_path, map_location='cpu')
            print(f'Model successfully saved to {save_path}')
            print(f'Saved model trained for {checkpoint["epochs"]} epochs')
        except Exception as e:
            print(f'Error verifying saved model: {e}')

# Launch the CIFAR-100 ResNet18 experiment with its default settings.
run_experiment_ResNet()