Imports

In [1]:
import torch
import os
import requests
from zipfile import ZipFile
from io import BytesIO
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Subset
import numpy as np
import time
import torch.nn.functional as F
from torch import Tensor
from typing import Type
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

Import data

In [2]:
# Define the path to the dataset
dataset_path = 'http://cs231n.stanford.edu/tiny-imagenet-200.zip'

# Send a GET request to the URL
response = requests.get(dataset_path)
# Check if the request was successful
if response.status_code == 200:
    # Open the downloaded bytes and extract them
    with ZipFile(BytesIO(response.content)) as zip_file:
        zip_file.extractall('/dataset')
    print('Download and extraction complete!')

Download and extraction complete!


## Define and train AlexNet

In [3]:
# Define the AlexNet architecture
class AlexNet(nn.Module):
    def __init__(self, num_classes=200):  # Tiny ImageNet has 200 classes
        super(AlexNet, self).__init__()
        # Define the layers of AlexNet
        self.conv1 = nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2)
        self.pool1 = nn.MaxPool2d(3, stride=2)

        self.conv2 = nn.Conv2d(64, 192, kernel_size=5, stride=1, padding=2)
        self.pool2 = nn.MaxPool2d(3, stride=2)

        self.conv3 = nn.Conv2d(192, 384, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1)

        self.conv5 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.pool5 = nn.MaxPool2d(3, stride=2)

        self.fc1 = nn.Linear(9216, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, num_classes)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool5(F.relu(self.conv5(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return x


In [11]:
# Hyperparameters
learning_rates = [0.1, 0.001, 0.0001]
batch_sizes = [16, 32, 64]

#Keep number of epochs low as i don't have enough computing power
EPOCHS = 2

# Define transforms for the input data
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to a slightly larger square
    transforms.RandomHorizontalFlip(),  # Randomly flip images horizontally
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),  # Randomly jitter color
    transforms.RandomRotation(10),  # Randomly rotate images within a 10 degree range
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize
])

# Load the Tiny ImageNet dataset
# Note: You'll need to download the dataset and set the correct path.
train_dataset = datasets.ImageFolder(root='/dataset/tiny-imagenet-200/train', transform=transform)
val_dataset = datasets.ImageFolder(root='/dataset/tiny-imagenet-200/test', transform=transform)

#define loss function
criterion = nn.CrossEntropyLoss()


In [5]:
ratio = 10

train_size = len(train_dataset) // ratio
val_size = len(val_dataset) // ratio
# Create a smaller subset by randomly sampling indices
subset_indices_train = torch.randperm(len(train_dataset))[:train_size]
subset_indices_val = torch.randperm(len(val_dataset))[:val_size]

train_dataset = Subset(train_dataset, subset_indices_train)
val_dataset = Subset(val_dataset, subset_indices_val)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Test the model
tensor = torch.rand([1, 3, 224, 224]).to(device)

model = AlexNet(num_classes=200).to(device)
print(model)

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")
output = model(tensor)

AlexNet(
  (conv1): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
  (pool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (pool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=9216, out_features=4096, bias=True)
  (fc2): Linear(in_features=4096, out_features=4096, bias=True)
  (fc3): Linear(in_features=4096, out_features=200, bias=True)
)
57,823,240 total parameters.
57,823,240 training parameters.


In [12]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(train_loader):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs.to(device))

        # Compute the loss and its gradients
        loss = criterion(outputs, labels.to(device))
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [13]:
for lr in learning_rates:
    #Set optimizer with given learning rate
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    for batch_size in batch_sizes:
      print(f'BATCH SIZE: {batch_size} LEARNING RATE: {lr}')
      #Set loaders with given batch size
      train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
      test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


      timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
      writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
      epoch_number = 0

      best_vloss = 1_000_000.

      for epoch in range(EPOCHS):
          print('EPOCH {}:'.format(epoch_number + 1))

          # Make sure gradient tracking is on, and do a pass over the data
          model.train(True)
          avg_loss = train_one_epoch(epoch_number, writer)


          running_vloss = 0.0
          # Set the model to evaluation mode, disabling dropout and using population
          # statistics for batch normalization.
          model.eval()

          # Disable gradient computation and reduce memory consumption.
          with torch.no_grad():
              for i, vdata in enumerate(test_loader):
                  vinputs, vlabels = vdata
                  voutputs = model(vinputs.to(device))
                  vloss = criterion(voutputs, vlabels.to(device))
                  running_vloss += vloss

          avg_vloss = running_vloss / (i + 1)
          print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

          # Log the running loss averaged per batch
          # for both training and validation
          writer.add_scalars('Training vs. Validation Loss',
                          { 'Training' : avg_loss, 'Validation' : avg_vloss },
                          epoch_number + 1)
          writer.flush()

          # Track best performance, and save the model's state
          if avg_vloss < best_vloss:
              best_vloss = avg_vloss
              model_path = 'model_{}_{}'.format(timestamp, epoch_number)
              torch.save(model.state_dict(), model_path)

          epoch_number += 1

BATCH SIZE: 16 LEARNING RATE: 0.1
EPOCH 1:
  batch 1000 loss: 5.298594066619873
  batch 2000 loss: 5.2983174324035645
  batch 3000 loss: 5.2983174324035645
  batch 4000 loss: 5.2983174324035645
  batch 5000 loss: 5.2983174324035645
  batch 6000 loss: 5.2983174324035645
LOSS train 5.2983174324035645 valid 5.298336029052734
EPOCH 2:
  batch 1000 loss: 5.2983174324035645
  batch 2000 loss: 5.2983174324035645
  batch 3000 loss: 5.2983174324035645
  batch 4000 loss: 5.2983174324035645
  batch 5000 loss: 5.2983174324035645
  batch 6000 loss: 5.2983174324035645
LOSS train 5.2983174324035645 valid 5.298336029052734
BATCH SIZE: 32 LEARNING RATE: 0.1
EPOCH 1:
  batch 1000 loss: 5.298318862915039
  batch 2000 loss: 5.298318862915039
  batch 3000 loss: 5.298318862915039
LOSS train 5.298318862915039 valid 5.2983317375183105
EPOCH 2:
  batch 1000 loss: 5.298318862915039
  batch 2000 loss: 5.298318862915039
  batch 3000 loss: 5.298318862915039
LOSS train 5.298318862915039 valid 5.2983317375183105
BAT

## Define and train ResNet

In [14]:
#Define basic block for ResNet
class BasicBlock(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        expansion: int = 1,
        downsample: nn.Module = None
    ) -> None:
        super(BasicBlock, self).__init__()
        # Multiplicative factor for the subsequent conv2d layer's output channels.
        # It is 1 for ResNet18 and ResNet34.
        self.expansion = expansion
        self.downsample = downsample
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels*self.expansion,
            kernel_size=3,
            padding=1,
            bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels*self.expansion)
    def forward(self, x: Tensor) -> Tensor:
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return  out

In [15]:
class ResNet(nn.Module):
    def __init__(
        self,
        img_channels: int,
        num_layers: int,
        block: Type[BasicBlock],
        num_classes: int  = 200
    ) -> None:
        super(ResNet, self).__init__()
        if num_layers == 18:
            # The following `layers` list defines the number of `BasicBlock`
            # to use to build the network and how many basic blocks to stack
            # together.
            layers = [2, 2, 2, 2]
            self.expansion = 1

        self.in_channels = 64
        # All ResNets (18 to 152) contain a Conv2d => BN => ReLU for the first
        # three layers. Here, kernel size is 7.
        self.conv1 = nn.Conv2d(
            in_channels=img_channels,
            out_channels=self.in_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False
        )
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512*self.expansion, num_classes)
    def _make_layer(
        self,
        block: Type[BasicBlock],
        out_channels: int,
        blocks: int,
        stride: int = 1
    ) -> nn.Sequential:
        downsample = None
        if stride != 1:
            """
            This should pass from `layer2` to `layer4` or
            when building ResNets50 and above. Section 3.3 of the paper
            Deep Residual Learning for Image Recognition
            (https://arxiv.org/pdf/1512.03385v1.pdf).
            """
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels,
                    out_channels*self.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False
                ),
                nn.BatchNorm2d(out_channels * self.expansion),
            )
        layers = []
        layers.append(
            block(
                self.in_channels, out_channels, stride, self.expansion, downsample
            )
        )
        self.in_channels = out_channels * self.expansion
        for i in range(1, blocks):
            layers.append(block(
                self.in_channels,
                out_channels,
                expansion=self.expansion
            ))
        return nn.Sequential(*layers)
    def forward(self, x: Tensor) -> Tensor:
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        # The spatial dimension of the final layer's feature
        # map should be (7, 7) for all ResNets.
        #print('Dimensions of the last convolutional feature map: ', x.shape)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Test the model
tensor = torch.rand([1, 3, 224, 224]).to(device)

model = ResNet(img_channels=3, num_layers=18, block=BasicBlock, num_classes=200).to(device)
print(model)

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")
output = model(tensor)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [17]:
#Define hyperparameters
EPOCHS = 2
learning_rates = [0.1, 0.001, 0.0001]
batch_sizes = [16, 32, 64]

In [18]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(train_loader):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs.to(device))

        # Compute the loss and its gradients
        loss = criterion(outputs, labels.to(device))
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [None]:
for lr in learning_rates:
    #Set optimizer with given learning rate
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    print(f'LEARNING RATE: {lr}')
    for batch_size in batch_sizes:
      print(f'BATCH SIZE: {batch_size}')
      #Set loaders with given batch size
      train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
      test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


      timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
      writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
      epoch_number = 0

      best_vloss = 1_000_000.

      for epoch in range(EPOCHS):
          print('EPOCH {}:'.format(epoch_number + 1))

          # Make sure gradient tracking is on, and do a pass over the data
          model.train(True)
          avg_loss = train_one_epoch(epoch_number, writer)


          running_vloss = 0.0
          # Set the model to evaluation mode, disabling dropout and using population
          # statistics for batch normalization.
          model.eval()

          # Disable gradient computation and reduce memory consumption.
          with torch.no_grad():
              for i, vdata in enumerate(test_loader):
                  vinputs, vlabels = vdata
                  voutputs = model(vinputs.to(device))
                  vloss = criterion(voutputs, vlabels.to(device))
                  running_vloss += vloss

          avg_vloss = running_vloss / (i + 1)
          print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

          # Log the running loss averaged per batch
          # for both training and validation
          writer.add_scalars('Training vs. Validation Loss',
                          { 'Training' : avg_loss, 'Validation' : avg_vloss },
                          epoch_number + 1)
          writer.flush()

          # Track best performance, and save the model's state
          if avg_vloss < best_vloss:
              best_vloss = avg_vloss
              model_path = 'model_{}_{}'.format(timestamp, epoch_number)
              torch.save(model.state_dict(), model_path)

          epoch_number += 1

LEARNING RATE: 0.1
BATCH SIZE: 16
EPOCH 1:
  batch 1000 loss: 5.3399882373809815
  batch 2000 loss: 5.186742506027222
  batch 3000 loss: 5.085517208576203
  batch 4000 loss: 4.9906835675239565
  batch 5000 loss: 4.8511982479095455
  batch 6000 loss: 4.770976055860519
LOSS train 4.770976055860519 valid 6.9782304763793945
EPOCH 2:
  batch 1000 loss: 4.6696875741481785
  batch 2000 loss: 4.564825007438659
  batch 3000 loss: 4.512168281078338
  batch 4000 loss: 4.466344439029694
  batch 5000 loss: 4.392468120574951
  batch 6000 loss: 4.344879398822784
LOSS train 4.344879398822784 valid 6.855801582336426
BATCH SIZE: 32
EPOCH 1:
  batch 1000 loss: 3.995617347240448
  batch 2000 loss: 3.910241993188858
  batch 3000 loss: 3.8595100293159486
LOSS train 3.8595100293159486 valid 7.487232685089111
EPOCH 2:
  batch 1000 loss: 3.7645210614204405
  batch 2000 loss: 3.7261436128616334
  batch 3000 loss: 3.6837412023544314
LOSS train 3.6837412023544314 valid 7.159038543701172
BATCH SIZE: 64
EPOCH 1:
  

Because of lack of computing power we have only used 2 epochs, this leads to high losses and not accurate results for a real world test. However we can deduce that some hyperparameters had better results for the epochs and batches than others ie. they converged faster. We also had problems with convergence on the AlexNet not converging, this could possibly be solved by altering the optimizer parameters for instance the weight decay.