## Exercise 2: Train classifier on IMAGENET


Initilize the Libraries I may plan to use:



In [1]:
import time
from typing import List, Dict

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import os
from timm.data.mixup import Mixup
from torch.cuda.amp import autocast, GradScaler  # Enable mixed precision training if I choose to use it
import matplotlib.pyplot as plt

# Set Train and Test Functions:

In [8]:
def train(model: nn.Module,
          loss_fn: nn.modules.loss._Loss,
          optimizer: torch.optim.Optimizer,
          train_loader: torch.utils.data.DataLoader,
          epoch: int=0,
          checkpoint_path: str = '/content/drive/MyDrive/ColabNotebooks/GhostNet/test.pth') -> List:
    if os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from {checkpoint_path}...")
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']  # Resume from the saved epoch
        print(f"Resuming from epoch {epoch}")
    else:
        print("No checkpoint found, starting fresh.")
        epoch = 0  # Ensure the epoch starts from 0 if no checkpoint is found

    # Set the model to training mode:
    model.train()

    # Initialize the training loss list:
    train_loss = []
    # Get the batch value, input images, and targets and loop through each batch:
    for batch_idx, (images, targets) in enumerate(train_loader):
        # Move images and targets to GPU:
        images, targets = images.to(device), targets.to(device)

        # Forward pass: Get the output labels from the model
        outputs = model(images)

        # Calculate the loss using Cross Entropy:
        loss = loss_fn(outputs, targets)

        # Perform the backward pass and optimization step:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update the loss for the batch to the train_loss list:
        train_loss.append(loss.item())

        # Print the loss around every 100 batches for feedback:
        if (batch_idx + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}: [{batch_idx * len(images)}/{len(train_loader.dataset)}] Loss: {loss.item():.3f}')

    # Save checkpoint after each epoch:
    print(f'Saving checkpoint for epoch {epoch + 1}...')
    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss.item(),
    }
    torch.save(checkpoint, checkpoint_path)

    # Ensure the length of train_loss matches the number of batches for verification purposes:
    assert len(train_loss) == len(train_loader)

    return train_loss

def test(model: nn.Module,
         loss_fn: nn.modules.loss._Loss,
         test_loader: torch.utils.data.DataLoader,
         epoch: int = 0) -> Dict:
    # Set the model up in Eval Mode:
    model.eval()

    # Initialize the counters to determine test stats:
    correct, total_num, loss = 0, 0, 0
    predictions = []

    # Loop through the testing images and send them to the GPU to calculate the predictive labels:
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        #Move to GPU:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass through the model:
        outputs = model(inputs)

        # Get the predicted class labels and store them:
        _, predicted = torch.max(outputs, 1)
        predictions.append(predicted)

        # Calculate accuracy:
        correct += (predicted == targets).sum().item()
        total_num += targets.size(0)

        # Calculate loss:
        loss += loss_fn(outputs, targets).item()

    # Calculate the accuracy and average loss
    accuracy = correct / total_num
    avg_loss = loss / len(test_loader)

    # Put all predictions together:
    predictions = torch.cat(predictions, dim=0)

    # Create the Dictionary for test_set:
    test_stat = {
      'loss': avg_loss,         # Store average loss
      'accuracy': accuracy,     # Store accuracy
      'prediction': predictions # Store predictions
    }

    # Print test statistics
    print(f"Test result on epoch {epoch+1}: total sample: {total_num}, Avg loss: {test_stat['loss']:.3f}, Acc: {100*test_stat['accuracy']:.3f}%")

    # Assert the keys and prediction tensor shape:
    assert "loss" in test_stat.keys()
    assert "accuracy" in test_stat.keys()
    assert "prediction" in test_stat.keys()
    assert len(test_stat["prediction"]) == len(test_loader.dataset)
    assert isinstance(test_stat["prediction"], torch.Tensor)

    return test_stat

# Experiment #1:

Now that I have a baseline between the two models using CIFAR-10, I can use a pre-trained model on ImageNet (Original GhostNet Model) in order to look at how spacial awareness is influenced on a larger image and dataset.
* I did not find much if not any benifit in adding CBAM to the model instead of SE. This may be due to the very small architecture that makes up GhostNet.

* CIFAR-10 uses 32x32 images, so the spacial awareness component of my Reimplementation model is likely minimal if not even unproductive to the training process. Therefore, using a pretrainied model to assist in training GhostNet and GhostNet_N on a larger image will allow me to analyze the efficiency of spacial awareness on a larger image.

# Create Data Loaders for ImageNet:

In [3]:
# Establish Dataset and transforms for Images:
transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

#Mount Google Drive:
from google.colab import drive
drive.mount('/content/drive')

train_dataset = datasets.ImageFolder(root='/content/drive/MyDrive/ColabNotebooks/GhostNet/ImageNet/dir/train', transform=transform)
val_dataset = datasets.ImageFolder(root='/content/drive/MyDrive/ColabNotebooks/GhostNet/ImageNet/dir/val', transform=transform)

#Data Loaders:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=2, pin_memory=True)


Mounted at /content/drive


# Test the Original Model on ImageNet:

* This will use the pretrainied weights given by the author's of my reference source for the model.


In [5]:
#Set up the pre-trained path:
pretrained_path = "/content/drive/MyDrive/ColabNotebooks/GhostNet/models/state_dict_73.98.pth"

#Import the Original Author Model:
import sys
sys.path.append('/content/drive/MyDrive/ColabNotebooks/GhostNet')
from ghostnet import GhostNet     #Import Author's Model

#Original Model Configuration:
cfgs_original = [

        # stage1
        [[3,  16,  16, 0, 1]],
        # stage2
        [[3,  48,  24, 0, 2]],
        [[3,  72,  24, 0, 1]],
        # stage3
        [[5,  72,  40, 0.25, 2]],
        [[5, 120,  40, 0.25, 1]],
        # stage4
        [[3, 240,  80, 0, 2]],
        [[3, 200,  80, 0, 1],
         [3, 184,  80, 0, 1],
         [3, 184,  80, 0, 1],
         [3, 480, 112, 0.25, 1],
         [3, 672, 112, 0.25, 1]
        ],
        # stage5
        [[5, 672, 160, 0.25, 2]],
        [[5, 960, 160, 0, 1],
         [5, 960, 160, 0.25, 1],
         [5, 960, 160, 0, 1],
         [5, 960, 160, 0.25, 1]
        ]
    ]

# Initialize model (train from scratch as the author only has pretrained model on ImageNet)
model_original = GhostNet(cfgs_original, num_classes=10, width=1.0, dropout=0.2)

#Initialize model (No Training - Use pretrained weights)
model_orig_ImageNet = GhostNet(cfgs_original, num_classes=1000, width=1.0, dropout=0.2)

#Load pretrained weights
checkpoint = torch.load(pretrained_path, map_location='cpu')

#Load state_dict into model:
model_orig_ImageNet.load_state_dict(checkpoint)

#Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_orig_ImageNet = model_orig_ImageNet.to(device)

#Test the model on my ImageNet dataset (It is a subset of ImageNet):
#Set up criterion and psuedo epoch:
criterion = nn.CrossEntropyLoss()
epoch = 0

with torch.no_grad():
    output_orig = test(model_orig_ImageNet, criterion, val_loader, epoch)


Process Process-1:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 307, in _bootstrap
    self._after_fork()
  File "/usr/lib/python3.11/multiprocessing/context.py", line 228, in _after_fork
    return _default_context.get_context().Process._after_fork()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 342, in _after_fork
    util._run_after_forkers()
  File "/usr/lib/python3.11/multiprocessing/util.py", line 170, in _run_after_forkers
    func(obj)
  File "/usr/lib/python3.11/multiprocessing/resource_sharer.py", line 111, in _afterfork
    def _afterfork(self):
    
KeyboardInterrupt


KeyboardInterrupt: 

# Use the Pre-Trained model to train my Reimplementation on ImageNet:
* This will further help analyze spacial awareness on the model by training my model and using the pre-trained model as a backbone.

* The first configuration is the model used in CIFAR-10. This configuration has minimal spactial attention. A second optimized configuration will also be attempted to increase spacial awareness significantly to view its influence on the model.

In [38]:
#Reimplementation Model configuration:
cfgs_reImp = [
    # stage1
    [[3, 16, 16, 0, 1, 0]],
    # stage2
    [[3, 48, 24, 0, 2, 0]],
    [[3, 72, 24, 0, 1, 0]],
    # stage3
    [[5, 72, 40, 0.25, 2, 5]],
    [[5, 120, 40, 0.25, 1, 7]],
    # stage4
    [[3, 240, 80, 0, 2, 0]],
    [[3, 200, 80, 0, 1, 0],
     [3, 184, 80, 0, 1, 0],
     [3, 184, 80, 0, 1, 0],
     [3, 480, 112, 0.25, 1, 5],
     [3, 672, 112, 0.25, 1, 5]
     ],
    # stage5
    [[5, 672, 160, 0.25, 2, 3]],
    [[5, 960, 160, 0, 1, 0],
     [5, 960, 160, 0.25, 1, 3],
     [5, 960, 160, 0, 1, 0],
     [5, 960, 160, 0.25, 1, 3]
     ]
]

#Optimized CBAM Spacial Attention Configuration:
cfgs_cbam_opt = [
    # stage1 (No CBAM)
    [[3, 16, 16, 0, 1, 0]],

    # stage2 (No CBAM)
    [[3, 48, 24, 0, 2, 0]],
    [[3, 72, 24, 0, 1, 0]],

    # stage3 (Enable CBAM with small kernel)
    [[5, 72, 40, 0.25, 2, 5]],
    [[5, 120, 40, 0.25, 1, 5]],

    # stage4 (Enable CBAM with increasing kernels)
    [[3, 240, 80, 0.25, 2, 5]],
    [[3, 200, 80, 0.25, 1, 5],
     [3, 184, 80, 0.25, 1, 5],
     [3, 184, 80, 0.25, 1, 5],
     [3, 480, 112, 0.25, 1, 7],
     [3, 672, 112, 0.25, 1, 7]
     ],

    # stage5 (Heaviest CBAM — larger kernel size)
    [[5, 672, 160, 0.25, 2, 7]],
    [[5, 960, 160, 0.25, 1, 7],
     [5, 960, 160, 0.25, 1, 7],
     [5, 960, 160, 0.25, 1, 7],
     [5, 960, 160, 0.25, 1, 7]
     ]
]

#Set up reImp Model for ImageNet Configuration and set to GPU:
from ghostnetN1 import GhostNet_N  # Import Reimplementation Model
model_reImp_ImageNet= GhostNet_N(cfgs_cbam_opt, num_classes=1000, width=1.0, dropout=0.2)
model_reImp_ImageNet.to(device)

GhostNet_N(
  (conv_stem): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (blocks): Sequential(
    (0): Sequential(
      (0): GhostBottleneck(
        (ghost1): GhostModule(
          (primary_conv): Sequential(
            (0): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (cheap_operation): Sequential(
            (0): Conv2d(8, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=8, bias=False)
            (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
        )
        (ghost2): GhostModule(
          (primary_conv): Sequential(
            (0): Conv2d(16, 8, kernel_size=(1, 

# Initialize the pre-trained weights from the original model into the Reimplementation model:

In [39]:
#Load in the stat_dict from the original model for any layers that are similiar:
model_reImp_ImageNet.load_state_dict(model_orig_ImageNet.state_dict(), strict=False)

#Freeze the first layer of the Reimplementation model as they are the same and won't contain spacial attention:
for name, param in model_reImp_ImageNet.named_parameters():
    if name.startswith("conv_stem") or name.startswith("bn1"):
        param.requires_grad = False

#Train the Reimplementation Model using the state_dict of the Original model:
criterion = nn.CrossEntropyLoss()

# Path for saving and loading checkpoints of the Reimplementation:
# Path for minimal spacial model: "/content/drive/MyDrive/ColabNotebooks/GhostNet/ghostnet_cbam_ImageNet1.pth"
reImp_checkpoint_path = "/content/drive/MyDrive/ColabNotebooks/GhostNet/ghostnet_cbam_ImageNet_CBAM_opt.pth"

start = time.time()
start_epoch = 0         #NOTE: Accuracy was 57% for epoch 25 for "/content/drive/MyDrive/ColabNotebooks/GhostNet/ghostnet_cbam_ImageNet1.pth"
                        #NOTE: Accuracy was 48% for epoch 20 for "/content/drive/MyDrive/ColabNotebooks/GhostNet/ghostnet_cbam_ImageNet2.pth"
                        #NOTE: Accuracy was 58% for epoch 20 for "/content/drive/MyDrive/ColabNotebooks/GhostNet/ghostnet_cbam_ImageNet3.pth" (Lower Learning Rate)
                        #NOTE: Accuracy was 57% for epoch 30 for "/content/drive/MyDrive/ColabNotebooks/GhostNet/ghostnet_cbam_ImageNet3.pth", (I switched to ADAM for last 5 epoch)
                        #Note: I tried to optimize the impact of CBAM on spatial attention. After 20 epochs I got 47%. This shows for a lighter model it is inefficient. "/content/drive/MyDrive/ColabNotebooks/GhostNet/ghostnet_cbam_ImageNet_CBAM_opt.pth"
max_epoch = 20

#Set up the Classifier Function and optimizer:
classifier = model_reImp_ImageNet
optimizer = optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

for epoch in range(start_epoch, max_epoch):
    print(f"\n Epoch #:{epoch+1}/{max_epoch}")

    #Call the Train Function on the reImp model:
    train(classifier, criterion, optimizer, train_loader, epoch, reImp_checkpoint_path)
    scheduler.step()

end = time.time()
print(f'Finished Training after {end-start} s ')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing batch 532/543...
Processing batch 533/543...
Processing batch 534/543...
Processing batch 535/543...
Processing batch 536/543...
Processing batch 537/543...
Processing batch 538/543...
Processing batch 539/543...
Processing batch 540/543...
Processing batch 541/543...
Processing batch 542/543...
Processing batch 543/543...
Saving checkpoint for epoch 11...

 Epoch #:12/20
Loading checkpoint from /content/drive/MyDrive/ColabNotebooks/GhostNet/ghostnet_cbam_ImageNet_CBAM_opt.pth...
Resuming from epoch 11
About to start training loop...
Processing batch 1/543...
Processing batch 2/543...
Processing batch 3/543...
Processing batch 4/543...
Processing batch 5/543...
Processing batch 6/543...
Processing batch 7/543...
Processing batch 8/543...
Processing batch 9/543...
Processing batch 10/543...
Processing batch 11/543...
Processing batch 12/543...
Processing batch 13/543...
Processing batch 14/543...
Processing batc

# Test the Transfer Learning Model for ReImplementation:

In [42]:
#Test the transfer learning model on ImageNet of the ReImplementation Model:
#Note, this section and the previous can be rerun with different model configurations saved to model_reImp_ImageNet and different checkpoint paths for testing
criterion = nn.CrossEntropyLoss()
epoch = 20
test_accuracy_reImp_ImageNet = test(model_reImp_ImageNet,criterion, val_loader, epoch)

Test result on epoch 21: total sample: 3923, Avg loss: 2.669, Acc: 46.750%
