In [None]:
#Imports

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision import datasets
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import StratifiedKFold
import numpy as np
from torch.utils.data import DataLoader, Subset
import seaborn as sns
from sklearn.metrics import confusion_matrix

These are the imports for torch, its functional API, the torchvision transforms and datasets, and matplotlib. ImageFolder (from `datasets`) reads the images from their class folders, DataLoader serves them in batches, and Subset creates a different subset (with the images and labels) for each fold. Stratified K-fold is a common method of cross-validation that keeps the class proportions the same in every fold. All imports are used here. Seaborn is used later for the confusion matrix.

In [None]:
#Defining the CNN model - PyTorch

class SimpleCNN(nn.Module):
    """Seven-stage convolutional image classifier.

    Every stage applies Conv2d (3x3, padding=1) -> BatchNorm2d -> ReLU ->
    2x2 max pooling. The channel count doubles at each stage (16 up to 1024)
    while the spatial size is halved. The flattened features then go through
    two fully connected layers, with dropout after the first one.

    ``fc1`` expects exactly 1024 input features, i.e. a 1x1 feature map after
    the seven poolings. A 128x128 input produces that; an input whose final
    feature map is not 1x1 will fail at ``fc1``.

    Args:
        num_classes: Number of output logits produced by ``fc2``. Defaults to 7.
        dropout_p: Dropout probability applied after ``fc1``. Defaults to 0.3.
    """

    def __init__(self, num_classes=7, dropout_p=0.3):
        super(SimpleCNN, self).__init__()

        #I implemented 7 convolutional layers, each with batch normalization
        # NOTE: attribute names and creation order are kept stable so that
        # existing checkpoints (state_dict keys) and seeded initialisation
        # keep working.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        self.conv5 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(256)
        self.conv6 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1)
        self.bn6 = nn.BatchNorm2d(512)
        self.conv7 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=1)
        self.bn7 = nn.BatchNorm2d(1024)

        # One pooling module is shared by all stages (it has no parameters).
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head: 1024 channels x 1 x 1 spatial positions -> 2048 -> logits.
        self.fc1 = nn.Linear(1024 * 1 * 1, 2048)
        self.fc2 = nn.Linear(2048, num_classes)
        self.dropout = nn.Dropout(dropout_p)

    #This is the forward propagation/run of the model
    def forward(self, x):
        """Run a batch of images through the network.

        Args:
            x: Image batch with 3 channels, laid out as (batch, 3, H, W).

        Returns:
            Tensor of shape (batch, num_classes) holding raw logits; no
            softmax is applied here.
        """
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = self.pool(F.relu(self.bn4(self.conv4(x))))
        x = self.pool(F.relu(self.bn5(self.conv5(x))))
        x = self.pool(F.relu(self.bn6(self.conv6(x))))
        x = self.pool(F.relu(self.bn7(self.conv7(x))))
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(x.size(0), -1)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

This block contains the actual CNN. I created a simple CNN from nn.Module with 7 convolutional layers, each followed by batch normalization, ReLU, and 2x2 max pooling, doubling the number of channels at every layer (16 up to 1024). Since each pooling halves the image, a 128x128 input is reduced to 1x1 after the seventh layer. Finally, I flattened the result and used two fully connected layers, with 30% dropout after the first one, to convert it into 7 outputs for the classes.

In [None]:
#Transforming the training set for better generalization
# Steps shared by both pipelines: a fixed 128x128 size, and per-channel
# normalization with mean 0.5 / std 0.5 (maps ToTensor's [0, 1] range to [-1, 1]).
_resize_step = transforms.Resize((128, 128))
_normalize_step = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

_train_steps = [
    _resize_step,
    # Geometric augmentation (applied to the PIL image)
    transforms.RandomHorizontalFlip(p=0.1),
    transforms.RandomRotation(10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    # Photometric augmentation
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5),
    transforms.RandomGrayscale(p=0.1),
    transforms.RandomPerspective(distortion_scale=0.5, p=0.1),
    transforms.ToTensor(),
    # RandomErasing works on tensors, so it has to come after ToTensor
    transforms.RandomErasing(p=0.2, scale=(0.02, 0.1)),
    _normalize_step,
]
transform_train = transforms.Compose(_train_steps)

#Validation set doesn't get as much transformation
# (only the deterministic steps: resize, tensor conversion, normalization)
transform_val = transforms.Compose([_resize_step, transforms.ToTensor(), _normalize_step])

#Importing the images, specifying folds, paths, labels, etc...
train_dataset = datasets.ImageFolder(
    root='../Project 1 - Part 1/archive/images/train',
    transform=transform_train,
)
val_dataset = datasets.ImageFolder(
    root='../Project 1 - Part 1/archive/images/validation',
    transform=transform_val,
)

# Inputs for the stratified split: one row per training image, plus its class index.
num_folds = 5
image_paths = np.array(train_dataset.imgs)
labels = np.array(train_dataset.targets)
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

These are the transformations. Because the dataset is a little irregular on the students' part, I needed to add a lot of augmentation (random transformations of the training images) so that a model trained on the training set would generalize properly to the validation set.

With train/val_dataset, I load the images using ImageFolder, and then set up everything the cross-validation needs: the image paths, the label of each image, and the number of folds.

In [None]:
#Ensuring I use CUDA because I will need it!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True  
scaler = torch.amp.GradScaler('cuda') 

#Adding some class weights for better balancing/results due to the dataset
class_counts = [3993, 436, 4103, 7164, 4982, 4938, 3205]
class_weights = torch.FloatTensor([sum(class_counts) / count for count in class_counts]).to(device)

I started by enabling CUDA when it is available, which helps me run the model faster (even with CUDA, it takes about 7 hours on my PC). The classes in the dataset are imbalanced, so I gave them class weights (the total number of images divided by each class's count) so that rarer classes count more in the loss. I also created the CUDA GradScaler, which is used for mixed-precision training and again helps with processing power/efficiency. Now, it is time to run the model.

In [None]:
#Make the empty arrays for losses/accuracies
# These are flat histories: every epoch of every fold is appended in order.
train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []

# Second view of the training images that uses the deterministic validation
# transform, so held-out folds are scored on un-augmented images. ImageFolder
# lists files in sorted order, so indices refer to the same images in both views;
# the assert guards against the directory changing between the two scans.
fold_eval_dataset = datasets.ImageFolder(root=train_dataset.root, transform=transform_val)
assert fold_eval_dataset.samples == train_dataset.samples, \
    "training folder changed between scans; fold indices would not line up"

# Take the stratification targets straight from the dataset (and never rebind them
# inside the batch loops) so this cell can be re-run without depending on leftover state.
fold_targets = np.asarray(train_dataset.targets)

# Mixed precision only applies on CUDA; on CPU the autocast context is a no-op.
use_amp = device.type == "cuda"

for fold, (train_idx, val_idx) in enumerate(skf.split(image_paths, fold_targets)):
    print(f"Training on fold {fold + 1}/{num_folds}...")

    #Creating the subsets from dataloader
    # Training fold: augmented view. Held-out fold: deterministic view.
    train_subset = Subset(train_dataset, train_idx)
    val_subset = Subset(fold_eval_dataset, val_idx)
    train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=32, shuffle=False)

    #Model details, like including class weights, Adam, LR, weight decay, a scheduler (super helpful), some patience for early stop, and my best loss.
    # A fresh model/optimizer/scheduler is created for every fold.
    model = SimpleCNN().to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
    patience = 5
    best_val_loss = float('inf')
    counter = 0

    #Normal loop pretty much
    num_epochs = 50
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for images, targets in train_loader:
            images, targets = images.to(device), targets.to(device)
            optimizer.zero_grad()
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(images)
                loss = criterion(outputs, targets)

            # Scale the loss before backward, then let the scaler drive the optimizer step.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

        #Gets my losses and accuracies for the training set
        # Loss is the mean of per-batch losses; accuracy is a percentage.
        train_loss = running_loss / len(train_loader)
        train_acc = 100 * correct / total

        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for images, targets in val_loader:
                images, targets = images.to(device), targets.to(device)

                with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                    outputs = model(images)
                    loss = criterion(outputs, targets)

                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

        #Gets my losses and accuracies for the validation set
        val_loss /= len(val_loader)
        val_acc = 100 * correct / total

        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        #Early stopping time
        # Stop the fold after `patience` consecutive epochs without a new best validation loss.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered.")
                break

        scheduler.step()


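This is the actual training loop. It goes through each fold of the cross-validation, trains a fresh model on the training part of the fold, and evaluates it on the held-out part after every epoch. I used the cross-entropy loss function (with the class weights) to compute the gradients. There is also early stopping, which ends a fold when the validation loss has not improved for 5 epochs in a row. Many things are in play here, like weight decay, the learning rate, Adam, a scheduler, and early stopping.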

In [None]:
# Save a timestamped checkpoint of whatever model/optimizer the training cell
# left behind (the last fold's model at its last epoch, with that epoch's
# validation loss).
timestamp = time.strftime('%Y%m%d_%H%M%S')
checkpoint_filename = f'checkpoint_epoch_{epoch}_{timestamp}.pth'

# `epoch` is the 0-based loop index, i.e. one less than the number shown in the training log.
checkpoint_payload = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': val_loss,
}
torch.save(checkpoint_payload, checkpoint_filename)


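This is where I save a checkpoint: the model weights, the optimizer state, the epoch, and the validation loss, written to a timestamped .pth file. Nothing crazy here.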

In [None]:
#checkpoint_path = 'checkpoint_epoch_1_20250320_171718.pth'
#checkpoint = torch.load(checkpoint_path)
#model.load_state_dict(checkpoint['model_state_dict'])

Here is my commented-out loading code. I don't need to load anything right now, so it's commented out.

In [None]:
# One x position per recorded epoch (the histories hold every epoch of every fold, in order).
epochs = range(1, len(train_losses) + 1)

#Make some plots!
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

#Loss plot
loss_ax.plot(epochs, train_losses, label='Training Loss', color='blue')
loss_ax.plot(epochs, val_losses, label='Validation Loss', color='red')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

#Accuracy plot
acc_ax.plot(epochs, train_accuracies, label='Training Accuracy', color='blue')
acc_ax.plot(epochs, val_accuracies, label='Validation Accuracy', color='red')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()

This code uses matplotlib to generate the plots. It shows the recorded losses (left) and accuracies (right) for every epoch, with training in blue and validation in red.

In [None]:
#Sets the model to evaluation mode and gets some empty arrays
# Uses the `model` and `val_loader` left behind by the training cell.
model.eval()
all_preds = []
all_labels = []

#Extends the empty arrays for each image through numpy (needed for my confusion matrix)
# The batch targets get their own name so the global `labels` array used for the
# cross-validation split is not overwritten by this cell.
with torch.no_grad():
    for images, targets in val_loader:
        images, targets = images.to(device), targets.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

#Confusion matrix!
# Pass the full list of class indices explicitly: otherwise the matrix only covers
# classes that actually occur in the labels/predictions, and its shape could
# disagree with the tick labels below.
class_names = train_dataset.classes
cm = confusion_matrix(all_labels, all_preds, labels=list(range(len(class_names))))

#Actually plotting the matrix
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

This is how I generated the confusion matrix - thank you sklearn!