In [16]:
import numpy as np

# PIL (Python Imaging Library) is used here via its 'Image' module to load and process images
from PIL import Image

# PyTorch is a deep learning framework. The 'torch' base module provides core functionality like tensors and GPU acceleration
import torch

# 'nn' provides tools for building neural network layers and models
import torch.nn as nn

# 'F' contains functional versions of neural network layers and common functions like activation functions
import torch.nn.functional as F

# 'optim' provides optimization algorithms like SGD, Adam, etc., used for training models
import torch.optim as optim

# torchvision is a PyTorch library that provides datasets, model architectures, and image transformations
import torchvision

# 'transforms' provides common image preprocessing operations like resizing, normalization, and converting images to tensors
import torchvision.transforms as transforms

In [17]:
# Preprocessing for training images: random augmentation first, then tensor
# conversion and per-channel normalization.
train_transform = transforms.Compose(
    [
        # Mirror the image left-to-right half of the time (default probability 0.5).
        transforms.RandomHorizontalFlip(),
        # Pad 4 pixels on every side, then cut a random 32x32 window out of the padded image.
        transforms.RandomCrop(32, padding=4),
        # PIL image -> PyTorch tensor.
        transforms.ToTensor(),
        # For each of the R, G, B channels: subtract the mean (0.5), divide by the std (0.5).
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Preprocessing for test images: no augmentation, only the deterministic steps,
# with the same normalization statistics as training.
test_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

In [18]:
# Directory the CIFAR-10 files are saved to / loaded from. Defined once so the
# training and test splits are guaranteed to use the same location.
DATA_ROOT = r"C:\Users\isaia\Documents\ImageClassification_project"

# CIFAR-10 training split.
# - root: directory where the dataset will be saved or loaded from
# - train=True: load the training data split
# - transform: preprocessing applied to each image when it is accessed
# - download=True: download the dataset if it's not already present
train_data = torchvision.datasets.CIFAR10(
    root=DATA_ROOT,
    train=True,
    transform=train_transform,
    download=True,
)

# CIFAR-10 test split (train=False), preprocessed without augmentation.
test_data = torchvision.datasets.CIFAR10(
    root=DATA_ROOT,
    train=False,
    transform=test_transform,
    download=True,
)

# DataLoader for training.
# - batch_size=32: feed data in mini-batches of 32 images
# - shuffle=True: reshuffle the data at every epoch
# - num_workers=2: use 2 worker processes to load data in parallel
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=32,
    shuffle=True,
    num_workers=2,
)

# DataLoader for evaluation. Shuffling is turned off: the order of the test
# samples has no effect on the measured accuracy, and a fixed order keeps
# evaluation runs repeatable.
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=32,
    shuffle=False,
    num_workers=2,
)

In [19]:
# Fetch the first sample of the training dataset by indexing it.
# - 'image': the tensor produced by train_transform (augmentation, ToTensor, Normalize)
# - 'label': the integer class label that belongs to that image
# train_transform contains random flip/crop steps, so indexing the same sample
# again can return a differently augmented tensor.
image, label = train_data[0]

In [20]:
image.size() # Shape of the sample tensor: 3 channels (RGB), each 32 x 32 pixels

torch.Size([3, 32, 32])

In [21]:
# Human-readable names for the CIFAR-10 labels; the list position is the
# integer class index (0 -> "plane", ..., 9 -> "truck").
class_names = [
    "plane",
    "car",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NeuralNet(nn.Module):
    """Small convolutional classifier for 3-channel 32x32 images (CIFAR-10 sized).

    Architecture (shapes are per image, without the batch dimension):

        input                      (3, 32, 32)
        conv1 5x5 -> ReLU -> pool  (12, 14, 14)
        conv2 5x5 -> ReLU -> pool  (24, 5, 5)
        conv3 3x3 -> ReLU -> pool  (48, 1, 1)
        flatten                    (48,)
        fc1 -> ReLU -> dropout     (120,)
        fc2 -> ReLU                (84,)
        fc3                        (num_classes,)  raw logits

    Args:
        num_classes: Number of output scores produced by the last layer.
            Defaults to 10, one per CIFAR-10 class.
        dropout_p: Probability with which the dropout layer zeroes an element
            during training. Defaults to 0.5.
    """

    def __init__(self, num_classes: int = 10, dropout_p: float = 0.5):
        super().__init__()

        # Convolutional layers extract spatial features: they slide small
        # filters over the image to detect patterns like edges, textures and shapes.
        # Output size of a conv/pool layer without padding:
        #   output_size = (input_size - kernel_size) / stride + 1

        # 1st convolution: 3 input channels (RGB), 12 filters of size 5x5.
        # (32 - 5) / 1 + 1 = 28  ->  (12, 28, 28)
        self.conv1 = nn.Conv2d(3, 12, 5)

        # 2x2 max pooling with stride 2, reused after every convolution.
        # It keeps the largest value of each window, which reduces computation
        # and makes features more robust to small shifts.
        # (28 - 2) / 2 + 1 = 14  ->  (12, 14, 14)
        self.pool = nn.MaxPool2d(2, 2)

        # 2nd convolution: 12 -> 24 channels, 5x5 filters.
        # (24, 10, 10) before pooling, (24, 5, 5) after.
        self.conv2 = nn.Conv2d(12, 24, 5)

        # 3rd convolution: 24 -> 48 channels, 3x3 filters.
        # (48, 3, 3) before pooling, (48, 1, 1) after.
        self.conv3 = nn.Conv2d(24, 48, 3)

        # Dropout randomly zeroes elements with probability dropout_p while
        # training, which helps against overfitting.
        self.dropout = nn.Dropout(p=dropout_p)

        # Fully connected head: maps the flattened features to class scores.
        # fc1 takes the 48 * 1 * 1 flattened values and outputs 120.
        self.fc1 = nn.Linear(48 * 1 * 1, 120)
        # fc2 reduces 120 values to 84.
        self.fc2 = nn.Linear(120, 84)
        # fc3 outputs one raw score (logit) per class.
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Run a batch of images through the network.

        Args:
            x: Tensor of shape (batch_size, 3, 32, 32); the layer sizes above
                are derived for this input size.

        Returns:
            Tensor of shape (batch_size, num_classes) holding raw logits
            (no softmax applied).
        """
        # Each stage: convolution -> ReLU (negative values become 0, adding
        # non-linearity) -> 2x2 max pooling (halves height and width).
        x = self.pool(F.relu(self.conv1(x)))  # -> (12, 14, 14)
        x = self.pool(F.relu(self.conv2(x)))  # -> (24, 5, 5)
        x = self.pool(F.relu(self.conv3(x)))  # -> (48, 1, 1)

        # Flatten everything except the batch dimension:
        # (batch_size, 48, 1, 1) -> (batch_size, 48), ready for the linear layers.
        x = torch.flatten(x, 1)

        # fc1 + ReLU, followed by dropout to reduce overfitting.
        x = self.dropout(F.relu(self.fc1(x)))

        # fc2 + ReLU.
        x = F.relu(self.fc2(x))

        # Final layer: raw scores (logits), one per class.
        return self.fc3(x)



In [23]:
# Build a fresh NeuralNet instance; this initializes all of its layers.
net = NeuralNet()

# Loss: measures how well the model's predictions match the true labels.
# CrossEntropyLoss is suitable for multi-class classification tasks like CIFAR-10.
loss_function = nn.CrossEntropyLoss()

# Optimizer: Adam (adaptive moment estimation), which combines ideas from
# momentum and RMSProp.
# - net.parameters(): all learnable parameters (weights & biases) of the model
# - lr=0.001: learning rate, i.e. how far the parameters move on each step
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Alternative kept for reference: Stochastic Gradient Descent with momentum.
# momentum=0.9 smooths the updates and reduces oscillations.
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [24]:
# This is the training loop for the neural network.
# It runs for 75 epochs, meaning the model sees the full dataset 75 times.
# For each epoch, we loop over mini-batches of training data using train_loader.
# Inside the loop, we unpack each batch into inputs (images) and labels (true classes).
# We reset the gradients accumulated from the previous batch to zero before each update using optimizer.zero_grad().
# The model makes predictions on the inputs using net(inputs).
# We compute the loss between the predictions and the actual labels using CrossEntropyLoss.
# We then perform backpropagation with loss.backward() to compute gradients.
# optimizer.step() updates the model's weights using the gradients and learning rate.
# We accumulate the loss values over the epoch using running_loss.
# At the end of the epoch, we print the average loss over all batches to track progress.

In [25]:
# Put the model in training mode so dropout is active. A new NeuralNet starts
# in training mode, but an evaluation cell calls net.eval(); without this line,
# re-running this cell afterwards would silently train with dropout disabled.
net.train()

for epoch in range(75):
    print(f"Training epoch {epoch}...")

    # Sum of the per-batch losses of this epoch.
    running_loss = 0.0

    for inputs, labels in train_loader:
        # Clear the gradients left over from the previous batch.
        optimizer.zero_grad()

        # Forward pass and loss for this mini-batch.
        outputs = net(inputs)
        loss = loss_function(outputs, labels)

        # Backpropagate, then let the optimizer update the weights.
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Average loss over all batches of the epoch.
    print(f"Loss: {running_loss / len(train_loader):.4f}")

Training epoch 0...
Loss: 1.8398
Training epoch 1...
Loss: 1.5715
Training epoch 2...
Loss: 1.4620
Training epoch 3...
Loss: 1.3846
Training epoch 4...
Loss: 1.3343
Training epoch 5...
Loss: 1.2811
Training epoch 6...
Loss: 1.2594
Training epoch 7...
Loss: 1.2350
Training epoch 8...
Loss: 1.2077
Training epoch 9...
Loss: 1.1928
Training epoch 10...
Loss: 1.1734
Training epoch 11...
Loss: 1.1668
Training epoch 12...
Loss: 1.1519
Training epoch 13...
Loss: 1.1413
Training epoch 14...
Loss: 1.1385
Training epoch 15...
Loss: 1.1212
Training epoch 16...
Loss: 1.1177
Training epoch 17...
Loss: 1.1140
Training epoch 18...
Loss: 1.1049
Training epoch 19...
Loss: 1.0942
Training epoch 20...
Loss: 1.0946
Training epoch 21...
Loss: 1.0868
Training epoch 22...
Loss: 1.0814
Training epoch 23...
Loss: 1.0783
Training epoch 24...
Loss: 1.0748
Training epoch 25...
Loss: 1.0727
Training epoch 26...
Loss: 1.0619
Training epoch 27...
Loss: 1.0593
Training epoch 28...
Loss: 1.0582
Training epoch 29...
Los

In [26]:
# Write the trained parameters (the model's state_dict: weights and biases)
# to the file "trained_net.pth".
torch.save(
    net.state_dict(),
    r"C:\Users\isaia\Documents\ImageClassification_project\trained_net.pth",
)

In [27]:
# Rebuild the architecture with fresh weights, then overwrite them with the
# parameters stored in "trained_net.pth".
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
net = NeuralNet()
net.load_state_dict(
    torch.load(r"C:\Users\isaia\Documents\ImageClassification_project\trained_net.pth")
)

<All keys matched successfully>

In [28]:
# Switch to evaluation mode (turns off dropout, etc.) before measuring accuracy.
net.eval()

# Running counts: correctly classified samples / samples seen so far.
correct, total = 0, 0

# No gradients are needed for inference, so tracking is disabled for efficiency.
with torch.no_grad():
    for images, labels in test_loader:
        # Logits for every class, one row per image in the batch.
        outputs = net(images)

        # Index of the highest-scoring class in each row.
        predicted = torch.max(outputs, dim=1).indices

        # Update the tallies with this batch.
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

# Share of correct predictions, as a percentage.
accuracy = 100 * correct / total

print(f"Accuracy: {accuracy}%")


Accuracy: 69.37%


In [33]:
# Preprocessing for arbitrary image files:
# 1. resize to 32x32 pixels (the CIFAR-10 input size),
# 2. convert to a PyTorch tensor,
# 3. normalize the RGB channels to the range [-1, 1] using mean=0.5 and std=0.5.
new_transform = transforms.Compose(
    [
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

def load_image(image_path):
    """Load one image file and turn it into a single-image batch for the model.

    Args:
        image_path: Path of the image file to open.

    Returns:
        Tensor of shape [1, 3, 32, 32]: the image after new_transform, with a
        batch dimension added at index 0.
    """
    # The context manager closes the underlying file once the pixels are read.
    with Image.open(image_path) as source:
        # Force exactly three channels: grayscale, palette or RGBA files would
        # otherwise not match the 3-channel normalization and the first conv layer.
        rgb_image = source.convert("RGB")

    tensor = new_transform(rgb_image)  # resize, convert to tensor, normalize
    return tensor.unsqueeze(0)         # add batch dimension at index 0

# Files to classify, one per CIFAR-10 class.
image_paths = [
    r"C:\Users\isaia\Downloads\dog.jpg",
    r"C:\Users\isaia\Downloads\airplane.jpg",
    r"C:\Users\isaia\Downloads\horse.jpg",
    r"C:\Users\isaia\Downloads\deer.jpg",
    r"C:\Users\isaia\Downloads\ship.jpg",
    r"C:\Users\isaia\Downloads\truck.jpg",
    r"C:\Users\isaia\Downloads\bird.jpg",
    r"C:\Users\isaia\Downloads\cat.jpg",
    r"C:\Users\isaia\Downloads\car.jpg",
    r"C:\Users\isaia\Downloads\frog.jpg",
]

# Preprocess every file up front; each entry is a batch holding one image.
images = [load_image(path) for path in image_paths]

# Evaluation mode (disables dropout, etc.).
net.eval()

# Gradient tracking is not needed for inference (saves memory and computation).
with torch.no_grad():
    for image in images:
        # Forward pass: one row of class scores for this image.
        output = net(image)
        # Position of the highest score along the class dimension.
        predicted = torch.max(output, dim=1).indices
        # Translate the class index into its name.
        print(f"Prediction: {class_names[predicted.item()]}")


Prediction: dog
Prediction: plane
Prediction: deer
Prediction: deer
Prediction: bird
Prediction: truck
Prediction: bird
Prediction: cat
Prediction: car
Prediction: frog
