# PyTorch Fashion MNIST Practice

Though this notebook covers a well-documented dataset for which plenty of examples already exist, I hope to demonstrate what I have learned so far about PyTorch and its architecture.

This notebook uses only PyTorch and its companion library torchvision; documentation can be found at https://pytorch.org/

## Import Libraries

In [1]:
import torch # PyTorch
import torch.nn as nn # Neural network module
import torch.optim as optim # Optimization module
import torch.nn.functional as F # Functional module (activation functions, etc.)
from torchvision import datasets, transforms # Datasets and transformations
from torch.utils.data import DataLoader # Data loader

## Create Class for the Model

For this model I decided to use an architecture built on convolutional (CNN) layers, as they are commonly used when analyzing images.

In [2]:
# Create a CNN class for the model
class CNN(nn.Module):
    """Convolutional classifier for 1-channel 28x28 images with 10 output classes.

    Three unpadded 3x3 convolutions (each followed by batch normalization and
    an activation) feed a single 2x2 max pool and a three-layer fully
    connected head. The forward pass returns raw class scores (logits).
    """

    def __init__(self):
        # Call the constructor of the parent class (nn.Module)
        super().__init__()
        # First convolutional layer: 1 input channel, 32 output channels, 3x3 kernel
        # The 3x3 kernel is a common choice for image processing tasks:
        # small enough to capture local patterns, and stacking several of them
        # lets the network capture more complex patterns
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        # Batch normalization for the first convolutional layer
        self.bn1 = nn.BatchNorm2d(32)
        # Second convolutional layer: 32 input channels, 64 output channels, 3x3 kernel
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        # Batch normalization for the second convolutional layer
        self.bn2 = nn.BatchNorm2d(64)
        # Third convolutional layer: 64 input channels, 128 output channels, 3x3 kernel
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3)
        # Batch normalization for the third convolutional layer
        self.bn3 = nn.BatchNorm2d(128)
        # First fully connected layer
        # Its input size is 128 channels x 11 x 11 spatial positions.
        # Each convolution maps size -> (size - kernel_size + 2*padding) / stride + 1
        # with padding 0 and stride 1, so 28 -> 26 -> 24 -> 22, and the 2x2 max
        # pool applied in forward() halves 22 to 11.
        self.fc1 = nn.Linear(128 * 11 * 11, 256)
        # Dropout layer with a dropout probability of 0.5
        self.dropout = nn.Dropout(0.5)
        # Fully connected layer: input size 256, output size 128
        self.fc2 = nn.Linear(256, 128)
        # Fully connected layer: input size 128, output size 10 (number of classes)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Return class logits of shape (batch, 10) for a (batch, 1, 28, 28) input.

        Raises:
            RuntimeError: if the spatial size of ``x`` does not produce the
                128 * 11 * 11 features expected by the first dense layer.
        """
        # Apply first convolutional layer, followed by batch normalization and ReLU activation
        x = F.relu(self.bn1(self.conv1(x)))
        # Apply second convolutional layer, followed by batch normalization and ReLU activation
        x = F.relu(self.bn2(self.conv2(x)))
        # Apply third convolutional layer, followed by batch normalization and leaky ReLU activation
        # Leaky ReLU is similar to ReLU, but allows a small gradient when the input is negative
        x = F.leaky_relu(self.bn3(self.conv3(x)))
        # Apply max pooling with a 2x2 kernel
        # This reduces the size of the tensor by half in both dimensions (width and height)
        x = F.max_pool2d(x, 2)
        # Flatten everything except the batch dimension.
        # Keeping the batch dimension fixed (instead of view(-1, features)) means a
        # wrongly sized input fails loudly at fc1 rather than being silently
        # reinterpreted as a larger batch.
        x = torch.flatten(x, 1)
        # Apply first fully connected layer followed by ReLU activation
        x = F.relu(self.fc1(x))
        # Apply dropout layer
        x = self.dropout(x)
        # Apply second fully connected layer followed by leaky ReLU activation
        x = F.leaky_relu(self.fc2(x))
        # Apply third fully connected layer and return the result (no activation function)
        x = self.fc3(x)
        return x

## Transformation of the data

In [3]:
# Build the augmentation pipeline applied to every training image
# Compose chains the listed transformations and runs them in order
transform = transforms.Compose([
    # Mirror the image left-to-right with probability 0.5
    transforms.RandomHorizontalFlip(p=0.5),
    # Rotate the image by an angle drawn at random from [-10, 10] degrees
    transforms.RandomRotation(degrees=10),
    # Convert the image to a PyTorch tensor
    transforms.ToTensor(),
])

## Download the data

In [4]:
# Fetch the FashionMNIST splits into the 'data' directory
# Only the training split goes through the augmentation pipeline;
# the test split is simply converted to tensors
train_data = datasets.FashionMNIST(
    'data',
    train=True,
    download=True,
    transform=transform,
)
test_data = datasets.FashionMNIST(
    'data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Wrap both splits in loaders that yield batches of 64 samples
# Training batches are shuffled; test batches keep the dataset order
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

## Create the Training and Testing Functions

In [5]:
# Instantiate the network
model = CNN()

# Loss: cross-entropy computed on the raw class scores
# (it combines the softmax activation and the negative log-likelihood loss)
criterion = nn.CrossEntropyLoss()

# Optimizer: AdamW (Adam with decoupled weight decay) over all model parameters
optimizer = torch.optim.AdamW(params=model.parameters(), lr=0.001)

# Scheduler: multiply the learning rate by 0.1 after every 5 scheduler steps
# NOTE: the schedule only advances when scheduler.step() is called (normally once per epoch)
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.1)

In [6]:
# Define the training function
# The function takes the model, data loader, criterion, optimizer, number of epochs,
# and (optionally) a learning rate scheduler as input
def train(model, train_loader, criterion, optimizer, epochs=5, scheduler=None):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: Module to optimize; it is switched to training mode.
        train_loader: Iterable yielding ``(images, labels)`` batches.
        criterion: Callable mapping ``(output, labels)`` to a scalar loss tensor.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over ``train_loader``.
        scheduler: Optional learning rate scheduler. When given, its
            ``step()`` method is called once at the end of every epoch.

    Returns:
        A list holding the average batch loss of each epoch (``nan`` for an
        epoch in which the loader produced no batches).
    """
    # Set model to training mode
    model.train()

    epoch_losses = []

    # Loop over the number of epochs
    for epoch in range(epochs):
        # Reset the accumulators at the beginning of each epoch
        running_loss = 0.0
        # Count batches while iterating so loaders without __len__ work too
        num_batches = 0

        # Loop over the data loader
        for images, labels in train_loader:
            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            output = model(images)
            loss = criterion(output, labels)

            # Backward pass: compute the gradient of the loss with respect to the model parameters
            loss.backward()

            # Update the model parameters based on the computed gradients
            optimizer.step()

            # Add the loss for the current batch to the running loss
            running_loss += loss.item()
            num_batches += 1

        # Average loss for the epoch; an empty loader yields nan instead of a ZeroDivisionError
        average_loss = running_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(average_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {average_loss}")

        # Advance the learning rate schedule once per epoch, after the optimizer updates
        if scheduler is not None:
            scheduler.step()

    return epoch_losses

In [7]:
# Define the testing function
def test(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and report classification accuracy.

    Args:
        model: Module mapping a batch of images to per-class scores.
        test_loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        The accuracy in percent (0-100), or ``nan`` when the loader yields no
        samples. The same value is printed.
    """
    # Set model to evaluation mode
    # This is necessary because some layers, such as dropout and batch normalization,
    # behave differently during training and testing:
    # dropout is disabled and batch normalization uses the running statistics
    # that were accumulated during training
    model.eval()

    # Initialize the number of correct predictions and the total number of predictions to 0
    correct = 0
    total = 0

    # Disable gradient calculation during testing
    with torch.no_grad():
        # Loop over the test data loader
        for images, labels in test_loader:
            # Forward pass
            outputs = model(images)
            # Index of the highest score along the class dimension is the predicted class
            predicted = outputs.argmax(dim=1)
            # Update the total number of predictions
            total += labels.size(0)
            # Update the number of correct predictions
            correct += (predicted == labels).sum().item()

    # An empty loader yields nan instead of a ZeroDivisionError
    accuracy = 100 * correct / total if total else float("nan")
    # Print the accuracy
    print(f'Accuracy: {accuracy}%')
    return accuracy

## Train and Test the Model!

In [8]:
# Run 5 training epochs, then report accuracy on the test set
train(model=model, train_loader=train_loader, criterion=criterion, optimizer=optimizer, epochs=5)
test(model=model, test_loader=test_loader)

Epoch 1/5, Loss: 0.6259899374041984
Epoch 2/5, Loss: 0.42411979856585136
Epoch 3/5, Loss: 0.3714698015023142
Epoch 4/5, Loss: 0.3326240563983602
Epoch 5/5, Loss: 0.31046547002788544
Accuracy: 90.65%


## Save the Model for Use in Distribution

In [9]:
# Persist the learned parameters (the state dictionary) to 'model.pth'
torch.save(obj=model.state_dict(), f='model.pth')

Some things I would do to improve the model are: train for more epochs to take advantage of lower learning rates, add more layers to the network, or normalize the data.