In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Define transformations (normalization and conversion to tensors)
transform = transforms.Compose([
    transforms.ToTensor(),  # Converts an image into a PyTorch tensor (values between 0 and 1)
    transforms.Normalize((0.1307,), (0.3081,))  # Standardizes pixel values
])

# Load MNIST dataset
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

# Create DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=1000, shuffle=False)

# Define CNN model
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)  # First convolution layer
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)  # Second convolution layer
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)  # Max pooling layer
        self.fc1 = nn.Linear(64 * 7 * 7, 128)  # Fully connected layer (input: flattened feature maps)
        self.fc2 = nn.Linear(128, 10)  # Output layer (10 classes for digits 0-9)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # First convolution + ReLU + MaxPool
        x = self.pool(F.relu(self.conv2(x)))  # Second convolution + ReLU + MaxPool
        x = x.view(-1, 64 * 7 * 7)  # Flatten the feature maps
        x = F.relu(self.fc1(x))  # Fully connected layer with ReLU
        x = self.fc2(x)  # Output layer
        return x

# Define the device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, loss function, and optimizer
model = CNN().to(device)
# CNN() is first called to create an instance of the model.
# Then, to(device) is called to move the model's weights to the GPU.
# The model is stored in an object after being moved to the GPU.

criterion = nn.CrossEntropyLoss()  # Loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0  # Variable to store total loss for the epoch
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)  # Move images and labels to the same device as the model

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model parameters

        running_loss += loss.item()  # Accumulate loss

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")  # Print epoch loss

print("Training complete!")  # Training finished

# Evaluate the model
model.eval()  # Set model to evaluation mode
correct = 0  # Counter for correctly predicted samples
total = 0  # Counter for total samples

with torch.no_grad():  # Disable gradient calculations for evaluation
    for images, labels in test_loader:  # Iterate over test data
        images, labels = images.to(device), labels.to(device)  # Move data to the same device as the model
        outputs = model(images)  # Get model predictions
        _, predicted = torch.max(outputs.detach(), 1)  # Get the class with the highest probability
        total += labels.size(0)  # Update total sample count
        correct += (predicted == labels).sum().item()  # Update correct predictions count

# Inside torch.no_grad(), gradients are not computed, so there's no need to maintain the computation graph.
# Using .detach() or .data on outputs prevents additional memory usage.
# Prevents unnecessary gradient updates.

# Since this runs inside torch.no_grad(), **automatic differentiation (autograd)** is disabled.
# However, using .data may cause unexpected side effects.
# The safer approach is to use outputs.detach().
print(f"Test Accuracy: {100 * correct / total:.2f}%")  # Print test accuracy

# Save the trained model
torch.save(model.state_dict(), "cnn_mnist.pth")

# Load the model
model.load_state_dict(torch.load("cnn_mnist.pth"))
model.eval()


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 53.7MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 1.75MB/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 14.6MB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 6.26MB/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

Epoch [1/5], Loss: 0.1299
Epoch [2/5], Loss: 0.0437
Epoch [3/5], Loss: 0.0281
Epoch [4/5], Loss: 0.0221
Epoch [5/5], Loss: 0.0160
Training complete!
Test Accuracy: 99.05%


  model.load_state_dict(torch.load("cnn_mnist.pth"))


CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

In [None]:
#What is transforms.Compose
#transforms.Compose() is a function that chains multiple transformations together.
#It allows us to apply multiple preprocessing steps to an image sequentially.

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])


In [None]:
#What does ToTensor() do?
transforms.ToTensor()

#Converts a PIL image (or NumPy array) into a PyTorch tensor.
#Scales pixel values from [0, 255] → [0, 1] (floating-point numbers).

In [None]:
#Before ToTensor()
##A grayscale image has pixel values from 0 to 255.
[[  0,  128,  255],
 [ 50,  200,  100],
 [ 30,  90,  180]]


In [None]:
#After ToTensor()
##Pixel values are normalized to [0, 1] by dividing by 255
 [[  0.0,   0.5,  1.0],
 [ 0.2,  0.78,  0.39],
 [ 0.12,  0.35,  0.71]]

In [None]:
#What does Normalize((0.1307,), (0.3081,)) do?

#Normalizes the image using mean and standard deviation

In [12]:
import torch
import torch.nn as nn

# Define predictions and actual labels
outputs = torch.tensor([[2.5, 0.3, 0.2], [0.1, 2.2, 1.8]])  # (batch_size=2, num_classes=3)
labels = torch.tensor([0, 1])  # Ground truth labels

# Define loss function (CrossEntropyLoss)
criterion = nn.CrossEntropyLoss()
loss = criterion(outputs, labels)

print(loss)         # tensor(1.1234, grad_fn=<NllLossBackward0>)
print(loss.item())  # 1.1234  (float value)

tensor(0.3876)
0.38763153553009033


In [5]:
#outputs = model(images) → Model Makes Predictions
outputs = model(images)

#The input images (images) are passed through the model (CNN network) to obtain predicted values (outputs).
#outputs is a tensor with shape (batch_size, num_classes).
#For example, if the batch size is 4 and there are 3 classes:
outputs.shape = (4, 3)
#Each row represents the logits (class scores) predicted by the model for each image.

#Example output (outputs)
tensor([[1.2, 2.5, 0.8],  # Class scores for the first image
        [0.9, 1.8, 3.1],  # Class scores for the second image
        [2.1, 0.7, 1.5],  # Class scores for the third image
        [0.4, 1.3, 2.2]]) # Class scores for the fourth image
#Each image has 3 class scores, and the class with the highest score is the predicted label.

0
1
2
3
4


In [None]:
_, predicted = torch.max(outputs.data, 1)

# `torch.max(outputs.data, 1)` returns the index of the class with the highest score for each sample.
# It returns two values:
# `_` → The predicted score (not used here)
# `predicted` → The predicted class index (the class with the highest score)

# Inside `torch.no_grad()`, gradients are not computed, so there is no need to maintain the computation graph.
# Using `.detach()` or `.data` on `outputs` prevents additional memory usage.
# Prevents unnecessary gradient updates.

# Since this runs inside `torch.no_grad()`, **automatic differentiation (autograd)** is disabled.
# However, using `.data` may cause unexpected side effects.
# The safer approach is to use `outputs.detach()`.


In [11]:
labels = torch.tensor([1, 2, 0, 1])   # Actual labels (ground truth)
predicted = torch.tensor([1, 2, 0, 2])  # Model's predicted values

print(predicted == labels)  # tensor([True, True, True, False]) - Comparison result
print((predicted == labels).sum())  # tensor(3) - Number of correct predictions
print((predicted == labels).sum().item())  # 3 (integer value)

tensor([ True,  True,  True, False])
tensor(3)
3
