In [1]:
# Loading dataset
from tensorflow.keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [2]:
# Device Agnostic code
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.get_device_name()

'NVIDIA GeForce MX350'

In [3]:
# Dataset class
from torch.utils.data import Dataset, DataLoader

class CustomMNIST(Dataset):
    def __init__(self, features, labels):
        super().__init__()
        self.features = torch.from_numpy(features).to(torch.float32) # NOTE: Datatype of features must be in float format
        self.labels = torch.from_numpy(labels).to(torch.long) # NOTE: Datatype of labels must be in long format (required for CrossEntropyLoss)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, index):
        return self.features[index], self.labels[index]

In [4]:
# Dataset objects
train_df = CustomMNIST(features=X_train, labels=y_train)
test_df = CustomMNIST(features=X_test, labels=y_test)

In [5]:
# Dataloader for train and test datasets
train_loader = DataLoader(
    dataset=train_df,
    batch_size=32,
    shuffle=True,
    drop_last=False
)

test_loader = DataLoader(
    dataset=test_df,
    batch_size=32,
    shuffle=True,
    drop_last=False
)

In [6]:
# Model Building 
from torch import nn
from torch.nn import Module

class CustomModel(Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=784, out_features=32),
            nn.GELU(),
            nn.Linear(in_features=32, out_features=16),
            nn.GELU(),
            nn.Linear(in_features=16, out_features=10)
        )

        self._initialize_weights()
    
    def _initialize_weights(self):
        """Initialize weights using He initialization for ReLU networks"""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def forward(self, features):
        return self.model(features)

In [7]:
# Defining parameters
learning_rate = 0.001
epochs = 10

In [8]:
# Model object and shifting to cuda
model = CustomModel().to(device)

In [9]:
# Defining loss functions
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr = learning_rate, betas=(0.9, 0.999), weight_decay = 0.01)

In [10]:
# Training loop
for epoch in range(epochs):
    total_loss = 0
    for batch_features, batch_labels in train_loader:
        # Shifting to GPU
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        # Forward propogation
        y_pred = model(batch_features)

        # Calculate loss
        loss = criterion(y_pred, batch_labels)

        # Backpropogation
        optimizer.zero_grad()
        loss.backward()

        # Update Gradients
        optimizer.step()

        # Calculating loss
        total_loss += loss.item()
    
    avg_loss = total_loss/len(train_loader)
    print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')

Epoch: 1 , Loss: 30.247188794581096
Epoch: 2 , Loss: 2.2808089444478354
Epoch: 3 , Loss: 2.226473060798645
Epoch: 4 , Loss: 2.1806199242273965
Epoch: 5 , Loss: 2.1594092527389526
Epoch: 6 , Loss: 2.0873604164123534
Epoch: 7 , Loss: 2.041416114807129
Epoch: 8 , Loss: 1.0762319830497107
Epoch: 9 , Loss: 0.4404040053407351
Epoch: 10 , Loss: 0.2947535122613112


In [11]:
# Enabling evaluation mode
model.eval()

CustomModel(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=32, bias=True)
    (2): GELU(approximate='none')
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): GELU(approximate='none')
    (5): Linear(in_features=16, out_features=10, bias=True)
  )
)

In [14]:
# evaluation on test data
total = 0
correct = 0

with torch.no_grad():
  for batch_features, batch_labels in test_loader:
    # Shift data over GPU's
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    # Calculating predictions
    outputs = model(batch_features)

    # Predicted classes
    _, predicted = torch.max(outputs, 1)

    total += batch_labels.shape[0]
    correct += (predicted == batch_labels).sum().item()

print(correct / total)

0.9268
