In [None]:
import random
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.datasets import FashionMNIST

In [None]:
# Use the first CUDA device when PyTorch reports one as available; otherwise run on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (the default generator plus the CUDA generators), then switches cuDNN to
    deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed forwarded unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Single seed value for the whole notebook; applied immediately so every
# later cell starts from a known RNG state.
SEED = 42
set_seed(seed=SEED)

In [None]:
# Build the train split first, then the test split; both are downloaded into
# ./data on first use and converted to tensors by ToTensor.
train_dataset, test_dataset = (
    FashionMNIST(root='./data', train=is_train, download=True, transform=transforms.ToTensor())
    for is_train in (True, False)
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26.4M/26.4M [00:03<00:00, 7.88MB/s]


Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29.5k/29.5k [00:00<00:00, 138kB/s]


Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4.42M/4.42M [00:01<00:00, 2.47MB/s]


Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5.15k/5.15k [00:00<00:00, 4.68MB/s]

Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw






In [None]:
# Fraction of the official training set kept for training; the remainder
# becomes the validation set.
train_ratio = 0.9
BATCH_SIZE = 32

train_size = int(train_ratio * len(train_dataset))
val_size = len(train_dataset) - train_size

# Use a dedicated, explicitly seeded generator for the split. Drawing from the
# global RNG would make the train/validation membership depend on how many
# random numbers earlier cells consumed, so re-running this cell in a live
# kernel would silently produce a different split.
split_generator = torch.Generator().manual_seed(SEED)
train_subset, val_subset = random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Train size: {len(train_subset)}")
print(f"Validation size: {len(val_subset)}")
print(f"Test size: {len(test_dataset)}")

Train size: 54000
Validation size: 6000
Test size: 10000


In [None]:
# class MLP(nn.Module):
#     def __init__(self, input_dims, hidden_dims, output_dims):
#         super(MLP, self).__init__()
#         self.layer1 = nn.Linear(input_dims, hidden_dims)
#         self.layer2 = nn.Linear(hidden_dims, hidden_dims)
#         self.layer3 = nn.Linear(hidden_dims, hidden_dims)
#         self.layer4 = nn.Linear(hidden_dims, hidden_dims)
#         self.layer5 = nn.Linear(hidden_dims, hidden_dims)
#         self.layer6 = nn.Linear(hidden_dims, hidden_dims)
#         self.layer7 = nn.Linear(hidden_dims, hidden_dims)
#         self.output = nn.Linear(hidden_dims, output_dims)

#         # # Iterate over all modules in the model
#         # for module in self.modules():
#         #     if isinstance(module, nn.Linear):
#         #         # Initialize weights with a normal distribution (mean=0.0, std=1.0)
#         #         nn.init.normal_(module.weight, mean=0.0, std=1.0)

#         #         # Initialize biases to a constant value of 0.0
#         #         nn.init.constant_(module.bias, 0.0)

#         # for module in self.modules():
#         #     if isinstance(module, nn.Linear):
#         #         nn.init.normal_(module.weight, mean=0.0, std=10.0)
#         #         nn.init.constant_(module.bias, 0.0)

#         for module in self.modules():
#             if isinstance(module, nn.Linear):
#                 nn.init.normal_(module.weight, mean=0.0, std=05.0)
#                 nn.init.constant_(module.bias, 0.0)


#     def forward(self, x):
#         x = nn.Flatten()(x)
#         x = self.layer1(x)
#         x = nn.Sigmoid()(x)
#         x = self.layer2(x)
#         x = nn.Sigmoid()(x)
#         x = self.layer3(x)
#         x = nn.Sigmoid()(x)
#         x = self.layer4(x)
#         x = nn.Sigmoid()(x)
#         x = self.layer5(x)
#         x = nn.Sigmoid()(x)
#         x = self.layer6(x)
#         x = nn.Sigmoid()(x)
#         x = self.layer7(x)
#         x = nn.Sigmoid()(x)
#         out = self.output(x)
#         return out

# class MLP(nn.Module):
#     def __init__(self, input_dims, hidden_dims, output_dims):
#         super(MLP, self).__init__()
#         self.hidden_dims = hidden_dims

#         # Define layers
#         self.layer1 = nn.Linear(input_dims, hidden_dims)
#         self.bn1 = nn.BatchNorm1d(hidden_dims)
#         self.layer2 = nn.Linear(hidden_dims, hidden_dims)
#         self.bn2 = nn.BatchNorm1d(hidden_dims)
#         self.layer3 = nn.Linear(hidden_dims, hidden_dims)
#         self.bn3 = nn.BatchNorm1d(hidden_dims)
#         self.layer4 = nn.Linear(hidden_dims, hidden_dims)
#         self.bn4 = nn.BatchNorm1d(hidden_dims)
#         self.layer5 = nn.Linear(hidden_dims, hidden_dims)
#         self.bn5 = nn.BatchNorm1d(hidden_dims)
#         self.layer6 = nn.Linear(hidden_dims, hidden_dims)
#         self.bn6 = nn.BatchNorm1d(hidden_dims)
#         self.layer7 = nn.Linear(hidden_dims, hidden_dims)
#         self.bn7 = nn.BatchNorm1d(hidden_dims)
#         self.output = nn.Linear(hidden_dims, output_dims)

#         # Initialize weights and biases
#         for module in self.modules():
#             if isinstance(module, nn.Linear):
#                 nn.init.normal_(module.weight, mean=0.0, std=0.05)
#                 nn.init.constant_(module.bias, 0.0)

#     def forward(self, x):
#         # Flatten the input
#         x = nn.Flatten()(x)

#         # Forward pass through each layer with BatchNorm and activation
#         x = self.bn1(self.layer1(x))
#         x = nn.Sigmoid()(x)

#         x = self.bn2(self.layer2(x))
#         x = nn.Sigmoid()(x)

#         x = self.bn3(self.layer3(x))
#         x = nn.Sigmoid()(x)

#         x = self.bn4(self.layer4(x))
#         x = nn.Sigmoid()(x)

#         x = self.bn5(self.layer5(x))
#         x = nn.Sigmoid()(x)

#         x = self.bn6(self.layer6(x))
#         x = nn.Sigmoid()(x)

#         x = self.bn7(self.layer7(x))
#         x = nn.Sigmoid()(x)

#         # Output layer
#         out = self.output(x)
#         return out

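# NOTE: error recorded while experimenting, presumably raised by the
# BatchNorm1d variant above (the failing op is native_batch_norm):
# RuntimeError: Expected all tensors to be on the same device, but found at
# least two devices, cuda:0 and cpu! (when checking argument for argument
# weight in method wrapper_CUDA__native_batch_norm)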

# class MyNormalization(nn.Module):
#     def __init__(self):
#         super(MyNormalization, self).__init__()

#     def forward(self, x):
#         # Calculate mean and standard deviation of the input
#         mean = torch.mean(x, dim=0, keepdim=True)  # Calculate along batch dimension
#         std = torch.std(x, dim=0, keepdim=True)
#         # Normalize the input
#         return (x - mean) / (std + 1e-5)  # Add epsilon for numerical stability

# class MLP(nn.Module):
#     def __init__(self, input_dims, hidden_dims, output_dims):
#         super(MLP, self).__init__()
#         self.hidden_dims = hidden_dims

#         # Define layers and normalization layers
#         self.layer1 = nn.Linear(input_dims, hidden_dims)
#         self.norm1 = MyNormalization()

#         self.layer2 = nn.Linear(hidden_dims, hidden_dims)
#         self.norm2 = MyNormalization()

        # self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        # self.norm3 = MyNormalization()

        # self.layer4 = nn.Linear(hidden_dims, hidden_dims)
        # self.norm4 = MyNormalization()

        # self.layer5 = nn.Linear(hidden_dims, hidden_dims)
        # self.norm5 = MyNormalization()

        # self.layer6 = nn.Linear(hidden_dims, hidden_dims)
        # self.norm6 = MyNormalization()

        # self.layer7 = nn.Linear(hidden_dims, hidden_dims)
        # self.norm7 = MyNormalization()

        # self.output = nn.Linear(hidden_dims, output_dims)

        # Initialize weights and biases
        # for module in self.modules():
        #     if isinstance(module, nn.Linear):
        #         nn.init.normal_(module.weight, mean=0.0, std=0.05)
        #         nn.init.constant_(module.bias, 0.0)

    # def forward(self, x):
    #     # Flatten the input
        # x = nn.Flatten()(x)

        # # Forward pass through layers with custom normalization and activation
        # x = self.norm1(self.layer1(x))
        # x = nn.Sigmoid()(x)

        # x = self.norm2(self.layer2(x))
        # x = nn.Sigmoid()(x)

        # x = self.norm3(self.layer3(x))
        # x = nn.Sigmoid()(x)

        # x = self.norm4(self.layer4(x))
        # x = nn.Sigmoid()(x)

        # x = self.norm5(self.layer5(x))
        # x = nn.Sigmoid()(x)

        # x = self.norm6(self.layer6(x))
        # x = nn.Sigmoid()(x)

        # x = self.norm7(self.layer7(x))
        # x = nn.Sigmoid()(x)

        # # Output layer
        # out = self.output(x)
        # return out

class MLP(nn.Module):
    """Fully connected network with sigmoid activations and two skip connections.

    The network has seven hidden ``nn.Linear`` layers of equal width followed
    by a linear output layer. The activation after ``layer1`` is added to the
    activation after ``layer3``, and the activation after ``layer4`` is added
    to the activation after ``layer7``.

    Args:
        input_dims: Number of features per sample after flattening.
        hidden_dims: Width of every hidden layer.
        output_dims: Number of output scores per sample.
    """

    def __init__(self, input_dims, hidden_dims, output_dims):
        super().__init__()

        # Attribute names and creation order are kept as-is so state_dict keys
        # and the sequence of random draws during construction do not change.
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.layer4 = nn.Linear(hidden_dims, hidden_dims)
        self.layer5 = nn.Linear(hidden_dims, hidden_dims)
        self.layer6 = nn.Linear(hidden_dims, hidden_dims)
        self.layer7 = nn.Linear(hidden_dims, hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)

        # Replace the default Linear initialisation: weights ~ N(0, 0.05^2), biases = 0.
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.05)
                nn.init.constant_(module.bias, 0.0)

    def forward(self, x):
        """Compute the output scores for a batch.

        Args:
            x: Batch tensor; every dimension after the first is flattened into
                a single feature axis before the first linear layer.

        Returns:
            The result of the final linear layer, with no activation applied.
        """
        # Functional ops are used instead of building nn.Flatten()/nn.Sigmoid()
        # module objects on every forward call; the arithmetic is the same.
        x = torch.flatten(x, start_dim=1)

        # Block 1: layer1 feeds the first skip connection.
        x = torch.sigmoid(self.layer1(x))
        skip = x
        x = torch.sigmoid(self.layer2(x))
        x = torch.sigmoid(self.layer3(x))
        x = skip + x

        # Block 2: layer4 feeds the second skip connection.
        x = torch.sigmoid(self.layer4(x))
        skip = x
        x = torch.sigmoid(self.layer5(x))
        x = torch.sigmoid(self.layer6(x))
        x = torch.sigmoid(self.layer7(x))
        x = skip + x

        return self.output(x)

# Hyper-parameters: flattened input width, hidden width, number of output
# scores, and the optimizer learning rate.
input_dims, hidden_dims, output_dims = 784, 128, 10
lr = 1e-3

# Build the network, then move its parameters to the selected device.
model = MLP(input_dims, hidden_dims, output_dims)
model = model.to(device)

# Cross-entropy loss on the model outputs, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
epochs = 100

# Per-epoch metric history, one entry appended per epoch.
train_loss_lst = []
train_acc_lst = []
val_loss_lst = []
val_acc_lst = []


def _run_epoch(loader, training):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` batches.
        training: When True the model is put in train mode and the optimizer
            is stepped after every batch; when False the model is put in eval
            mode and gradient tracking is disabled.

    Returns:
        Tuple of the per-sample mean loss and the fraction of correctly
        classified samples over the whole pass.
    """
    model.train(training)
    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            if training:
                loss.backward()
                optimizer.step()

            # The criterion yields a batch mean; weight it by the batch size so
            # a shorter final batch is not over-represented in the epoch
            # average and the loss uses the same denominator as the accuracy.
            n_batch = len(targets)
            loss_sum += loss.item() * n_batch
            correct += (torch.argmax(outputs, 1) == targets).sum().item()
            seen += n_batch

    return loss_sum / seen, correct / seen


# Training and Validation Loop
for epoch in range(epochs):
    train_loss, train_acc = _run_epoch(train_loader, training=True)
    train_loss_lst.append(train_loss)
    train_acc_lst.append(train_acc)

    val_loss, val_acc = _run_epoch(val_loader, training=False)
    val_loss_lst.append(val_loss)
    val_acc_lst.append(val_acc)

    # Log epoch metrics
    print(f"EPOCH {epoch+1}/{epochs}, "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Validation Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

EPOCH 1/100, Train Loss: 0.6637, Train Acc: 0.8021, Validation Loss: 0.5315, Val Acc: 0.8197
EPOCH 2/100, Train Loss: 0.4744, Train Acc: 0.8351, Validation Loss: 0.4790, Val Acc: 0.8335
EPOCH 3/100, Train Loss: 0.4471, Train Acc: 0.8431, Validation Loss: 0.4481, Val Acc: 0.8422
EPOCH 4/100, Train Loss: 0.4264, Train Acc: 0.8476, Validation Loss: 0.4410, Val Acc: 0.8427
EPOCH 5/100, Train Loss: 0.4082, Train Acc: 0.8544, Validation Loss: 0.4290, Val Acc: 0.8468
EPOCH 6/100, Train Loss: 0.3929, Train Acc: 0.8599, Validation Loss: 0.4170, Val Acc: 0.8557
EPOCH 7/100, Train Loss: 0.3864, Train Acc: 0.8618, Validation Loss: 0.4145, Val Acc: 0.8560


In [None]:
fig, ax = plt.subplots(2, 2, figsize=(12, 10))

# One entry per panel: (axes, series, line colour, y-axis label, title).
# Top row shows loss, bottom row shows accuracy; left is training, right is validation.
panels = [
    (ax[0, 0], train_loss_lst, 'green', 'Loss', 'Training Loss'),
    (ax[0, 1], val_loss_lst, 'orange', 'Loss', 'Validation Loss'),
    (ax[1, 0], train_acc_lst, 'green', 'Accuracy', 'Training Accuracy'),
    (ax[1, 1], val_acc_lst, 'orange', 'Accuracy', 'Validation Accuracy'),
]

for axis, series, colour, y_label, title in panels:
    axis.plot(series, color=colour)
    axis.set(xlabel='Epoch', ylabel=y_label)
    axis.set_title(title)

# Render the figure
plt.show()

In [None]:
# Evaluate on the held-out test set: eval mode, no gradient tracking.
model.eval()

logit_batches, label_batches = [], []
with torch.no_grad():
    for X_test, y_test in test_loader:
        X_test, y_test = X_test.to(device), y_test.to(device)

        # Keep per-batch model outputs and labels on the CPU for the final concat.
        logit_batches.append(model(X_test).cpu())
        label_batches.append(y_test.cpu())

# Stack the per-batch results into one tensor of model outputs and one of labels.
test_predict = torch.cat(logit_batches)
test_target = torch.cat(label_batches)

# Accuracy = fraction of samples whose highest-scoring class equals the label.
num_correct = (torch.argmax(test_predict, 1) == test_target).sum().item()
test_acc = num_correct / len(test_target)

print('Evaluation on test set:')
print(f'Accuracy: {test_acc:.4f}')