In [13]:
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision.datasets import FashionMNIST
from torch.utils.data import DataLoader, random_split

In [14]:
class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # First moment estimate
        self.v = None  # Second moment estimate
        self.t = 0     # Timestep

    def update(self, weights, gradients):
        if self.m is None:
            self.m = np.zeros_like(weights)
            self.v = np.zeros_like(weights)

        self.t += 1

        self.m = self.beta1 * self.m + (1 - self.beta1) * gradients
        self.v = self.beta2 * self.v + (1 - self.beta2) * (gradients ** 2)

        m_hat = self.m / (1 - self.beta1 ** self.t)
        v_hat = self.v / (1 - self.beta2 ** self.t)

        weights -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

        return weights


In [15]:
import numpy as np

class DenseLayer:
    def __init__(self, input_size, output_size,learning_rate):
        self.weights = np.random.randn(input_size, output_size) * 0.01
        self.bias = np.zeros((1, output_size))
        self.learning_rate = learning_rate
        self.weight_optimizer = AdamOptimizer(learning_rate)
        self.bias_optimizer = AdamOptimizer(learning_rate)
    
    def forward(self, X):
        self.input = X
        # Y = XW + b 
        return np.dot(X, self.weights) + self.bias
    
    def backward(self, grad_output):
        grad_input = np.dot(grad_output, self.weights.T)
        
        grad_weights = np.dot(self.input.T, grad_output)
        grad_bias = np.sum(grad_output, axis=0, keepdims=True)

        #update using adam optimizer
        self.weights = self.weight_optimizer.update(self.weights, grad_weights)
        self.bias = self.bias_optimizer.update(self.bias, grad_bias)

        # self.weights -= self.learning_rate * grad_weights
        # self.bias -= self.learning_rate * grad_bias


        
        return grad_input


In [16]:
import numpy as np

class BatchNormalization:
    def __init__(self, units,learning_rate, momentum=0.9, epsilon=1e-5):
        self.gamma = np.ones((1, units))
        self.beta = np.zeros((1, units))
        self.momentum = momentum
        self.epsilon = epsilon
        self.running_mean = None
        self.running_var = None
        self.learning_rate = learning_rate
        # initialize adam optimizer for gamma and beta
        self.gamma_optimizer = AdamOptimizer(learning_rate)
        self.beta_optimizer = AdamOptimizer(learning_rate)
    
    def forward(self, X, training=True):
        if self.running_mean is None:
            self.running_mean = np.mean(X, axis=0)
            self.running_var = np.var(X, axis=0)
        
        if training:
            # axis=0 means we are calculating mean and variance for each feature/units
            batch_mean = np.mean(X, axis=0)
            batch_var = np.var(X, axis=0)

            # X and batch_mean have shape (batch_size, units) and (1, units)
            # we need to broadcast batch_mean to shape (batch_size, units)
            # batch_var+epsilon has shape (1, units), we need to broadcast it to (batch_size, units)
            # self.normalized has shape (batch_size, units)
            self.normalized = (X - batch_mean) / np.sqrt(batch_var + self.epsilon)

            # self_gemma and self_beta have shape (1, units), we need to broadcast them to (batch_size, units)
            # gemma * normalized is element-wise multiplication, 
            # for example: [[1, 2], [3, 4]] * [[5, 6], [7, 8]] = [[5, 12], [21, 32]] 
            # self.out has shape (batch_size, units)
            self.out = self.gamma * self.normalized + self.beta

            # Update running mean and variance
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * batch_mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * batch_var
        else:
            # Use running mean and variance during inference
            self.normalized = (X - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
            self.out = self.gamma * self.normalized + self.beta

        return self.out

    def backward(self, grad_output):
        # Gradients for gamma and beta (trainable parameters)
        grad_gamma = np.sum(grad_output * self.normalized, axis=0, keepdims=True)
        grad_beta = np.sum(grad_output, axis=0, keepdims=True)

        # Compute grad_input before updating gamma and beta
        batch_size = grad_output.shape[0]
        grad_normalized = grad_output * self.gamma
        batch_var = np.var(self.normalized, axis=0, keepdims=True)
        grad_var = np.sum(grad_normalized * (self.normalized * -0.5) * (batch_var + self.epsilon) ** (-1.5), axis=0)
        grad_mean = np.sum(grad_normalized * -1 / np.sqrt(batch_var + self.epsilon), axis=0) + grad_var * np.mean(-2 * self.normalized, axis=0)

        grad_input = grad_normalized / np.sqrt(batch_var + self.epsilon) \
                     + grad_var * 2 * (self.normalized - np.mean(self.normalized, axis=0)) / batch_size \
                     + grad_mean / batch_size

        # Update gamma and beta using Adam optimizers
        self.gamma = self.gamma_optimizer.update(self.gamma, grad_gamma)
        self.beta = self.beta_optimizer.update(self.beta, grad_beta)

        return grad_input


    

In [17]:
class ReLU:
    def forward(self, X):
        self.input = X
        return np.maximum(0, X)
    
    def backward(self, grad_output):
        grad_input = grad_output.copy()
        grad_input[self.input <= 0] = 0  # Only propagate where input > 0
        return grad_input


In [18]:
class Dropout:
    def __init__(self, rate):
        self.rate = rate
    
    def forward(self, X, training=True):
        if training:
            # scale by 1/(1-rate) to ensure the expected value of X remains the same
            # self.mask has the same shape as X and each element is 0 with probability rate or 1 with probability 1-rate 
            self.mask = (np.random.rand(*X.shape) > self.rate) / (1 - self.rate)
            # apply the mask to X , element-wise multiplication
            return X * self.mask
        return X
    
    def backward(self, grad_output):
        return grad_output * self.mask



In [19]:
class Softmax:
    def forward(self, X):
        exps = np.exp(X - np.max(X, axis=1, keepdims=True))
        return exps / np.sum(exps, axis=1, keepdims=True)
    

In [20]:

class NeuralNetwork:
    def __init__(self, input_dim, hidden_dims, output_dim,learning_rate):
        self.layers = []
        
        # Input to hidden layers
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            self.layers.append(DenseLayer(prev_dim, hidden_dim,learning_rate))
            self.layers.append(BatchNormalization(hidden_dim,learning_rate))
            self.layers.append(ReLU())
            self.layers.append(Dropout(rate=0.4))
            prev_dim = hidden_dim
        
        # Hidden to output layer
        self.layers.append(DenseLayer(prev_dim, output_dim,learning_rate))
        self.softmax = Softmax()
    
    def forward(self, X, training=True):
        for layer in self.layers:
            if isinstance(layer, (Dropout, BatchNormalization)):
                X = layer.forward(X, training)
            else:
                X = layer.forward(X)
        return self.softmax.forward(X)
    
    def backward(self, grad_output):
        for layer in reversed(self.layers):
            grad_output = layer.backward(grad_output)




In [21]:
def one_hot_encode(labels, num_classes):
    return np.eye(num_classes)[labels]


In [22]:


def train(model, data_loader):
    for batch, (images, labels) in enumerate(data_loader):
        images = images.view(images.size(0), -1).numpy()  # Flatten 28x28 images
        labels = labels.numpy()

        # Forward pass
        predictions = model.forward(images, training=True)

        # Assuming `num_classes` is 10 for FashionMNIST
        one_hot_labels = one_hot_encode(labels, num_classes=10)

        
        # Calculate loss (e.g., cross-entropy)
        # Compute the cross-entropy loss
        loss = -np.sum(one_hot_labels * np.log(predictions + 1e-9)) / len(labels)


        # Compute gradients and backpropagate
        grad_output = predictions - one_hot_labels  # Gradient for softmax + cross-entropy
        model.backward(grad_output)
        
        # print(f"Batch {batch}, Loss: {loss}")

def evaluate(model, data_loader):
    # TP, TN, FP, FN for each class separately

    tp = np.zeros(10)
    fp = np.zeros(10)
    fn = np.zeros(10)
    total = 0
    correct = 0
    for images, labels in data_loader:
        images = images.view(images.size(0), -1).numpy()
        labels = labels.numpy() 

        # Forward pass
        outputs = model.forward(images, training=False)

        predictions = np.argmax(outputs, axis=1) # Predicted class index 

        
        # Update confusion matrix
        for i in range(len(labels)):
            if predictions[i] == labels[i]:
                tp[labels[i]] += 1 # predictions[i] is the predicted class, and true class is same. so true positive
                correct += 1
            else:
                fp[predictions[i]] += 1 # predictions[i] is the predicted class, but true class is different. so false positive
                fn[labels[i]] += 1  # labels[i] is the true class, but predicted false by the model. so false negative
            total += 1
    
    # precision = tp / (tp + fp)
    # recall = tp / (tp + fn)
    # f1 = 2 * precision * recall / (precision + recall)

    precisions = np.zeros(10)
    recalls = np.zeros(10)
    f1_scores = np.zeros(10)
    for i in range(10):
        precisions[i] = tp[i] / (tp[i] + fp[i])
        recalls[i] = tp[i] / (tp[i] + fn[i])
        f1_scores[i] = 2 * precisions[i] * recalls[i] / (precisions[i] + recalls[i])
    
    macro_f1 = np.mean(f1_scores)
    total_tp = np.sum(tp)
    total_fp = np.sum(fp)
    total_fn = np.sum(fn)

    return macro_f1, total_tp, total_fp, total_fn



**LOAD THE DATASET**

In [23]:


# Define transforms for data normalization and augmentation if desired
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # Normalize grayscale channel
])

# Download and load the FashionMNIST dataset
train_data = FashionMNIST(root='data', train=True, transform=transform, download=True)
test_data = FashionMNIST(root='data', train=False, transform=transform, download=True)

# Split training data into train and validation sets
train_size = int(0.8 * len(train_data))
val_size = len(train_data) - train_size
train_dataset, val_dataset = random_split(train_data, [train_size, val_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)


In [24]:
# Model parameters
input_dim = 28 * 28  # Flattened input size (28x28 pixels)
hidden_dims = [128, 64]  # Example hidden layers
output_dim = 10  # FashionMNIST has 10 classes


learning_rates = [0.003, 0.002, 0.001, 0.0005]

for learning_rate in learning_rates:
    print(f"========================================Learning Rate: {learning_rate}=======================================")
    # Initialize model
    model = NeuralNetwork(input_dim, hidden_dims, output_dim,learning_rate)
    epochs = 25
    best_macro_f1 = 0
    best_total_tp = 0
    best_total_fp = 0
    best_total_fn = 0
    for epoch in range(epochs):
        # print(f"Epoch {epoch+1}/{epochs}:-----------------")
        train(model, train_loader)

        
        
        macro_f1, total_tp, total_fp, total_fn = evaluate(model, val_loader)

        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_total_tp = total_tp
            best_total_fp = total_fp
            best_total_fn = total_fn
    print("Best Validation Macro F1=========: ", best_macro_f1)

    print("Testing on test data=======================:")
    macro_f1, total_tp, total_fp, total_fn = evaluate(model, test_loader)
    print(f"Macro F1: {macro_f1}")




Macro F1: 0.8836733216071293
Macro F1: 0.8842963230648312
Macro F1: 0.8845125176626931
Macro F1: 0.8804929739135339
