In [14]:
from keras.datasets import fashion_mnist
import numpy as np
import matplotlib.pyplot as plt
import wandb

In [15]:
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# wandb.login() with no arguments resolves the key from the WANDB_API_KEY environment
# variable or a previously stored login, and otherwise prompts for it.
# NOTE(review): the key that used to be hard-coded here is exposed in the notebook
# history and should be revoked and regenerated.
wandb.login()
wandb.init(project="CS23S025-Assignment-1-DA6401-DL", entity="cs23s025-indian-institute-of-technology-madras")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/kanchan/.netrc


In [16]:
# Load the Fashion-MNIST train and test splits; each split unpacks into an
# (images, labels) pair. The arrays are raw (not yet normalised) at this point.
(X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data()

 1. Fashion-MNIST images are 28x28, so input size is 784. The output is 10 classes.

In [None]:
# Class names for the ten Fashion-MNIST labels; the list index is the class id.
categories = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
              'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# One slot per class, filled with the first training image found for that class.
sample_images = [None for _ in categories]

for img, lbl in zip(X_train, Y_train):
    if sample_images[lbl] is None:
        sample_images[lbl] = img

    # Stop scanning as soon as every class has an example.
    if all(slot is not None for slot in sample_images):
        break

# Log the collected examples to W&B as captioned images, then finish the run.
wandb_images = [
    wandb.Image(picture.astype(np.uint8), caption=class_name)
    for picture, class_name in zip(sample_images, categories)
]
wandb.log({"Q1_sampleImageForEachClass": wandb_images})
wandb.finish()

In [19]:
# fig, axes = plt.subplots(2, 5, figsize=(10, 5))
# for ax, img, lbl in zip(axes.flat, sample_images, categories):
#     ax.imshow(img, cmap='gray')
#     ax.set_title(lbl)
#     ax.axis('off')

# plt.tight_layout()
# plt.show()


Next, we need to normalise the data.
Why normalise?
1. Faster convergence.
2. Helps avoid vanishing gradients (large inputs saturate sigmoid/tanh).
3. Keeps the inputs in a range that suits the activation functions.
4. Improves model performance.

We did not normalise above because we were only visualising the images; whenever the dataset is used for training, normalisation becomes essential for good performance.

Why do we divide by 255.0 for normalisation?
-> Fashion-MNIST consists of grayscale images with pixel values ranging from 0 to 255. Dividing by 255 scales all values to lie between 0 and 1.

In [20]:

# Scale raw integer pixel intensities (0-255) to floats in [0, 1].
# The dtype guard makes this cell idempotent: after the first run the arrays are
# floating point, so re-running the cell no longer divides by 255 a second time.
if X_train.dtype.kind in "iu":
    X_train = X_train / 255.0
if X_test.dtype.kind in "iu":
    X_test = X_test / 255.0


Activation Functions and their corresponding Derivatives

In [None]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def dRelu(x):
    """Derivative of ReLU: 1 where ``x`` is strictly positive, 0 elsewhere (including at 0)."""
    isPositive = x > 0
    return np.where(isPositive, 1, 0)

def tanh(x):
    """Hyperbolic tangent activation, applied element-wise via ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def dTanh(x):
    """Derivative of tanh with respect to its input: ``1 - tanh(x)^2``."""
    t = np.tanh(x)
    return 1 - t ** 2

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise.

    Evaluated as ``exp(-log(1 + exp(-x)))`` using ``np.logaddexp`` so that
    large negative inputs do not overflow in ``exp`` (the direct formula
    computes ``exp(-x)``, which overflows once ``-x`` is large).
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), computed stably.
    return np.exp(-np.logaddexp(0, -x))

def dSigmoid(x):
    """Derivative of the sigmoid with respect to its input: ``sigmoid(x) * (1 - sigmoid(x))``."""
    activated = sigmoid(x)
    complement = 1 - activated
    return activated * complement

def softmax(x):
    """Softmax over all elements of ``x``: ``exp(x) / sum(exp(x))``.

    The maximum of ``x`` is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``exp`` from overflowing to
    ``inf`` (and the output from becoming ``nan``) when the logits are large.
    Normalisation is over every element of ``x``, as before.
    """
    shifted = x - np.max(x)
    expX = np.exp(shifted)
    return expX / np.sum(expX)

In [None]:
class Layer:
    """A fully connected layer: weights, bias, activation pair and cached pass results.

    Parameters
    ----------
    inputSize : int
        Number of inputs feeding the layer.
    neuronCount : int
        Number of neurons (outputs) in the layer.
    activation : str
        Key into ``activationFunc`` ('tanh', 'sigmoid', 'relu' or 'softmax').
    rng : object with a ``normal(loc, scale, size=...)`` method, optional
        Random generator used to draw the initial weights. Defaults to a
        generator shared by all ``Layer`` instances and seeded with 33. Pass
        your own seeded generator to make one network's initialisation
        reproducible independently of other networks.

    Raises
    ------
    ValueError
        If ``activation`` is not a key of ``activationFunc``.
    """
    # Maps an activation name to its (function, derivative) pair.
    activationFunc = {
        'tanh': (tanh, dTanh),
        'sigmoid': (sigmoid, dSigmoid),
        'relu': (relu, dRelu),
        # Derivative for softmax is handled with cross-entropy loss
        'softmax': (softmax, None)
    }

    # Seeded once and shared across instances. Reseeding the global NumPy RNG inside
    # __init__ gave every layer of the same shape identical weights and also reset
    # the random state of any other code using np.random.
    _rng = np.random.RandomState(33)

    def __init__(self, inputSize, neuronCount, activation, rng=None):
        if activation not in self.activationFunc:
            raise ValueError(
                f"Unknown activation {activation!r}; expected one of {sorted(self.activationFunc)}"
            )
        if rng is None:
            rng = self._rng

        # Xavier initialization for weights to help with gradient flow
        sd = np.sqrt(2 / float(inputSize + neuronCount))
        self.w = rng.normal(0, sd, size=(neuronCount, inputSize))
        self.b = np.zeros((neuronCount, 1))
        self.act, self.dAct = self.activationFunc[activation]

        # Placeholders for forward pass outputs
        self.a = None  # Pre-activation
        self.h = None  # Activation

        # Placeholders for gradients computed during backpropagation
        self.da = None  # Gradient w.r.t. pre-activation
        self.dW = None  # Gradient for weights
        self.db = None  # Gradient for bias
        self.dh = None  # Gradient for activation from the next layer

In [None]:
def forwardPropagation(inputData, layers):
    """
    Run one forward pass and cache each layer's pre-activation and activation.

      inputData: numpy array of shape (inputDim, 1)
      layers: list of Layer objects ordered from the first hidden to the output layer
      Returns: Output probabilities from the final layer

    Every layer stores its pre-activation in ``.a`` and its output in ``.h``.
    The last layer always uses softmax, whatever activation it was built with.
    A network consisting of only an output layer is supported.

    Raises ValueError if ``layers`` is empty.
    """
    if not layers:
        raise ValueError("forwardPropagation requires at least one layer")

    lastIndex = len(layers) - 1
    layerInput = inputData
    for index, layer in enumerate(layers):
        layer.a = np.dot(layer.w, layerInput) + layer.b
        # Output layer: softmax turns the scores into class probabilities.
        if index == lastIndex:
            layer.h = softmax(layer.a)
        else:
            layer.h = layer.act(layer.a)
        # This layer's output feeds the next layer.
        layerInput = layer.h

    return layerInput

In [None]:
def backwardPropagation(trueLabel, yHat, layers, inputData):
    """
    Compute the gradients for every layer for a single sample.

    trueLabel: the true class index (integer)
    yHat: predicted output from forwardPropagation (probabilities)
    layers: list of Layer objects whose .a and .h hold the forward-pass values
    inputData: original input data (needed for the first layer's weight gradient)

    Returns: the same layers list, with da, dW and db set on each layer
    (and dh on every layer that has a successor).
    """
    # Target distribution: zeros everywhere except a 1 at the true class.
    target = np.zeros(yHat.shape)
    target[trueLabel] = 1

    # With softmax output and cross-entropy loss, the gradient w.r.t. the output
    # pre-activation is simply (prediction - target).
    layers[-1].da = yHat - target

    # Walk from the output layer back to the first layer.
    for idx in reversed(range(len(layers))):
        current = layers[idx]

        # What this layer saw as input: the previous layer's activation,
        # or the raw input for the first layer.
        if idx == 0:
            layerInput = inputData
        else:
            layerInput = layers[idx - 1].h

        current.dW = np.dot(current.da, layerInput.T)
        current.db = current.da

        if idx > 0:
            below = layers[idx - 1]
            # Push the gradient through this layer's weights ...
            below.dh = np.dot(current.w.T, current.da)
            # ... and through the previous layer's activation, when it has a derivative.
            if below.dAct is not None:
                below.da = below.dh * below.dAct(below.a)

    return layers