In [15]:
import numpy as np
from keras.datasets import mnist
from keras.utils import to_categorical
import matplotlib.pyplot as plt

import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import SGD


#### Equations for Forward Propagation:

1. **Input to Hidden Layer:**
   $$ Z^{[1]} = X \cdot W^{[1]} + b^{[1]} $$
   $$ A^{[1]} = \text{ReLU}(Z^{[1]}) $$

2. **Hidden Layer to Output:**
   $$ Z^{[2]} = A^{[1]} \cdot W^{[2]} + b^{[2]} $$
   $$ A^{[2]} = \text{softmax}(Z^{[2]}) $$

#### Equations for Backward Propagation:

1. **Output Layer:**
   $$ dZ^{[2]} = A^{[2]} - Y $$
   $$ dW^{[2]} = \frac{1}{m} \cdot A^{[1]T} \cdot dZ^{[2]} $$
   $$ db^{[2]} = \frac{1}{m} \cdot \text{np.sum}(dZ^{[2]}, axis=0, keepdims=True) $$

2. **Hidden Layer:**
   $$ dZ^{[1]} = (dZ^{[2]} \cdot W^{[2]T}) * \text{ReLU}'(Z^{[1]}) $$
   $$ dW^{[1]} = \frac{1}{m} \cdot X^T \cdot dZ^{[1]} $$
   $$ db^{[1]} = \frac{1}{m} \cdot \text{np.sum}(dZ^{[1]}, axis=0, keepdims=True) $$

#### Parameter Update (Gradient Descent):
   $$ W^{[l]} = W^{[l]} - \alpha \cdot dW^{[l]} $$
   $$ b^{[l]} = b^{[l]} - \alpha \cdot db^{[l]} $$


#### Scratch Implementation 

In [18]:
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        # Initialize weights and biases
        self.W1 = np.random.randn(self.input_size, self.hidden_size)
        self.b1 = np.zeros((1, self.hidden_size))
        self.W2 = np.random.randn(self.hidden_size, self.output_size)
        self.b2 = np.zeros((1, self.output_size))
    
    def relu(self, x):
        return np.maximum(0, x)
    
    def softmax(self, x):
        exp_scores = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    
    def forward_propagation(self, X):
        # Input to Hidden Layer
        self.Z1 = np.dot(X, self.W1) + self.b1
        self.A1 = self.relu(self.Z1)
        
        # Hidden Layer to Output
        self.Z2 = np.dot(self.A1, self.W2) + self.b2
        self.A2 = self.softmax(self.Z2)
    
    def backward_propagation(self, X, y):
        m = X.shape[0]
        
        # Output Layer
        dZ2 = self.A2 - y
        dW2 = (1 / m) * np.dot(self.A1.T, dZ2)
        db2 = (1 / m) * np.sum(dZ2, axis=0, keepdims=True)
        
        # Hidden Layer
        dZ1 = np.dot(dZ2, self.W2.T) * (self.Z1 > 0).astype(float)
        dW1 = (1 / m) * np.dot(X.T, dZ1)
        db1 = (1 / m) * np.sum(dZ1, axis=0, keepdims=True)
        
        # Update weights and biases
        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1
        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2
    
    def train(self, X, y, epochs=200):
        for epoch in range(epochs):
            # Forward propagation
            self.forward_propagation(X)
            
            # Backward propagation
            self.backward_propagation(X, y)
            
            # Compute loss for the current epoch
            loss = self.compute_loss(X, y)
            
            # Print loss for the current epoch
            print(f'Epoch {epoch}: Loss {loss:.4f}')
    
    def compute_loss(self, X, y):
        m = X.shape[0]
        y_pred = np.argmax(self.A2, axis=1)  # Convert one-hot encoded predictions to integer labels
        log_probs = -np.log(self.A2[range(m), y_pred])  # Use y_pred instead of y
        loss = np.sum(log_probs) / m
        return loss
    
    def predict(self, X):
        self.forward_propagation(X)
        return np.argmax(self.A2, axis=1)

In [19]:
# Load MNIST dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten images
X_train_flat = X_train.reshape(X_train.shape[0], -1) / 255.0
X_test_flat = X_test.reshape(X_test.shape[0], -1) / 255.0

# One-hot encode labels
y_train_onehot = np.eye(10)[y_train]
y_test_onehot = np.eye(10)[y_test]

# Define neural network parameters
input_size = X_train_flat.shape[1]
hidden_size = 128
output_size = 10
learning_rate = 0.01

# Create and train the neural network
model = NeuralNetwork(input_size, hidden_size, output_size, learning_rate)
model.train(X_train_flat, y_train_onehot, epochs=200)

# Predictions on test set
predictions = model.predict(X_test_flat)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set:", accuracy)

Epoch 0: Loss 0.0122
Epoch 1: Loss 0.0161
Epoch 2: Loss 0.0184
Epoch 3: Loss 0.0219
Epoch 4: Loss 0.0239
Epoch 5: Loss 0.0248
Epoch 6: Loss 0.0264
Epoch 7: Loss 0.0271
Epoch 8: Loss 0.0274
Epoch 9: Loss 0.0277
Epoch 10: Loss 0.0284
Epoch 11: Loss 0.0289
Epoch 12: Loss 0.0299
Epoch 13: Loss 0.0303
Epoch 14: Loss 0.0301
Epoch 15: Loss 0.0310
Epoch 16: Loss 0.0314
Epoch 17: Loss 0.0311
Epoch 18: Loss 0.0313
Epoch 19: Loss 0.0312
Epoch 20: Loss 0.0317
Epoch 21: Loss 0.0314
Epoch 22: Loss 0.0318
Epoch 23: Loss 0.0321
Epoch 24: Loss 0.0319
Epoch 25: Loss 0.0316
Epoch 26: Loss 0.0317
Epoch 27: Loss 0.0318
Epoch 28: Loss 0.0321
Epoch 29: Loss 0.0319
Epoch 30: Loss 0.0320
Epoch 31: Loss 0.0323
Epoch 32: Loss 0.0328
Epoch 33: Loss 0.0329
Epoch 34: Loss 0.0328
Epoch 35: Loss 0.0328
Epoch 36: Loss 0.0326
Epoch 37: Loss 0.0324
Epoch 38: Loss 0.0327
Epoch 39: Loss 0.0326
Epoch 40: Loss 0.0326
Epoch 41: Loss 0.0324
Epoch 42: Loss 0.0320
Epoch 43: Loss 0.0315
Epoch 44: Loss 0.0314
Epoch 45: Loss 0.031

#### Keras Implementation

In [10]:

# Load MNIST dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Normalize pixel values
X_train = X_train / 255.0
X_test = X_test / 255.0

# Reshape images to flatten them
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# One-hot encode labels
y_train_onehot = to_categorical(y_train)
y_test_onehot = to_categorical(y_test)

# Define neural network parameters
input_size = X_train_flat.shape[1]
hidden_size = 128
output_size = 10
learning_rate = 0.01
epochs = 200

# Create the neural network model
model = Sequential([
    Flatten(input_shape=X_train_flat.shape[1:]),
    Dense(hidden_size, activation='relu'),
    Dense(output_size, activation='softmax')
])

# Compile the model with SGD optimizer
optimizer = SGD(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_flat, y_train_onehot, epochs=epochs, batch_size=32, verbose=1, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_flat, y_test_onehot)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)




Epoch 1/200


  super().__init__(**kwargs)


[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 851us/step - accuracy: 0.7022 - loss: 1.1323 - val_accuracy: 0.8997 - val_loss: 0.3760
Epoch 2/200
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 938us/step - accuracy: 0.8947 - loss: 0.3846 - val_accuracy: 0.9154 - val_loss: 0.3077
Epoch 3/200
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 814us/step - accuracy: 0.9125 - loss: 0.3162 - val_accuracy: 0.9230 - val_loss: 0.2760
Epoch 4/200
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 818us/step - accuracy: 0.9213 - loss: 0.2862 - val_accuracy: 0.9293 - val_loss: 0.2545
Epoch 5/200
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 816us/step - accuracy: 0.9285 - loss: 0.2598 - val_accuracy: 0.9335 - val_loss: 0.2373
Epoch 6/200
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 828us/step - accuracy: 0.9338 - loss: 0.2433 - val_accuracy: 0.9395 - val_loss: 0.2238
Epoch 7/20

### The Keras Implementation Turned out to be better because of the following

- Optimized backend libraries
- Pre-Build Layers
- Automatic Differentiation
- Built-in Optimization
- Better Starting weights

### Reasons Why the Scratch implementation didnt perform well 

- Hyperparameter Tuning
- Limited Optimization
- Limited Regularization Techniques