In [None]:
# Using AI as well

# Improved Neural Network (from scratch) 
This notebook builds on the basic version and adds:

- ReLU activation for hidden layers
- Xavier (Glorot) initialization
- Mini-batch gradient descent
- Adam optimizer implemented from scratch
- Options for multiple hidden layers

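The network itself still uses NumPy only (no TensorFlow/PyTorch); scikit-learn is used just to download MNIST, split it into train/test sets, and report evaluation metrics.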

In [None]:
# Imports
import time

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG so that the weight initialisation (np.random.uniform)
# and the mini-batch shuffling (np.random.permutation) used later in this
# notebook are reproducible after Restart Kernel -> Run All.
np.random.seed(1)


In [None]:
# Load and preprocess MNIST
num_classes = 10

# Download the dataset as plain NumPy arrays; scale pixels by 1/255 and turn the
# string labels into integers.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist['data'].astype(np.float32) / 255.0
y = mnist['target'].astype(np.int64)

# Hold out 10,000 examples for testing. After the transpose the feature
# matrices have one example per column.
_split = train_test_split(X, y, test_size=10000, random_state=1)
X_train, X_test = _split[0].T, _split[1].T
y_train, y_test = _split[2], _split[3]
m_train = X_train.shape[1]

# One-hot encode the labels by indexing rows of the identity matrix, then
# transpose so each column is the one-hot vector of one example.
_identity = np.eye(num_classes)
Y_train = _identity[y_train].T
Y_test = _identity[y_test].T


In [None]:
# Activations and helpers

def relu(Z):
    """Element-wise ReLU: entries below zero become 0, the rest pass through."""
    rectified = np.maximum(0, Z)
    return rectified

def relu_derivative(Z):
    """Return a float array that is 1.0 where Z > 0 and 0.0 elsewhere (including Z == 0)."""
    positive_mask = Z > 0
    return positive_mask.astype(np.float64)

def softmax(Z):
    """Softmax along axis 0.

    The maximum along axis 0 is subtracted before exponentiating so that large
    scores do not overflow; the result is normalised to sum to 1 along axis 0.
    """
    exp_scores = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    normaliser = exp_scores.sum(axis=0, keepdims=True)
    return exp_scores / normaliser

def compute_loss(Y, A):
    """Cross-entropy between targets Y and predictions A, averaged over Y.shape[1].

    A is clipped to [1e-12, 1 - 1e-12] first so that log never sees 0.
    """
    num_examples = Y.shape[1]
    safe_probs = np.clip(A, 1e-12, 1.0 - 1e-12)
    scale = 1.0 / num_examples
    return -scale * np.sum(Y * np.log(safe_probs))

# Xavier init
def xavier_init(size_in, size_out):
    """Draw a (size_out, size_in) weight matrix from U(-b, b), b = sqrt(6 / (size_in + size_out)).

    Uses NumPy's global RNG, so results depend on the current np.random seed/state.
    """
    limit = np.sqrt(6.0 / (size_in + size_out))
    shape = (size_out, size_in)
    return np.random.uniform(-limit, limit, shape)


In [None]:
# Adam optimizer helper

def init_adam(params):
    """Create zero-filled Adam state for every entry of `params`.

    Returns two dicts with the same keys as `params`: the first-moment
    estimates and the second-moment estimates, each array shaped like the
    corresponding parameter.
    """
    first_moment = {}
    second_moment = {}
    for name, value in params.items():
        first_moment[name] = np.zeros_like(value)
        second_moment[name] = np.zeros_like(value)
    return first_moment, second_moment

def adam_update(params, grads, v, s, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    """Apply one Adam step to every entry of `params`.

    Args:
        params: dict of parameter arrays; entries are replaced with updated arrays.
        grads: dict with a gradient for every key of `params`.
        v: dict of first-moment (mean of gradients) estimates, updated in place.
        s: dict of second-moment (mean of squared gradients) estimates, updated in place.
        t: 1-based step counter used for bias correction.
        lr: learning rate.
        beta1: decay rate of the first-moment estimate.
        beta2: decay rate of the second-moment estimate.
        eps: small constant added to the denominator to avoid division by zero.

    Returns:
        The same `(params, v, s)` dict objects that were passed in, after the update.

    Raises:
        ValueError: if `t` is smaller than 1. With t == 0 the bias-correction
            denominators `1 - beta ** t` are exactly zero, which would silently
            turn the parameters into inf/NaN.
    """
    if t < 1:
        raise ValueError(f"Adam step counter t must be >= 1, got {t!r}")

    # The bias-correction denominators depend only on t, so compute them once.
    v_correction = 1 - beta1 ** t
    s_correction = 1 - beta2 ** t

    for key in params:
        grad = grads[key]
        v[key] = beta1 * v[key] + (1 - beta1) * grad
        s[key] = beta2 * s[key] + (1 - beta2) * (grad ** 2)

        v_hat = v[key] / v_correction
        s_hat = s[key] / s_correction

        params[key] = params[key] - lr * v_hat / (np.sqrt(s_hat) + eps)
    return params, v, s


In [None]:
# Model architecture (2 hidden layers example)
n_x = X_train.shape[0]
layer_sizes = [n_x, 128, 64, num_classes]  # input, hidden1, hidden2, output

# One weight matrix (Xavier-initialised) and one zero bias column per pair of
# consecutive layer sizes, stored as 'W1', 'b1', 'W2', 'b2', ...
params = {}
for layer, (fan_in, fan_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:]), start=1):
    params[f'W{layer}'] = xavier_init(fan_in, fan_out)
    params[f'b{layer}'] = np.zeros((fan_out, 1))

# Adam state
v, s = init_adam(params)

# Training hyperparams
learning_rate = 0.001
num_epochs = 20
batch_size = 256
m = m_train

# Helper to get minibatches
def get_mini_batches(X, Y, batch_size):
    """Shuffle the columns of X and Y together and cut them into mini-batches.

    Args:
        X: 2-D array indexed as X[:, j] per example.
        Y: 2-D array with the same number of columns as X.
        batch_size: positive number of columns per batch.

    Returns:
        A list of `(X_batch, Y_batch)` tuples. Every batch has `batch_size`
        columns except possibly the last, which holds the remainder. The list
        is empty when X has no columns.

    Raises:
        ValueError: if `batch_size` is not positive, or if X and Y have a
            different number of columns.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    m = X.shape[1]
    if Y.shape[1] != m:
        raise ValueError(
            f"X and Y must have the same number of columns, got {m} and {Y.shape[1]}"
        )

    # The same permutation is applied to both arrays so columns stay paired.
    permutation = np.random.permutation(m)
    X_shuffled = X[:, permutation]
    Y_shuffled = Y[:, permutation]

    # Stepping by batch_size covers the full batches and the shorter tail in one
    # pass, because slicing past the end simply stops at the last column.
    return [
        (X_shuffled[:, begin:begin + batch_size], Y_shuffled[:, begin:begin + batch_size])
        for begin in range(0, m, batch_size)
    ]


In [None]:
# Training loop with mini-batch SGD + Adam
#
# The forward and backward passes iterate over however many (W, b) pairs
# `params` holds, so the loop follows the architecture cell instead of assuming
# exactly three layers. Hidden layers use ReLU; the last layer uses softmax
# with cross-entropy loss.
num_layers = len(params) // 2  # params holds one 'W<l>' and one 'b<l>' per layer

# Start from fresh Adam moments so they agree with the step counter `t`, which
# is reset to 1 every time this cell runs.
v, s = init_adam(params)

start = time.perf_counter()
t = 1  # Adam step counter; 1-based so the bias-correction terms are never zero
for epoch in range(num_epochs):
    mini_batches = get_mini_batches(X_train, Y_train, batch_size)
    epoch_loss = 0.0
    for X_batch, Y_batch in mini_batches:
        m_b = X_batch.shape[1]

        # Forward: remember each layer's output and pre-activation for backprop.
        activations = [X_batch]  # activations[l] is the output of layer l (0 = input)
        pre_activations = []     # pre_activations[l - 1] is Z of layer l
        for layer in range(1, num_layers + 1):
            Z = params[f'W{layer}'].dot(activations[-1]) + params[f'b{layer}']
            pre_activations.append(Z)
            activations.append(softmax(Z) if layer == num_layers else relu(Z))

        # compute_loss returns the batch mean; weight it by the batch size so the
        # epoch average stays correct when the last batch is smaller.
        epoch_loss += compute_loss(Y_batch, activations[-1]) * m_b

        # Backprop, from the output layer down to the first layer.
        grads = {}
        dZ = activations[-1] - Y_batch  # gradient of softmax + cross-entropy w.r.t. Z
        for layer in range(num_layers, 0, -1):
            grads[f'W{layer}'] = (1.0 / m_b) * dZ.dot(activations[layer - 1].T)
            grads[f'b{layer}'] = (1.0 / m_b) * np.sum(dZ, axis=1, keepdims=True)
            if layer > 1:
                # Push the error through this layer's weights, then through the
                # ReLU of the layer below.
                dA_prev = params[f'W{layer}'].T.dot(dZ)
                dZ = dA_prev * relu_derivative(pre_activations[layer - 2])

        # Update params with Adam
        params, v, s = adam_update(params, grads, v, s, t, lr=learning_rate)
        t += 1

    epoch_loss /= m_train
    print(f"Epoch {epoch+1}/{num_epochs}, loss = {epoch_loss:.4f}")

end = time.perf_counter()
print('Training time (s):', end-start)


In [None]:
# Evaluation
#
# Run the trained network on the whole test set. The forward pass walks over
# every (W, b) pair stored in `params`, so it matches whatever depth the model
# was built with: ReLU on the hidden layers, softmax on the last layer.
num_layers = len(params) // 2  # params holds one 'W<l>' and one 'b<l>' per layer

A_test = X_test
for layer in range(1, num_layers + 1):
    Z_test = params[f'W{layer}'].dot(A_test) + params[f'b{layer}']
    A_test = softmax(Z_test) if layer == num_layers else relu(Z_test)

# Predicted class = row index of the largest probability in each column.
preds = np.argmax(A_test, axis=0)
true = y_test

print('Confusion matrix:')
print(confusion_matrix(true, preds))
print('\nClassification report:')
print(classification_report(true, preds))
