In [1]:
import numpy as np
import tensorflow as tf

# Load MNIST and preprocess
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(x_train.shape[0], -1).T / 255.0  # Shape (784, 60000)
x_test = x_test.reshape(x_test.shape[0], -1).T / 255.0

def init_params():
    w1 = np.random.randn(10, 784) * 0.01
    b1 = np.zeros((10, 1))
    w2 = np.random.randn(10, 10) * 0.01
    b2 = np.zeros((10, 1))
    w3 = np.random.randn(10, 10) * 0.01
    b3 = np.zeros((10, 1))
    return w1, b1, w2, b2, w3, b3

def ReLU(Z):
    return np.maximum(0, Z)

def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    return expZ / expZ.sum(axis=0, keepdims=True)

def forward_prop(w1, b1, w2, b2, w3, b3, x):
    z1 = w1 @ x + b1
    a1 = ReLU(z1)
    z2 = w2 @ a1 + b2
    a2 = ReLU(z2)
    z3 = w3 @ a2 + b3
    a3 = softmax(z3)
    return z1, a1, z2, a2, z3, a3

def dReLU(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((10, Y.size))
    one_hot_Y[Y, np.arange(Y.size)] = 1
    return one_hot_Y

def backward_prop(X, Y, A1, A2, A3, W2, W3, Z1, Z2, m):
    one_hot_Y = one_hot(Y)
    dZ3 = A3 - one_hot_Y
    dW3 = 1/m * dZ3 @ A2.T
    db3 = 1/m * np.sum(dZ3, axis=1, keepdims=True)
    dZ2 = W3.T @ dZ3 * dReLU(Z2)
    dW2 = 1/m * dZ2 @ A1.T
    db2 = 1/m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = W2.T @ dZ2 * dReLU(Z1)
    dW1 = 1/m * dZ1 @ X.T
    db1 = 1/m * np.sum(dZ1, axis=1, keepdims=True)
    return dW1, db1, dW2, db2, dW3, db3

def update_params(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha):
    w1 -= alpha * dw1
    b1 -= alpha * db1
    w2 -= alpha * dw2
    b2 -= alpha * db2
    w3 -= alpha * dw3
    b3 -= alpha * db3
    return w1, b1, w2, b2, w3, b3

def get_predictions(A3):
    return np.argmax(A3, axis=0)

def get_accuracy(predictions, Y):
    return np.mean(predictions == Y)

def gradient_descent(x, y, alpha, iterations):
    w1, b1, w2, b2, w3, b3 = init_params()
    m = x.shape[1]
    for i in range(iterations):
        z1, a1, z2, a2, z3, a3 = forward_prop(w1, b1, w2, b2, w3, b3, x)
        dw1, db1, dw2, db2, dw3, db3 = backward_prop(x, y, a1, a2, a3, w2, w3, z1, z2, m)
        w1, b1, w2, b2, w3, b3 = update_params(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha)
        if i % 10 == 0:
            predictions = get_predictions(a3)
            acc = get_accuracy(predictions, y)
            print(f"Iteration {i}: Accuracy = {acc:.4f}")
    return w1, b1, w2, b2, w3, b3

2025-05-07 01:54:44.463083: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Run training
alpha = 0.1
iterations = 100
w1, b1, w2, b2, w3, b3 = gradient_descent(x_train, y_train, alpha, iterations)

# Evaluate on test data
_, _, _, _, _, a3_test = forward_prop(w1, b1, w2, b2, w3, b3, x_test)
test_preds = get_predictions(a3_test)
test_acc = get_accuracy(test_preds, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

Iteration 0: Accuracy = 0.0564
Iteration 10: Accuracy = 0.1124
Iteration 20: Accuracy = 0.1124
Iteration 30: Accuracy = 0.1124
Iteration 40: Accuracy = 0.1124
Iteration 50: Accuracy = 0.1124
Iteration 60: Accuracy = 0.1124
Iteration 70: Accuracy = 0.1124
Iteration 80: Accuracy = 0.1124
Iteration 90: Accuracy = 0.1124
Test Accuracy: 0.1135
