In [4]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Download MNIST from OpenML as plain NumPy arrays (as_frame=False) rather
# than a pandas DataFrame.
mnist = fetch_openml('mnist_784', as_frame=False, version=1)

# Features: cast to float32 first, then divide by 255 (presumably mapping
# 0-255 pixel intensities into [0, 1] -- confirm against the dataset).
X = mnist.data.astype(np.float32) / 255.0
# Targets: parse the labels as integers and lay them out as a single column,
# the 2-D layout the encoder below is fitted on.
y = mnist.target.astype(int).reshape(-1, 1)

# Dense (non-sparse) one-hot matrix with one column per distinct label.
encoder = OneHotEncoder(categories='auto', sparse_output=False)
y_encoded = encoder.fit_transform(y)


In [14]:
# Sanity-check the dimensions of the features, raw labels and one-hot labels.
print(*(arr.shape for arr in (X, y, y_encoded)))

(70000, 784) (70000, 1) (70000, 10)


In [12]:
# Quick look at the scaled pixel matrix; NumPy abbreviates large arrays, so
# only the corner values are shown.
print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
# Hold out 20% of the samples; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

In [17]:
# Layer widths: 784 inputs -> 64 hidden units -> 10 outputs.
input_size, hidden_size, output_size = 784, 64, 10

# Seed NumPy's global RNG so the initial weights are the same on every run.
# The order of the two randn() draws below matters for reproducibility.
np.random.seed(42)

# Layer 1: Gaussian weights scaled by sqrt(1 / fan_in), zero biases.
W1 = np.sqrt(1.0 / input_size) * np.random.randn(input_size, hidden_size)
b1 = np.zeros(shape=(1, hidden_size))

# Layer 2: same scheme, with the hidden width as fan_in.
W2 = np.sqrt(1.0 / hidden_size) * np.random.randn(hidden_size, output_size)
b2 = np.zeros(shape=(1, output_size))

In [18]:
def relu(z):
    """Rectified linear unit: element-wise max(0, z).

    Negative entries become 0; non-negative entries pass through unchanged.
    """
    floor = 0
    return np.maximum(floor, z)

def relu_derivative(z):
    """Element-wise derivative of ReLU with respect to its input.

    Args:
        z: Pre-activation values. Any array-like is accepted (ndarray, nested
            list, scalar), matching what `relu` accepts via `np.maximum`.

    Returns:
        Float array of the same shape as `z`: 1.0 where z > 0, else 0.0.
        The derivative at exactly 0 is taken to be 0.
    """
    # np.asarray is a no-op for ndarrays; it lets lists and scalars through
    # instead of failing on `list > int` or on a missing `.astype`.
    return (np.asarray(z) > 0).astype(float)

def softmax(z):
    """Row-wise softmax of a 2-D score matrix.

    Each row's maximum is subtracted before exponentiating so that large
    scores do not overflow; this shift does not change the result.
    Normalisation is along axis 1, so every row of the output sums to 1.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    unnormalized = np.exp(z - row_max)
    partition = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / partition

def cross_entropy(preds, labels, eps=1e-8):
    """Mean categorical cross-entropy between predictions and one-hot labels.

    Args:
        preds: Predicted class probabilities, one row per sample.
        labels: One-hot (or soft) target distribution, same shape as `preds`.
        eps: Lower bound applied to `preds` before the log so that a
            probability of exactly 0 yields a large finite loss instead of
            -inf.

    Returns:
        Scalar loss: the per-sample sum of -labels * log(preds), averaged over
        rows.
    """
    # Clip instead of adding eps inside the log: log(p + eps) is > 0 when
    # p == 1, which made the loss slightly negative for perfect predictions
    # and biased every term. Clipping leaves probabilities in [eps, 1] as-is.
    safe_preds = np.clip(preds, eps, 1.0)
    return -np.mean(np.sum(labels * np.log(safe_preds), axis = 1))

In [21]:
def train(X, y, epochs = 100, lr = 0.1, X_val = None, y_val = None):
    """Train the two-layer network with full-batch gradient descent.

    Every epoch runs one forward pass over all of `X`, backpropagates the
    softmax cross-entropy loss and takes a single gradient step.

    The parameters are the module-level arrays W1, b1, W2, b2, and they are
    updated IN PLACE. Calling this function again therefore continues training
    from the current weights; re-run the initialisation cell to start over.

    Args:
        X: Training inputs, one row per sample.
        y: One-hot training targets, one row per sample.
        epochs: Number of full-batch updates to perform.
        lr: Gradient-descent step size.
        X_val, y_val: Optional data used only for the accuracy shown in the
            progress line. They must be given together. When omitted, the
            module-level `X_test` / `y_test` are used (the original
            behaviour), so the reported "Val Accuracy" is then measured on
            the test split.

    Raises:
        ValueError: If only one of `X_val` / `y_val` is supplied.
    """
    global W1, b1, W2, b2

    if (X_val is None) != (y_val is None):
        raise ValueError("X_val and y_val must be provided together")

    # Loop-invariant: batch size used to average the weight gradients.
    n_samples = X.shape[0]

    for epoch in range(1, epochs + 1):
        #forward
        Z1 = X @ W1 + b1
        A1 = relu(Z1)
        Z2 = A1 @ W2 + b2
        A2 = softmax(Z2)

        #loss (computed with the weights as they were BEFORE this epoch's update)
        loss = cross_entropy(A2, y)

        #backward
        # For softmax + cross-entropy the gradient w.r.t. the logits is
        # simply (probabilities - targets).
        dZ2 = A2 - y
        dW2 = A1.T @ dZ2 / n_samples
        db2 = np.mean(dZ2, axis = 0, keepdims = True)

        dA1 = dZ2 @ W2.T
        dZ1 = dA1 * relu_derivative(Z1)
        dW1 = X.T @ dZ1 / n_samples
        db1 = np.mean(dZ1, axis=0, keepdims = True)

        #update weights (in place on the global arrays)
        W1 -= lr * dW1
        W2 -= lr * dW2
        b1 -= lr * db1
        b2 -= lr * db2

        # Report on the first epoch and then every 10th; accuracy reflects
        # the weights AFTER this epoch's update.
        if epoch % 10 == 0 or epoch == 1:
            if X_val is None:
                # Backward-compatible fallback to the notebook-level hold-out.
                acc = evaluate(X_test, y_test)
            else:
                acc = evaluate(X_val, y_val)
            print(f"Epoch {epoch}: Loss = {loss:.4f}, Val Accuracy = {acc:.4f}")

def evaluate(X, y):
    """Classification accuracy of the current global weights on (X, y).

    Runs the same forward pass as training (ReLU hidden layer, softmax output)
    using the module-level W1, b1, W2, b2, then compares the arg-max class of
    each prediction with the arg-max of the corresponding one-hot row of `y`.

    Returns:
        Fraction of rows whose predicted class matches the label.
    """
    hidden = relu(X @ W1 + b1)
    probs = softmax(hidden @ W2 + b2)

    predicted = np.argmax(probs, axis = 1)
    expected = np.argmax(y, axis = 1)
    hits = predicted == expected
    return np.mean(hits)


In [29]:
# Run 100 full-batch epochs at step size 0.1 on the training split.
# NOTE(review): train() updates the global weights in place, so re-running
# this cell continues from the current weights. The recorded output already
# starts at a low loss, which suggests the cell had been run before -- re-run
# the initialisation cell first for a from-scratch result.
train(X_train, y_train, lr=0.1, epochs=100)

Epoch 1: Loss = 0.4054, Val Accuracy = 0.8913
Epoch 10: Loss = 0.3984, Val Accuracy = 0.8923
Epoch 20: Loss = 0.3913, Val Accuracy = 0.8934
Epoch 30: Loss = 0.3849, Val Accuracy = 0.8954
Epoch 40: Loss = 0.3789, Val Accuracy = 0.8965
Epoch 50: Loss = 0.3735, Val Accuracy = 0.8978
Epoch 60: Loss = 0.3685, Val Accuracy = 0.8991
Epoch 70: Loss = 0.3638, Val Accuracy = 0.8999
Epoch 80: Loss = 0.3594, Val Accuracy = 0.9008
Epoch 90: Loss = 0.3553, Val Accuracy = 0.9017
Epoch 100: Loss = 0.3514, Val Accuracy = 0.9026


In [32]:
# Accuracy of the current global weights on the held-out split, shown as a
# percentage with four decimals.
print(f"Final Accuracy: {evaluate(X_test, y_test) * 100:.4f}%")

Final Accuracy: 90.2571%
