In [69]:
import numpy as np

In [70]:
# Fixed seed so the shuffle below (and every downstream result) is
# reproducible after "Restart Kernel -> Run All".
SEED = 42
rng = np.random.default_rng(SEED)

data = np.load("mnist.npz")

# Convert to float32, flatten every image into one feature vector, and
# transpose to the (num_features, num_samples) layout used by the model.
# reshape(n, -1) takes the sample and feature counts from the file itself
# instead of hard-coding them. Dividing by 255.0 rescales the pixels
# (assumes 8-bit intensities, so features end up in [0, 1]).
X_train = data["x_train"].astype(np.float32)
X_train = X_train.reshape(X_train.shape[0], -1).T / 255.0
y_train = data["y_train"].astype(int)

X_test = data["x_test"].astype(np.float32)
X_test = X_test.reshape(X_test.shape[0], -1).T / 255.0
y_test = data["y_test"].astype(int)

# Every sample column must have exactly one label.
assert X_train.shape[1] == y_train.shape[0], "train images/labels count mismatch"
assert X_test.shape[1] == y_test.shape[0], "test images/labels count mismatch"

# Shuffle the training set; the same permutation is applied to the columns
# of X_train and to y_train so samples stay aligned with their labels.
perm = rng.permutation(X_train.shape[1])
X_train = X_train[:, perm]
y_train = y_train[perm]

# Trailing comments show the shapes expected for the standard MNIST split.
print("X_train shape:", X_train.shape)   # (784, 60000)
print("X_test  shape:", X_test.shape)    # (784, 10000)
print("y_train shape:", y_train.shape)   # (60000,)
print("y_test  shape:", y_test.shape)    # (10000,)

X_train shape: (784, 60000)
X_test  shape: (784, 10000)
y_train shape: (60000,)
y_test  shape: (10000,)


In [71]:
# Kiệt
# =========================
# 1) Softmax function
# =========================
def softmax(z):
    """Apply the softmax function along axis 0.

    Parameters
    ----------
    z : array-like
        Scores laid out as (num_classes, num_samples), i.e. one column per
        sample.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``z`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    # Shifting each column by its own maximum does not change the result
    # mathematically, but it keeps np.exp from overflowing on large scores.
    col_max = np.max(z, axis=0, keepdims=True)
    numerators = np.exp(z - col_max)
    denominators = np.sum(numerators, axis=0, keepdims=True)
    return numerators / denominators

# =========================
# 2) Cross entropy loss
# =========================
def cross_entropy_loss(y_true, y_pred):
    """Mean cross-entropy between one-hot targets and predicted probabilities.

    Parameters
    ----------
    y_true : numpy.ndarray, shape (num_classes, num_samples)
        One-hot encoded labels, one column per sample.
    y_pred : numpy.ndarray, shape (num_classes, num_samples)
        Predicted class probabilities (e.g. the output of ``softmax``).

    Returns
    -------
    float
        Total cross-entropy divided by the number of columns of ``y_true``.
    """
    n_samples = y_true.shape[1]
    # The small constant keeps log() finite when a probability is exactly 0.
    log_probs = np.log(y_pred + 1e-8)
    total = np.sum(y_true * log_probs)
    return -total / n_samples

# =========================
# 3) Gradient descent update
# =========================
def train_softmax(X, y, num_classes, lr=0.1, epochs=1000, log_every=100):
    """Fit a softmax regression classifier with full-batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray, shape (num_features, num_samples)
        Training inputs, one column per sample.
    y : array-like of int, shape (num_samples,)
        Class index of every sample. Labels must be non-negative; a label
        ``>= num_classes`` makes NumPy raise ``IndexError`` while the
        one-hot matrix is built.
    num_classes : int
        Number of output classes (rows of ``W`` and ``b``).
    lr : float, default 0.1
        Learning rate of the gradient step.
    epochs : int, default 1000
        Number of full passes (gradient steps) over the data.
    log_every : int or None, default 100
        Print the loss whenever ``epoch % log_every == 0``. ``0`` or ``None``
        turns logging off, in which case the loss is never computed.

    Returns
    -------
    W : numpy.ndarray, shape (num_classes, num_features)
        Learned weights.
    b : numpy.ndarray, shape (num_classes, 1)
        Learned biases.

    Raises
    ------
    ValueError
        If ``y`` contains a negative label.
    """
    m = X.shape[1]

    # NumPy index arrays wrap negative values around, so a label such as -1
    # would silently be encoded as the last class. Reject it explicitly.
    y = np.asarray(y)
    if y.size and y.min() < 0:
        raise ValueError("labels must be non-negative class indices")

    # Convert labels to one-hot: column j gets a 1 in row y[j].
    y_one_hot = np.zeros((num_classes, m))
    y_one_hot[y, np.arange(m)] = 1

    # Initialize weights and bias to zero.
    W = np.zeros((num_classes, X.shape[0]))
    b = np.zeros((num_classes, 1))

    for epoch in range(epochs):
        # Forward pass: class scores, then per-column probabilities.
        Z = np.dot(W, X) + b
        A = softmax(Z)

        # The loss is only needed for reporting, so evaluate it just on the
        # epochs that are logged. It reflects the parameters before this
        # epoch's update.
        if log_every and epoch % log_every == 0:
            loss = cross_entropy_loss(y_one_hot, A)
            print(f"Epoch {epoch} — Loss = {loss:.4f}")

        # Backward pass: gradient of the mean cross-entropy w.r.t. Z, W, b.
        dZ = A - y_one_hot                  # (num_classes, m)
        dW = (1/m) * np.dot(dZ, X.T)        # (num_classes, num_features)
        db = (1/m) * np.sum(dZ, axis=1, keepdims=True)

        # Gradient descent step.
        W -= lr * dW
        b -= lr * db

    return W, b


In [72]:
def predict(W, b, X):
    """Return the predicted class index for every column of ``X``.

    Parameters
    ----------
    W : numpy.ndarray, shape (num_classes, num_features)
        Weight matrix.
    b : numpy.ndarray, shape (num_classes, 1)
        Bias column, broadcast over the samples.
    X : numpy.ndarray, shape (num_features, num_samples)
        Inputs, one column per sample.

    Returns
    -------
    numpy.ndarray, shape (num_samples,)
        Row index of the largest score in each column.
    """
    Z = np.dot(W, X) + b
    # Softmax is increasing within a column, so the largest logit is also the
    # most probable class. Taking argmax on the logits skips the exp/normalise
    # work and avoids ties created when two nearly equal logits round to the
    # same probability.
    return np.argmax(Z, axis=0)

def accuracy(y_true, y_pred):
    """Fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels, with the same number of elements as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the element-wise equality.

    Raises
    ------
    ValueError
        If the two inputs hold a different number of elements.
    """
    # Plain Python lists compare as a whole with ``==`` (a single bool), so
    # convert first to get an element-wise comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        if y_true.size != y_pred.size:
            raise ValueError(
                f"y_true has {y_true.size} elements but y_pred has {y_pred.size}"
            )
        # Same element count but different layout, e.g. (n, 1) vs (n,):
        # flatten both so ``==`` cannot broadcast to an (n, n) matrix.
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()
    return np.mean(y_true == y_pred)

# Number of target classes passed to the trainer.
num_classes = 10

# Fit the softmax classifier on the training split.
W, b = train_softmax(
    X_train,
    y_train,
    num_classes,
    lr=0.1,
    epochs=1000,
)

# Score the learned parameters on the held-out test split.
y_pred_test = predict(W, b, X_test)
acc_test = accuracy(y_test, y_pred_test)
print("Test accuracy:", acc_test)

Epoch 0 — Loss = 2.3026
Epoch 100 — Loss = 0.6087
Epoch 200 — Loss = 0.4894
Epoch 300 — Loss = 0.4404
Epoch 400 — Loss = 0.4122
Epoch 500 — Loss = 0.3933
Epoch 600 — Loss = 0.3794
Epoch 700 — Loss = 0.3687
Epoch 800 — Loss = 0.3602
Epoch 900 — Loss = 0.3531
Test accuracy: 0.9096
