<a href="https://colab.research.google.com/github/haifeng-jin/Colabs/blob/main/numpy_MLP_backprop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

def initialize_parameters(input_dim, hidden_dim, output_dim):
    """Create the initial float32 weights and biases of a two-layer MLP.

    Weights are drawn from the global NumPy RNG (standard normal, scaled by
    0.01); biases start at zero.

    Args:
        input_dim: Number of input features.
        hidden_dim: Number of hidden units.
        output_dim: Number of output units.

    Returns:
        dict with keys 'W1' (input_dim, hidden_dim), 'b1' (1, hidden_dim),
        'W2' (hidden_dim, output_dim) and 'b2' (1, output_dim).
    """
    weight_scale = 0.01

    # Draw W1 before W2 so the sequence of RNG calls stays the same.
    first_weights = np.random.randn(input_dim, hidden_dim).astype("float32") * weight_scale
    second_weights = np.random.randn(hidden_dim, output_dim).astype("float32") * weight_scale

    first_bias = np.zeros((1, hidden_dim)).astype("float32")
    second_bias = np.zeros((1, output_dim)).astype("float32")

    return {
        'W1': first_weights,
        'b1': first_bias,
        'W2': second_weights,
        'b2': second_bias,
    }

# Example usage
input_dim = 10  # Number of input features
hidden_dim = 64  # Number of neurons in the hidden layer
output_dim = 10  # Number of output classes
num_samples = 5  # Number of samples

# Seed the global NumPy RNG so that the dummy data, the labels and the initial
# weights drawn further down are identical on every Restart & Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Dummy features: one float32 row per sample, one column per input feature
X_dummy = np.random.randn(num_samples, input_dim).astype("float32")

def one_hot_encode(labels, num_classes):
    """Convert integer class labels into a one-hot float matrix.

    Args:
        labels: Array-like of integer class indices (list, tuple or ndarray).
            Inputs with more than one dimension are flattened.
        num_classes: Number of classes, i.e. the number of output columns.

    Returns:
        float64 ndarray of shape (number of labels, num_classes) with a single
        1 per row at the column given by that row's label.

    Raises:
        IndexError: If any label lies outside [0, num_classes). Negative
            labels are rejected explicitly because NumPy indexing would
            otherwise wrap them around silently to the last classes.
    """
    labels = np.asarray(labels).reshape(-1)

    # Create an array of zeros with shape (number of labels, number of classes)
    one_hot_encoded = np.zeros((labels.size, num_classes))
    if labels.size == 0:
        # Nothing to mark; also avoids calling min()/max() on an empty array.
        return one_hot_encoded

    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            f"labels must lie in [0, {num_classes}), got values in "
            f"[{labels.min()}, {labels.max()}]"
        )

    # Set the appropriate elements to one
    one_hot_encoded[np.arange(labels.size), labels] = 1
    return one_hot_encoded


# Generate one random integer label per sample in [0, output_dim), so the
# labels always agree with the number of output classes of the network
y_dummy_labels = np.random.randint(0, output_dim, size=num_samples)

# One-hot encode these labels (one column per output class)
y_dummy = one_hot_encode(y_dummy_labels, output_dim)

# Shared starting weights; each training run below works on its own deep copy
init_params = initialize_parameters(input_dim, hidden_dim, output_dim)

In [121]:
import copy

def relu(x):
    """Rectified linear unit: element-wise max(0, x)."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Derivative of ReLU: 1.0 where x is strictly positive, 0.0 elsewhere.

    The result is a Python-float (float64) array regardless of x's dtype.
    """
    is_positive = x > 0
    return is_positive.astype(float)

def softmax(x):
    """Row-wise softmax of a 2-D array of logits.

    Each row's maximum is subtracted before exponentiating so that large
    logits do not overflow; this does not change the result.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    unnormalized = np.exp(x - row_max)
    row_total = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_total

def cross_entropy_loss(y_true, y_pred):
    """Mean cross-entropy between target rows and predicted probabilities.

    Args:
        y_true: Target distribution per row (e.g. one-hot), shape (m, classes).
        y_pred: Predicted probabilities, same shape as y_true.

    Returns:
        Scalar loss: the summed cross-entropy divided by the number of rows.
    """
    tiny = 1e-12
    batch_size = y_true.shape[0]

    # Keep probabilities inside [tiny, 1 - tiny] so log() never sees zero.
    safe_pred = np.clip(y_pred, tiny, 1. - tiny)

    weighted_log_probs = y_true * np.log(safe_pred)
    return -np.sum(weighted_log_probs) / batch_size

def forward_pass(X, params):
    """Run the MLP forward: affine -> ReLU -> affine -> softmax.

    Args:
        X: Input batch, one row per sample.
        params: dict holding 'W1', 'b1', 'W2' and 'b2'.

    Returns:
        (probabilities, cache) where cache is the tuple
        (X, Z1, A1, Z2, A2) of intermediates in that order.
    """
    hidden_pre = np.dot(X, params['W1']) + params['b1']
    hidden_act = relu(hidden_pre)

    logits = np.dot(hidden_act, params['W2']) + params['b2']
    probabilities = softmax(logits)

    return probabilities, (X, hidden_pre, hidden_act, logits, probabilities)

def backward_pass(y_true, cache, params):
    """Compute batch-averaged gradients for every parameter of the MLP.

    Args:
        y_true: Target rows (e.g. one-hot), one row per sample.
        cache: Tuple (X, Z1, A1, Z2, A2) as produced by the forward pass.
        params: Parameter dict; only 'W2' is read here.

    Returns:
        dict with keys 'W1', 'b1', 'W2', 'b2' holding the gradients, each
        divided by the number of rows in y_true.
    """
    X, Z1, A1, _, A2 = cache  # Z2 is not needed for the gradients
    batch_size = y_true.shape[0]

    # Output layer: the error signal is (prediction - target), which is the
    # gradient w.r.t. the logits when A2 is a softmax trained with
    # cross-entropy.
    delta_out = A2 - y_true
    grad_W2 = np.dot(A1.T, delta_out) / batch_size
    grad_b2 = delta_out.sum(axis=0, keepdims=True) / batch_size

    # Hidden layer: push the error back through W2, then gate it with the
    # ReLU derivative evaluated at the pre-activation.
    delta_hidden = np.dot(delta_out, params['W2'].T) * relu_derivative(Z1)
    grad_W1 = np.dot(X.T, delta_hidden) / batch_size
    grad_b1 = delta_hidden.sum(axis=0, keepdims=True) / batch_size

    return {
        'W1': grad_W1,
        'b1': grad_b1,
        'W2': grad_W2,
        'b2': grad_b2,
    }

def update_parameters(params, gradients, learning_rate=0.1):
    """Apply one gradient-descent step to every parameter, in place.

    Args:
        params: dict with 'W1', 'b1', 'W2', 'b2'; its arrays are modified.
        gradients: dict with the same keys holding the gradients.
        learning_rate: Step size.

    Returns:
        The same `params` dict that was passed in.
    """
    for name in ('W1', 'b1', 'W2', 'b2'):
        # Augmented assignment updates the existing array instead of
        # rebinding the key to a new one.
        params[name] -= learning_rate * gradients[name]
    return params

# Work on a private copy: update_parameters modifies the arrays in place, and
# init_params is reused later as the starting point for another model.
params = copy.deepcopy(init_params)

# Train for a few epochs (full-batch gradient descent on the dummy data,
# using update_parameters' default learning rate)
for epoch in range(10):
    output, cache = forward_pass(X_dummy, params)
    # The loss is measured before this epoch's parameter update is applied
    loss = cross_entropy_loss(y_dummy, output)
    gradients = backward_pass(y_dummy, cache, params)
    params = update_parameters(params, gradients)
    print(f"Epoch {epoch}, Loss: {loss}")

Epoch 0, Loss: 2.3016831398010256
Epoch 1, Loss: 2.2907074451446534
Epoch 2, Loss: 2.279914140701294
Epoch 3, Loss: 2.2692851066589355
Epoch 4, Loss: 2.2587650775909425
Epoch 5, Loss: 2.2483367919921875
Epoch 6, Loss: 2.2379981517791747
Epoch 7, Loss: 2.22768874168396
Epoch 8, Loss: 2.2173813819885253
Epoch 9, Loss: 2.2070613384246824


In [123]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


class MLP(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) seeded from NumPy weights.

    Args:
        input_dim: Number of input features.
        hidden_dim: Number of hidden units.
        output_dim: Number of output logits.
        params: dict with NumPy arrays 'W1' (input_dim, hidden_dim),
            'b1' (1, hidden_dim), 'W2' (hidden_dim, output_dim) and
            'b2' (1, output_dim). The arrays are copied, so training the
            model never writes back into them.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, params):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, output_dim)

        # The NumPy weights are stored as (in, out) while nn.Linear keeps
        # (out, in), hence the transpose.
        self.layer1.weight = nn.Parameter(self._to_tensor(np.transpose(params["W1"])))
        self.layer2.weight = nn.Parameter(self._to_tensor(np.transpose(params["W2"])))

        # nn.Linear expects a 1-D bias of shape (out_features,); the NumPy
        # biases carry a leading axis of length 1, so flatten them.
        self.layer1.bias = nn.Parameter(self._to_tensor(np.reshape(params["b1"], -1)))
        self.layer2.bias = nn.Parameter(self._to_tensor(np.reshape(params["b2"], -1)))
        self.temp_output = None  # not used by this class; kept for compatibility

    @staticmethod
    def _to_tensor(array):
        """Return a tensor backed by a fresh C-contiguous copy of `array`.

        torch.from_numpy shares memory with its argument, so copying first
        keeps optimizer steps from modifying the caller's NumPy arrays and
        gives the parameter a contiguous layout even for transposed views.
        """
        return torch.from_numpy(np.array(array, order="C", copy=True))

    def forward(self, x):
        """Return the raw logits for a batch `x` of shape (N, input_dim)."""
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        # Softmax is applied in the loss
        return x
# Create the model from a private copy of the shared initial weights, so that
# init_params itself is left untouched
model = MLP(input_dim, hidden_dim, output_dim, copy.deepcopy(init_params))

# Reuse the same dummy features and one-hot targets as the NumPy run.
# torch.from_numpy does not copy: the tensors share memory with the arrays.
X_dummy_torch = torch.from_numpy(X_dummy)
y_dummy_torch = torch.from_numpy(y_dummy)  # One-hot targets, one row per sample

# Loss function and optimizer

# Plain SGD with lr=0.1, the same step size update_parameters uses by default
optimizer = optim.SGD(model.parameters(), lr=0.1)
# CrossEntropyLoss takes raw logits (softmax is applied inside the loss); the
# floating-point one-hot targets are passed as per-class probabilities
criterion = nn.CrossEntropyLoss()

# Train for a few epochs (same epoch count as the NumPy loop)
for epoch in range(10):
    # Forward pass
    logits = model(X_dummy_torch)
    loss = criterion(logits, y_dummy_torch)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # The printed loss was computed before this epoch's optimizer step
    print(f'Epoch {epoch}, Loss: {loss.item()}')


Epoch 0, Loss: 2.3016831398010256
Epoch 1, Loss: 2.2907073974609373
Epoch 2, Loss: 2.279914140701294
Epoch 3, Loss: 2.26928505897522
Epoch 4, Loss: 2.258764934539795
Epoch 5, Loss: 2.2483366966247558
Epoch 6, Loss: 2.2379981517791747
Epoch 7, Loss: 2.227688789367676
Epoch 8, Loss: 2.2173813343048097
Epoch 9, Loss: 2.207061290740967
