<a href="https://colab.research.google.com/github/haifeng-jin/Colabs/blob/main/numpy_MLP_backprop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

def initialize_parameters(input_dim, hidden_dim, output_dim):
    """Create the initial float32 parameters of a two-layer MLP.

    Weights are drawn from a standard normal distribution and scaled by 0.01;
    biases start at zero and are stored as row vectors of shape (1, width).

    Returns:
        Dict with keys 'W1', 'b1', 'W2', 'b2'.
    """
    # Draw W1 before W2 so the random stream is consumed in a fixed order.
    first_weights = np.random.randn(input_dim, hidden_dim).astype("float32") * 0.01
    second_weights = np.random.randn(hidden_dim, output_dim).astype("float32") * 0.01

    first_bias = np.zeros((1, hidden_dim), dtype="float32")
    second_bias = np.zeros((1, output_dim), dtype="float32")

    return {
        'W1': first_weights,
        'b1': first_bias,
        'W2': second_weights,
        'b2': second_bias,
    }

# Example usage: size of the toy problem.
num_samples = 5  # Rows in the dummy batch
input_dim = 10  # Features per sample
hidden_dim = 64  # Width of the hidden layer
output_dim = 10  # Number of output classes

# Dummy float32 feature matrix with one row per sample.
X_dummy = np.random.randn(num_samples, input_dim).astype("float32")

def one_hot_encode(labels, num_classes):
    """Convert integer class labels into a one-hot matrix.

    Args:
        labels: Integer class indices, as a NumPy array or any array-like
            (list, tuple). Multi-dimensional input is flattened.
        num_classes: Number of columns in the result.

    Returns:
        Float array of shape (number of labels, num_classes) with a single 1
        per row, in the column given by that row's label.

    Raises:
        IndexError: If any label lies outside ``[0, num_classes)``.
    """
    labels = np.asarray(labels).ravel()
    # Create an array of zeros with shape (number of labels, number of classes)
    one_hot_encoded = np.zeros((labels.size, num_classes))
    if labels.size == 0:
        return one_hot_encoded

    # Reject negatives explicitly: fancy indexing would otherwise count them
    # from the last class and silently produce a wrong encoding.
    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            f"labels must be in [0, {num_classes}); "
            f"got values in [{labels.min()}, {labels.max()}]"
        )

    # Set the appropriate elements to one
    one_hot_encoded[np.arange(labels.size), labels] = 1
    return one_hot_encoded


# Draw one random class index per sample from [0, output_dim); tying the range
# to output_dim keeps the labels valid if the number of classes is changed.
y_dummy_labels = np.random.randint(0, output_dim, size=num_samples)

# One-hot encode these labels to the same width as the network output
y_dummy = one_hot_encode(y_dummy_labels, output_dim)

init_params = initialize_parameters(input_dim, hidden_dim, output_dim)

In [36]:
import copy

def relu(x):
    """Element-wise rectified linear unit: the larger of 0 and each entry of ``x``."""
    floor = 0
    return np.maximum(floor, x)

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating so that large
    inputs do not overflow; the result is unchanged by that shift.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    unnormalized = np.exp(x - row_max)
    row_total = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_total

def cross_entropy_loss(y_true, y_pred):
    """Cross-entropy between targets and predicted probabilities, averaged over rows.

    Args:
        y_true: Target array; its first dimension is the number of samples.
        y_pred: Predicted probabilities with the same shape as ``y_true``.

    Returns:
        The summed ``-y_true * log(y_pred)`` divided by ``y_true.shape[0]``.
    """
    # Keep predictions strictly inside (0, 1) so log() never sees 0.
    epsilon = 1e-12
    safe_pred = np.clip(y_pred, epsilon, 1. - epsilon)

    sample_count = y_true.shape[0]
    total = np.sum(y_true * np.log(safe_pred))
    return -total / sample_count

def forward_pass(X, params):
    """Run the two-layer network on a batch and keep the intermediates.

    Args:
        X: Input batch, one sample per row.
        params: Dict holding 'W1', 'b1', 'W2' and 'b2'.

    Returns:
        A pair ``(A2, cache)`` where ``A2`` is the softmax output and ``cache``
        is the tuple ``(X, Z1, A1, Z2, A2)`` of values computed along the way.
    """
    first_w, first_b = params['W1'], params['b1']
    second_w, second_b = params['W2'], params['b2']

    # Hidden layer: affine map followed by ReLU.
    hidden_pre = np.dot(X, first_w) + first_b
    hidden_act = relu(hidden_pre)

    # Output layer: affine map followed by softmax.
    logits = np.dot(hidden_act, second_w) + second_b
    probabilities = softmax(logits)

    return probabilities, (X, hidden_pre, hidden_act, logits, probabilities)

def backward_pass(y_true, cache, params):
    """Backpropagate through the two-layer MLP and return parameter gradients.

    The output-layer gradient is taken as ``(A2 - y_true) / num_samples``,
    i.e. the gradient of the row-averaged cross-entropy with respect to the
    logits when ``A2`` is the softmax of those logits.

    Args:
        y_true: Targets with one row per sample (same shape as ``A2``).
        cache: Tuple ``(X, Z1, A1, Z2, A2)`` as built by ``forward_pass``.
        params: Parameter dict; only ``'W2'`` is read here.

    Returns:
        Dict with keys 'W1', 'b1', 'W2', 'b2' holding the gradient for the
        parameter of the same name.
    """
    X, Z1, A1, _, A2 = cache  # Z2 is not needed: dZ2 follows from A2 directly
    num_samples = y_true.shape[0]

    def relu_derivative(inputs, gradients):  # dL/dx, y=relu(x), gradients are dL/dy
        # Multiply by the boolean mask directly so the result keeps the dtype
        # of `gradients` instead of being upcast to float64.
        return gradients * (inputs > 0)

    def weight_derivative(inputs, gradients):  # dL/dW, y=Wx+b, gradients are dL/dy
        return inputs.T.dot(gradients)

    def bias_derivative(gradients):  # dL/db, y=Wx+b, gradients are dL/dy
        # The bias is shared by every row, so its gradient is the column sum.
        return np.sum(gradients, axis=0, keepdims=True)

    # Output layer.
    dZ2 = (A2 - y_true) / num_samples
    dW2 = weight_derivative(inputs=A1, gradients=dZ2)
    db2 = bias_derivative(gradients=dZ2)

    # Hidden layer: push the gradient back through W2, then through the ReLU.
    dA1 = dZ2.dot(params['W2'].T)
    dZ1 = relu_derivative(inputs=Z1, gradients=dA1)
    dW1 = weight_derivative(inputs=X, gradients=dZ1)
    db1 = bias_derivative(gradients=dZ1)

    grads = {
        'W1': dW1,
        'b1': db1,
        'W2': dW2,
        'b2': db2
    }
    return grads

def update_parameters(params, gradients, learning_rate=0.1):
    """Apply one gradient-descent step to every parameter, in place.

    Args:
        params: Dict with 'W1', 'b1', 'W2', 'b2'; its arrays are modified.
        gradients: Dict with the same keys holding the gradients.
        learning_rate: Step size multiplying each gradient.

    Returns:
        The same ``params`` dict that was passed in.
    """
    for key in ('W1', 'b1', 'W2', 'b2'):
        params[key] -= learning_rate * gradients[key]
    return params


# Work on a deep copy: update_parameters modifies the arrays it is given in
# place, and init_params should keep the initial weights.
params = copy.deepcopy(init_params)
# One training step: forward pass, loss, backward pass, parameter update
output, cache = forward_pass(X_dummy, params)
loss = cross_entropy_loss(y_dummy, output)
gradients = backward_pass(y_dummy, cache, params)
params = update_parameters(params, gradients)

In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


class MLP(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits.
        params: Dict with 'W1', 'b1', 'W2', 'b2' as NumPy arrays. Weights are
            expected as (in_features, out_features) and are transposed to
            ``nn.Linear``'s (out_features, in_features) layout; biases may be
            row vectors and are flattened to 1-D.

    The arrays in ``params`` are copied, so training the module does not
    modify them.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, params):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, output_dim)

        self.layer1.weight = self._to_parameter(np.transpose(params["W1"]))
        self.layer2.weight = self._to_parameter(np.transpose(params["W2"]))

        self.layer1.bias = self._to_parameter(params["b1"], flatten=True)
        self.layer2.bias = self._to_parameter(params["b2"], flatten=True)
        # Not used anywhere in this class; kept so existing attribute access
        # on instances keeps working.
        self.temp_output = None

    @staticmethod
    def _to_parameter(array, flatten=False):
        """Wrap a copy of a NumPy array in an ``nn.Parameter``."""
        tensor = torch.from_numpy(array)
        if flatten:
            # nn.Linear stores its bias as a 1-D vector of length out_features.
            tensor = tensor.reshape(-1)
        # from_numpy shares memory with `array`; clone so optimizer steps do
        # not write through to the caller's NumPy data.
        return nn.Parameter(tensor.clone())

    def forward(self, x):
        """Return the raw logits for ``x``; no softmax is applied here."""
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        # Softmax is applied in the loss
        return x



In [38]:
def run():
    """Train the NumPy network for 10 full-batch steps on the dummy data.

    Starts from a deep copy of ``init_params``, prints the list of losses
    recorded before each update, and returns that list.
    """
    weights = copy.deepcopy(init_params)
    loss_history = []
    for _ in range(10):
        predictions, cache = forward_pass(X_dummy, weights)
        loss_history.append(cross_entropy_loss(y_dummy, predictions))
        step_gradients = backward_pass(y_dummy, cache, weights)
        weights = update_parameters(weights, step_gradients)
    print(loss_history)
    return loss_history


def run_torch():
    """Train the PyTorch MLP for 10 full-batch SGD steps on the dummy data.

    The model starts from a deep copy of ``init_params``. The loss of every
    step is collected as a Python float; the list is printed and returned.
    """
    # Wrap the NumPy dummy data as tensors.
    features = torch.from_numpy(X_dummy)
    targets = torch.from_numpy(y_dummy)  # one-hot rows produced by one_hot_encode

    net = MLP(input_dim, hidden_dim, output_dim, copy.deepcopy(init_params))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1)

    loss_history = []
    for _ in range(10):
        # Forward pass on the whole batch.
        loss = criterion(net(features), targets)

        # Clear stale gradients, backpropagate, then take the SGD step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_history.append(loss.item())

    print(loss_history)
    return loss_history


def test():
    """Check that the NumPy and PyTorch training runs report matching losses.

    Raises:
        AssertionError: If any pair of per-step losses differs by more than
            the relative tolerance.
    """
    # Parameters and inputs are float32, whose machine epsilon (~1.2e-7) is
    # larger than assert_allclose's default rtol of 1e-7. Use a tolerance that
    # float32 arithmetic can meet reliably while still catching real
    # gradient mismatches.
    np.testing.assert_allclose(run(), run_torch(), rtol=1e-5)

test()

[2.301618242263794, 2.274173402786255, 2.247321891784668, 2.220949649810791, 2.195030164718628, 2.1695616245269775, 2.144471597671509, 2.1196786403656005, 2.0951594352722167, 2.0708685874938966]
[2.3016183376312256, 2.274173450469971, 2.2473219871520995, 2.2209496974945067, 2.1950303077697755, 2.1695616245269775, 2.1444715023040772, 2.1196785449981688, 2.0951594352722167, 2.0708685874938966]
