In [1]:
import numpy as np

In [2]:
class Node:
    """A value in a computation graph with reverse-mode gradient tracking.

    Attributes (all set in ``__init__``):
        value:  float32 NumPy array holding the forward result.
        grad:   float32 array with the same shape as ``value``; accumulates
                every gradient pushed into this node by ``backward``.
        inputs: list of ``(input_node, grad_fn)`` pairs, where ``grad_fn``
                maps this node's upstream gradient to the gradient for
                ``input_node``. Empty for leaf nodes.
    """

    def __init__(self, value):
        self.value = np.array(value, dtype=np.float32)
        self.grad = np.zeros_like(self.value, np.float32)
        self.inputs = []

    @staticmethod
    def _as_node(operand):
        """Return ``operand`` unchanged if it is a Node, else wrap it as a constant leaf."""
        return operand if isinstance(operand, Node) else Node(operand)

    @staticmethod
    def _sum_to_shape(grad, shape):
        """Undo NumPy broadcasting by summing ``grad`` down to ``shape``.

        When a small operand (e.g. a ``(1, n)`` bias) is broadcast against a
        larger one in the forward pass, the gradient flowing back has the
        larger shape. The correct gradient for the small operand is the sum
        over every axis that was added or stretched by broadcasting.
        """
        grad = np.asarray(grad)
        if grad.shape == shape:
            return grad
        extra_dims = grad.ndim - len(shape)
        if extra_dims < 0:
            # Fewer dimensions than the value: ordinary broadcasting during
            # accumulation already handles this case.
            return grad
        if extra_dims > 0:
            # Leading axes that broadcasting prepended.
            grad = np.asarray(grad.sum(axis=tuple(range(extra_dims))))
        stretched = tuple(
            axis
            for axis, (have, want) in enumerate(zip(grad.shape, shape))
            if want == 1 and have != 1
        )
        if stretched:
            # Size-1 axes that broadcasting stretched.
            grad = grad.sum(axis=stretched, keepdims=True)
        return grad

    def zero_grad(self):
        """Reset ``grad`` to zeros on this node and, recursively, on all its inputs."""
        self.grad = np.zeros_like(self.value)
        for source, _ in self.inputs:
            source.zero_grad()

    def backward(self, upstream_grad=None):
        """Accumulate ``upstream_grad`` into ``grad`` and propagate to the inputs.

        Args:
            upstream_grad: gradient of the final output with respect to this
                node. Defaults to an array of ones shaped like ``value``
                (i.e. this node is treated as the output being
                differentiated). A gradient that is larger than ``value``
                because of forward-pass broadcasting is summed back to
                ``value.shape`` first.
        """
        if upstream_grad is None:
            upstream_grad = np.ones_like(self.value, np.float32)
        else:
            upstream_grad = self._sum_to_shape(upstream_grad, self.value.shape)

        # Accumulate rather than overwrite: a node may be reached through
        # several paths, and each path contributes additively.
        self.grad = self.grad + upstream_grad
        for source, grad_fn in self.inputs:
            source.backward(grad_fn(upstream_grad))

    def __add__(self, other):
        return Add(self, self._as_node(other))

    def __sub__(self, other):
        return Subtract(self, self._as_node(other))

    def __mul__(self, other):
        return Multiply(self, self._as_node(other))

    def __matmul__(self, other):
        return MatMultiply(self, self._as_node(other))


class Add(Node):
    """Element-wise sum ``a + b`` of two nodes.

    Both partial derivatives are 1, so each input receives the upstream
    gradient unchanged.
    """

    def __init__(self, a, b):
        super().__init__(a.value + b.value)

        def pass_through(upstream):
            return upstream

        self.inputs = [(operand, pass_through) for operand in (a, b)]


class Subtract(Node):
    """Element-wise difference ``a - b`` of two nodes.

    d(a - b)/da = +1 and d(a - b)/db = -1, so the upstream gradient is
    passed to ``a`` unchanged and to ``b`` negated.
    """

    def __init__(self, a, b):
        super().__init__(a.value - b.value)
        self.inputs = [
            (a, lambda upstream: upstream),
            # The subtrahend contributes with a negative sign.
            (b, lambda upstream: -upstream),
        ]


class Multiply(Node):
    """Element-wise product ``a * b`` of two nodes.

    By the product rule each input's gradient is the upstream gradient
    scaled by the *other* input's value.
    """

    def __init__(self, a, b):
        super().__init__(a.value * b.value)

        def grad_wrt_a(upstream):
            return upstream * b.value

        def grad_wrt_b(upstream):
            return upstream * a.value

        self.inputs = [(a, grad_wrt_a), (b, grad_wrt_b)]


class MatMultiply(Node):
    """Matrix product ``a @ b`` of two nodes.

    For ``C = A @ B`` with upstream gradient ``G``: the gradient for ``A``
    is ``G @ B.T`` and the gradient for ``B`` is ``A.T @ G``.
    """

    def __init__(self, a, b):
        super().__init__(a.value @ b.value)

        def grad_wrt_left(upstream):
            return upstream @ b.value.T

        def grad_wrt_right(upstream):
            return a.value.T @ upstream

        self.inputs = [(a, grad_wrt_left), (b, grad_wrt_right)]


class ReLU(Node):
    """Rectified linear unit: ``max(0, x)`` applied element-wise."""

    def __init__(self, x):
        super().__init__(np.maximum(0, x.value))

        def gate(upstream):
            # Gradient flows only where the input was strictly positive.
            active = (x.value > 0).astype(np.float32)
            return upstream * active

        self.inputs = [(x, gate)]

class Transpose(Node):
    """Transpose of a node's value (``x.value.T``)."""

    def __init__(self, x):
        super().__init__(x.value.T)

        def transpose_back(upstream):
            # Transposing the gradient restores the input's orientation.
            return upstream.T

        self.inputs = [(x, transpose_back)]


def mse_loss(y_pred, y_true):
    """Return the squared-error loss node ``(y_pred - y_true)^T @ (y_pred - y_true)``.

    Note that the result is NOT divided by the number of samples: for
    column-vector inputs it is the sum of squared errors, so callers that
    want a mean must divide by the sample count themselves.
    """
    residual = y_pred - y_true
    return MatMultiply(Transpose(residual), residual)

In [30]:
# Synthetic regression problem: learn f(x0, x1) = cos(x0) + sin(x1).
n_samples = 100000

# Inputs are n_samples x 2 standard-normal draws; targets are a column vector.
X = Node(np.random.randn(n_samples, 2))
y = Node((np.cos(X.value[:, 0]) + np.sin(X.value[:, 1]))[:, None])


# Hyperparameters.
hidden_layer_dim = 10
learning_rate = 0.0001
epochs = 10000
batch_size = 100

# Parameters of a network with one hidden ReLU layer, all initialised from a
# standard normal: (2 -> hidden_layer_dim) then (hidden_layer_dim -> 1).
weights_1 = Node(np.random.randn(X.value.shape[1], hidden_layer_dim))
biases_1 = Node(np.random.randn(1, hidden_layer_dim))
weights_2 = Node(np.random.randn(hidden_layer_dim, 1))
biases_2 = Node(np.random.randn(1, 1))
# Each "epoch" here is a single gradient step on one random mini-batch.
for epoch in range(epochs):
    # Draw batch_size distinct row indices for this step.
    random_indices = np.random.choice(X.value.shape[0], batch_size, replace=False)
    # Forward pass; the (1, n) biases are broadcast across the batch rows.
    out_1 = ReLU(Node(X.value[random_indices]) @ weights_1 + biases_1)
    out_2 = out_1 @ weights_2 + biases_2
    loss = mse_loss(out_2, Node(y.value[random_indices]))
    # The parameter nodes persist across iterations, so clear the gradients
    # left over from the previous step before back-propagating.
    loss.zero_grad()
    loss.backward()
    # Report the loss divided by the batch size (printed as a 1x1 array).
    print(f"Epoch {epoch}: {loss.value / batch_size}")
    
    # Plain gradient-descent step, applied in place to the parameter arrays.
    weights_1.value -= learning_rate * weights_1.grad
    weights_2.value -= learning_rate * weights_2.grad
    # Summing over axis 0 collapses any per-sample rows the bias gradient
    # picked up from being broadcast across the batch in the forward pass.
    biases_1.value -= learning_rate * biases_1.grad.sum(axis=0)
    biases_2.value -= learning_rate * biases_2.grad.sum(axis=0)


Epoch 0: [[7.922997]]
Epoch 1: [[6.577794]]
Epoch 2: [[5.2824397]]
Epoch 3: [[4.422083]]
Epoch 4: [[4.4929724]]
Epoch 5: [[3.208303]]
Epoch 6: [[2.354987]]
Epoch 7: [[2.291008]]
Epoch 8: [[2.0731483]]
Epoch 9: [[1.4412616]]
Epoch 10: [[1.4339746]]
Epoch 11: [[1.2813427]]
Epoch 12: [[1.7153064]]
Epoch 13: [[0.80591583]]
Epoch 14: [[1.2691098]]
Epoch 15: [[0.77408195]]
Epoch 16: [[0.9256427]]
Epoch 17: [[0.8804483]]
Epoch 18: [[0.74042434]]
Epoch 19: [[0.70030457]]
Epoch 20: [[0.76951677]]
Epoch 21: [[0.4906487]]
Epoch 22: [[0.69079095]]
Epoch 23: [[0.55185395]]
Epoch 24: [[0.6223375]]
Epoch 25: [[0.595105]]
Epoch 26: [[0.5026137]]
Epoch 27: [[0.5764327]]
Epoch 28: [[0.488371]]
Epoch 29: [[0.4465042]]
Epoch 30: [[0.45703]]
Epoch 31: [[0.4659858]]
Epoch 32: [[0.51703894]]
Epoch 33: [[0.39132103]]
Epoch 34: [[0.44427475]]
Epoch 35: [[0.43491477]]
Epoch 36: [[0.44502082]]
Epoch 37: [[0.42015207]]
Epoch 38: [[0.6556569]]
Epoch 39: [[0.37056968]]
Epoch 40: [[0.3876309]]
Epoch 41: [[0.44624025

In [33]:
# Sanity check: run the trained network's forward pass on 5 fresh
# standard-normal inputs and display the predictions.
test_x = Node(np.random.randn(5, 2))
out_1 = ReLU(test_x @ weights_1 + biases_1)
out_2 = out_1 @ weights_2 + biases_2
out_2.value


array([[1.6113757 ],
       [0.93700707],
       [0.7134111 ],
       [0.03433641],
       [1.0218027 ]], dtype=float32)

In [34]:
# Ground-truth targets for test_x, computed with the same cos(x0) + sin(x1)
# formula used to build the training targets, for comparison with out_2.value.
(np.cos(test_x.value[:,0]) + np.sin(test_x.value[:,1]))[:, None]

array([[1.5448935 ],
       [0.8595258 ],
       [0.70992374],
       [0.06929165],
       [0.9546307 ]], dtype=float32)