In [1]:
def sort_nodes(node, order=None, visited=None):
    # topological sort
    if order is None:
        order = []
    if visited is None:
        visited = set()
    
    if node not in visited:
        visited.add(node)
        for p in node.parents:
            sort_nodes(p, order=order, visited=visited)
        order.append(node)
    
    return order


class Node:
    def __init__(
        self,
        value=None,
        gradient=0,
        parents=None,
        children=None,
        label=None,
    ):
        if parents is None:
            self.parents = []
        else:
            self.parents = parents
        
        for p in self.parents:
            p.children.append(self)
        
        if children is None:
            self.children = []
        else:
            self.children = children
        self.gradient = 0
        self.value = None
        self.label = label
    
    def _forward(self):
        # modifies self.value using parent(s).value
        if self.value is None:
            raise ValueError("Node value not set")
    
    def _backward(self):
        # modifies parent(s).gradient using self.gradient
        if len(self.parents) > 0:
            raise ValueError("Node has parents but no _backward implementation")

    def __repr__(self):
        rep = f"{self.label}: " if self.label is not None else ""
        return rep + f"{type(self).__name__}(value={self.value}, gradient={self.gradient})"

    def forward(self):
        ordered_nodes = sort_nodes(self)
        for n in ordered_nodes:
            n._forward()

    def backward(self):
        ordered_nodes = sort_nodes(self)
        self.gradient = 1 # gradient with respect to self
        for n in reversed(ordered_nodes):
            n._backward()


In [2]:
from math import prod
import numpy as np


class Add(Node):
    def _forward(self):
        self.value = sum(p.value for p in self.parents)

    def _backward(self):
        # add propagates gradient backwards (dx/dx = 1)
        for p in self.parents:
            p.gradient += self.gradient


class Multiply(Node):
    def _forward(self):
        self.value = prod(p.value for p in self.parents)

    def _backward(self):
        # multiply propagates value of other backwards (d(xy)/dx = y)
        for p in self.parents:
            p.gradient += (self.value / p.value) * self.gradient


class Sigmoid(Node):
    def _forward(self):
        # sigmoid(x) = 1 / (1 + exp(-x))
        self.value = 1 / (1 + np.exp(-self.parents[0].value))

    def _backward(self):
        #  derivative of sigmoid is sigmoid(x) * (1 - sigmoid(x))
        for p in self.parents:
            p.gradient += (self.value * (1 - self.value)) * self.gradient


class LogLoss(Node):
    def _forward(self):
        # L = -y * np.log(yhat) - (1 - y) * np.log(1 - yhat)  # log loss
        self.value = -self.parents[0].value * np.log(self.parents[1].value) - (
            1 - self.parents[0].value
        ) * np.log(1 - self.parents[1].value)

    def _backward(self):
        # dL_dyhat = -y / yhat + (1 - y) / (1 - yhat)
        self.parents[1].gradient += (self.parents[0].value / self.parents[1].value) + (
            1 - self.parents[0].value
        ) / (1 - self.parents[1].value)

In [3]:
x1 = Node(label="x1")
x2 = Node(label="x2")

w1 = Node(label="w1")
w2 = Node(label="w2")

w1x1 = Multiply(parents=[w1, x1], label="w1x1")
w2x2 = Multiply(parents=[w2, x2], label="w2x2")

b = Node(label="b")

z = Add(parents=[w1x1, w2x2, b], label="z")

yhat = Sigmoid(parents=[z], label="yhat")

y = Node(label="y")

L = LogLoss(parents=[y, yhat], label="L")

In [4]:
x1.value = 1.5
x2.value = 2.5
w1.value = -4
w2.value = 3
b.value = -1.0
y.value = 1


In [8]:
print(x1)
print(x2)
print(w1)
print(w2)
print(w1x1)
print(w2x2)
print(b)
print(z)
print(yhat)
print(y)
print(L)

x1: Node(value=1.5, gradient=-1.5101626751925818)
x2: Node(value=2.5, gradient=1.1326220063944363)
w1: Node(value=-4, gradient=0.5663110031972182)
w2: Node(value=3, gradient=0.9438516719953637)
w1x1: Multiply(value=-6.0, gradient=0.37754066879814546)
w2x2: Multiply(value=7.5, gradient=0.37754066879814546)
b: Node(value=-1.0, gradient=0.37754066879814546)
z: Add(value=0.5, gradient=0.37754066879814546)
yhat: Sigmoid(value=0.6224593312018546, gradient=1.6065306597126334)
y: Node(value=1, gradient=0)
L: LogLoss(value=0.47407698418010663, gradient=1)


In [6]:
L.forward()

In [7]:
L.backward()