In [2]:
"""
Backpropagation demo (2-2-2 network) using:
- Inputs: x1=0.05, x2=0.10
- Targets: y1=0.01, y2=0.99
- Sigmoid activations (hidden + output)
- Loss: E = sum over outputs of 1/2 (y - yhat)^2 (half squared error per output, summed, not averaged)
- Learning rate alpha=0.5

This matches your slide setup (same initial weights/biases).
"""

from dataclasses import dataclass
from math import exp

def sigmoid(z: float) -> float:
    """Return the logistic sigmoid 1 / (1 + e^(-z)) of ``z``.

    The formula is evaluated in a form that never exponentiates a large
    positive number, so very negative inputs return a value near 0.0
    instead of raising ``OverflowError`` from ``exp``.
    """
    if z >= 0.0:
        # exp(-z) is in (0, 1] here, so it cannot overflow.
        return 1.0 / (1.0 + exp(-z))
    # For z < 0 use the algebraically equal form e^z / (1 + e^z);
    # exp(z) is in [0, 1) and merely underflows to 0.0 for huge |z|.
    ez = exp(z)
    return ez / (1.0 + ez)

def sigmoid_prime_from_a(a: float) -> float:
    """Return the sigmoid derivative expressed through its output ``a``.

    For a = sigmoid(z), the derivative with respect to z is a * (1 - a),
    so the pre-activation z is not needed.
    """
    complement = 1.0 - a
    return a * complement

@dataclass
class Params:
    """All trainable parameters of the 2-2-2 network.

    Weight names follow ``w<layer>_<src><dst>``: ``w1_21`` is the layer-1
    weight from input 2 (x2) to hidden unit 1 (h1); ``w2_12`` is the
    layer-2 weight from hidden unit 1 (h1) to output unit 2 (o2).
    Each layer has a single bias that is added to both of its units.
    """

    # input -> hidden
    w1_11: float  # x1 -> h1
    w1_21: float  # x2 -> h1
    w1_12: float  # x1 -> h2
    w1_22: float  # x2 -> h2
    b1: float     # hidden-layer bias, shared by h1 and h2

    # hidden -> output
    w2_11: float  # h1 -> o1
    w2_21: float  # h2 -> o1
    w2_12: float  # h1 -> o2
    w2_22: float  # h2 -> o2
    b2: float     # output-layer bias, shared by o1 and o2

def forward(x1: float, x2: float, p: Params):
    """Run one forward pass of the 2-2-2 sigmoid network.

    Each unit computes a weighted sum of its two inputs plus the layer's
    shared bias and passes it through ``sigmoid``.

    Returns a tuple ``(yhat1, yhat2, cache)`` where ``cache`` holds the
    inputs and every intermediate pre-activation / activation.
    """
    def weighted_sum(u, w_u, v, w_v, bias):
        # Two-input affine combination feeding a single unit.
        return u * w_u + v * w_v + bias

    # Hidden layer: pre-activations, then activations.
    pre_h1 = weighted_sum(x1, p.w1_11, x2, p.w1_21, p.b1)
    pre_h2 = weighted_sum(x1, p.w1_12, x2, p.w1_22, p.b1)
    act_h1, act_h2 = sigmoid(pre_h1), sigmoid(pre_h2)

    # Output layer: pre-activations, then predictions.
    pre_o1 = weighted_sum(act_h1, p.w2_11, act_h2, p.w2_21, p.b2)
    pre_o2 = weighted_sum(act_h1, p.w2_12, act_h2, p.w2_22, p.b2)
    pred1, pred2 = sigmoid(pre_o1), sigmoid(pre_o2)

    cache = dict(
        z_h1=pre_h1, z_h2=pre_h2, a_h1=act_h1, a_h2=act_h2,
        z_o1=pre_o1, z_o2=pre_o2, yhat1=pred1, yhat2=pred2,
        x1=x1, x2=x2,
    )
    return pred1, pred2, cache

def loss_mse_sum(y1: float, y2: float, yhat1: float, yhat2: float) -> float:
    """Return the total loss: half the squared error of each output, summed."""
    residual1 = y1 - yhat1
    residual2 = y2 - yhat2
    return 0.5 * residual1 ** 2 + 0.5 * residual2 ** 2

def backward(y1: float, y2: float, p: Params, cache):
    """
    Backpropagate the summed half-squared-error loss through the network.

    Weight naming: w<layer>_<i><j> connects source unit i to destination
    unit j (so w2_21 goes from h2 to o1).

    Output layer (sigmoid + 1/2 (y - yhat)^2):
      delta_o_j = dL/dz_o_j = (yhat_j - y_j) * yhat_j * (1 - yhat_j)
      dL/dw2_ij = a_h_i * delta_o_j
      dL/db2    = delta_o_1 + delta_o_2   (b2 feeds both output units)

    Hidden layer:
      delta_h_i = (w2_i1 * delta_o_1 + w2_i2 * delta_o_2) * a_h_i * (1 - a_h_i)
      dL/dw1_ki = x_k * delta_h_i
      dL/db1    = delta_h_1 + delta_h_2   (b1 feeds both hidden units)

    ``cache`` must provide the keys "x1", "x2", "a_h1", "a_h2", "yhat1"
    and "yhat2". The returned dict maps each parameter name to its
    gradient and additionally carries the four deltas under "delta_o1",
    "delta_o2", "delta_h1" and "delta_h2".
    """
    in1, in2 = cache["x1"], cache["x2"]
    hid1, hid2 = cache["a_h1"], cache["a_h2"]
    out1, out2 = cache["yhat1"], cache["yhat2"]

    # Error signal at each output pre-activation.
    delta_o1 = (out1 - y1) * sigmoid_prime_from_a(out1)
    delta_o2 = (out2 - y2) * sigmoid_prime_from_a(out2)

    # Error reaching each hidden activation through the (pre-update)
    # output weights, then pushed through the hidden sigmoid.
    back_h1 = (p.w2_11 * delta_o1) + (p.w2_12 * delta_o2)
    back_h2 = (p.w2_21 * delta_o1) + (p.w2_22 * delta_o2)
    delta_h1 = back_h1 * sigmoid_prime_from_a(hid1)
    delta_h2 = back_h2 * sigmoid_prime_from_a(hid2)

    return {
        # hidden -> output
        "w2_11": hid1 * delta_o1,
        "w2_21": hid2 * delta_o1,
        "w2_12": hid1 * delta_o2,
        "w2_22": hid2 * delta_o2,
        "b2": delta_o1 + delta_o2,
        # input -> hidden
        "w1_11": in1 * delta_h1,
        "w1_21": in2 * delta_h1,
        "w1_12": in1 * delta_h2,
        "w1_22": in2 * delta_h2,
        "b1": delta_h1 + delta_h2,
        # deltas, exposed for inspection / printing
        "delta_o1": delta_o1,
        "delta_o2": delta_o2,
        "delta_h1": delta_h1,
        "delta_h2": delta_h2,
    }

def update_params(p: Params, grads: dict, alpha: float) -> Params:
    """Return a new ``Params`` after one gradient-descent step.

    Every parameter becomes ``old - alpha * grads[name]``; ``p`` itself is
    left untouched. Extra keys in ``grads`` are ignored.
    """
    names = (
        "w1_11", "w1_21", "w1_12", "w1_22", "b1",
        "w2_11", "w2_21", "w2_12", "w2_22", "b2",
    )
    stepped = {name: getattr(p, name) - alpha * grads[name] for name in names}
    return Params(**stepped)

def run_one_step(x1, x2, y1, y2, alpha, p: Params, verbose=True):
    """Perform one forward/backward/update cycle on a single example.

    Returns ``(p_new, E, (yhat1, yhat2), grads)`` where ``E`` and the
    predictions are computed with the parameters *before* the update.
    When ``verbose`` is true, a report of the forward pass, the deltas and
    every parameter change is printed.
    """
    yhat1, yhat2, cache = forward(x1, x2, p)
    E = loss_mse_sum(y1, y2, yhat1, yhat2)
    grads = backward(y1, y2, p, cache)
    p_new = update_params(p, grads, alpha)
    result = (p_new, E, (yhat1, yhat2), grads)

    if not verbose:
        return result

    print("=== Forward ===")
    print(f"yhat1={yhat1:.12f}, yhat2={yhat2:.12f}")
    print(f"Total loss E={E:.12f}\n")

    print("=== Backward (key deltas) ===")
    print(f"delta_o1={grads['delta_o1']:.12f}, delta_o2={grads['delta_o2']:.12f}")
    print(f"delta_h1={grads['delta_h1']:.12f}, delta_h2={grads['delta_h2']:.12f}\n")

    print("=== Updated params ===")
    report_order = (
        "w1_11", "w1_12", "w1_21", "w1_22", "b1",
        "w2_11", "w2_12", "w2_21", "w2_22", "b2",
    )
    for k in report_order:
        old, new = getattr(p, k), getattr(p_new, k)
        print(f"{k}: {old:.12f} -> {new:.12f}")

    return result

if __name__ == "__main__":
    # Single training example: inputs and their target outputs.
    x1 = 0.05
    x2 = 0.10
    y1 = 0.01
    y2 = 0.99

    # Gradient-descent step size.
    alpha = 0.5

    # Initial weights and biases (as in the slides), listed in field order.
    p = Params(
        w1_11=0.15,
        w1_21=0.20,
        w1_12=0.25,
        w1_22=0.30,
        b1=0.35,
        w2_11=0.40,
        w2_21=0.45,
        w2_12=0.50,
        w2_22=0.55,
        b2=0.60,
    )

    run_one_step(x1=x1, x2=x2, y1=y1, y2=y2, alpha=alpha, p=p, verbose=True)


=== Forward ===
yhat1=0.751365069552, yhat2=0.772928465321
Total loss E=0.298371108760

=== Backward (key deltas) ===
delta_o1=0.138498561629, delta_o2=-0.038098236517
delta_h1=0.008771354689, delta_h2=0.009954254705

=== Updated params ===
w1_11: 0.150000000000 -> 0.149780716133
w1_12: 0.250000000000 -> 0.249751143632
w1_21: 0.200000000000 -> 0.199561432266
w1_22: 0.300000000000 -> 0.299502287265
b1: 0.350000000000 -> 0.340637195303
w2_11: 0.400000000000 -> 0.358916479718
w2_12: 0.500000000000 -> 0.511301270239
w2_21: 0.450000000000 -> 0.408666186076
w2_22: 0.550000000000 -> 0.561370121108
b2: 0.600000000000 -> 0.549799837444
