# Deep dive into backpropagation

In [1]:
import math
import matplotlib.pyplot as plt

# https://github.com/jeremiedecock/neural-network-figures.git
import nnfigs.core as nnfig
import numpy as np

import torch

$
\renewcommand{\cur}{i}
\renewcommand{\prev}{j}
\renewcommand{\prevcur}{{\cur\prev}}
\renewcommand{\next}{k}
\renewcommand{\curnext}{{\next\cur}}
\renewcommand{\ex}{\eta}
\renewcommand{\pot}{\sigma}
\renewcommand{\feature}{x}
\renewcommand{\weight}{{\boldsymbol{w}}}
\renewcommand{\wcur}{{\weight_{\cur\prev}}}
\renewcommand{\activthres}{\theta}
\renewcommand{\activfunc}{f}
\renewcommand{\errfunc}{E}
\renewcommand{\learnrate}{\epsilon}
\renewcommand{\learnit}{n}
\renewcommand{\sigout}{{\boldsymbol{y}}}
\renewcommand{\sigoutdes}{{\boldsymbol{y^*}}}
\renewcommand{\weights}{\boldsymbol{W}}
\renewcommand{\errsig}{\Delta}
$

Notations:

- $\cur$: couche courante
- $\prev$: couche immédiatement en amont de la couche courante (i.e. vers la couche d'entrée du réseau)
- $\next$: couche immédiatement en aval de la couche courante (i.e. vers la couche de sortie du réseau)
- $\ex$: exemple (*sample* ou *feature*) courant (i.e. le vecteur des entrées courantes du réseau)
- $\pot_\cur$: *Potentiel d'activation* du neurone $i$ pour l'exemple courant
- $\wcur$: Poids de la connexion entre le neurone $j$ et le neurone $i$
- $\activthres_\cur$: *Seuil d'activation* du neurone $i$
- $\activfunc_\cur$: *Fonction d'activation* du neurone $i$
- $\errfunc$: *Fonction objectif* ou *fonction d'erreur*
- $\learnrate$: *Pas d'apprentissage* ou *Taux d'apprentissage*
- $\learnit$: Numéro d'itération (ou cycle ou époque) du processus d'apprentissage
- $\sigout_\cur$: Signal de sortie du neurone $i$ pour l'exemple courant
- $\sigoutdes_\cur$: Sortie désirée (*étiquette*) du neurone $i$ pour l'exemple courant
- $\weights$: Matrice des poids du réseau (en réalité il y a une matrice de taille potentiellement différente par couche)
- $\errsig_i$: *Signal d'erreur* du neurone $i$ pour l'exemple courant

In [2]:
# LaTeX fragments used to label the figures of this notebook.

# --- Layer indices -----------------------------------------------------------
STR_CUR = "i"    # current layer
STR_PREV = "j"   # layer immediately upstream of the current one (towards the network input)
STR_NEXT = "k"   # layer immediately downstream of the current one (towards the network output)

# --- Current example ---------------------------------------------------------
STR_EX = "\\eta"  # current example (sample), i.e. the vector of current network inputs

# --- Signals -----------------------------------------------------------------
STR_SIGIN = "x"                                   # input signal
STR_SIGOUT = "y"                                  # output signal of a neuron
STR_SIGOUT_CUR = STR_SIGOUT + "_" + STR_CUR       # "y_i": output of neuron i
STR_SIGOUT_PREV = STR_SIGOUT + "_" + STR_PREV     # "y_j": output of neuron j
STR_SIGOUT_DES = "d"                              # desired output (label)
STR_SIGOUT_DES_CUR = STR_SIGOUT_DES + "_" + STR_CUR  # "d_i": desired output of neuron i

# --- Activation --------------------------------------------------------------
STR_POT = "\\sigma"        # activation potential of a neuron
STR_POT_CUR = "x_i"        # activation potential of neuron i (written x_i here)
STR_ACTIVTHRES = "\\theta"  # activation threshold of neuron i
STR_ACTIVFUNC = "f"        # activation function of neuron i

# --- Weights -----------------------------------------------------------------
STR_WEIGHT = "w"
STR_WEIGHT_CUR = STR_WEIGHT + "_{" + STR_CUR + STR_PREV + "}"  # "w_{ij}": weight of the connection j -> i
STR_WEIGHTS = "W"          # weight matrix (in practice one matrix per layer, possibly of different sizes)

# --- Learning ----------------------------------------------------------------
STR_ERRFUNC = "E"           # objective / error function
STR_ERRSIG = "\\Delta"      # error signal of neuron i
STR_LEARNRATE = "\\epsilon"  # learning rate (step size)
STR_LEARNIT = "n"           # iteration (cycle / epoch) number of the learning process

def tex(tex_str):
    """Wrap a LaTeX fragment in ``$`` delimiters.

    Parameters
    ----------
    tex_str : str
        LaTeX source without delimiters, e.g. ``"w_1"``.

    Returns
    -------
    str
        The same fragment surrounded by ``$`` on both sides, e.g. ``"$w_1$"``.
    """
    delimiter = "$"
    return delimiter + tex_str + delimiter

## Make the model in PyTorch

**Remark**: The model is a simple feedforward neural network with two hidden layers. To simplify computations, we don't use any bias.

In [None]:
# Seed PyTorch's global RNG so that the randomly initialised weights (and the
# random tensors drawn right after this cell) are the same on every
# "Restart Kernel -> Run All"; otherwise every number below changes per run.
torch.manual_seed(0)

# 2 inputs -> 2 tanh units -> 2 tanh units -> 1 linear output.
# bias=False everywhere, so each layer is a pure matrix product.
model = torch.nn.Sequential(
    torch.nn.Linear(2, 2, bias=False),
    torch.nn.Tanh(),
    torch.nn.Linear(2, 2, bias=False),
    torch.nn.Tanh(),
    torch.nn.Linear(2, 1, bias=False)
)
model.state_dict()  # show the weight matrices (the model has no bias terms)

In [None]:
# Random input example: a vector of 2 features drawn from a standard normal distribution.
x = torch.randn((2,))
x

In [None]:
# Random target value (a single scalar stored in a 1-element tensor).
y_true = torch.randn((1,))
y_true

In [6]:
# Mean squared error; "mean" is the default reduction, spelled out for clarity.
loss_fn = torch.nn.MSELoss(reduction="mean")

## Compute the forward pass

In [None]:
fig, ax = nnfig.init_figure(size_x=8, size_y=4)

HSPACE = 6   # horizontal gap between two consecutive layers
VSPACE = 4   # vertical offset of the top / bottom neuron rows

# Synapses ####################################

# One entry per labelled synapse, listed in drawing order:
# (start point, end point, weight subscript, label placement options)
labelled_synapses = [
    # Layer 1-2
    ((0,  VSPACE), (HSPACE,  VSPACE), "_1", dict(label_position=0.4)),
    ((0, -VSPACE), (HSPACE,  VSPACE), "_3", dict(label_position=0.25, label_offset_y=-0.8)),
    ((0,  VSPACE), (HSPACE, -VSPACE), "_2", dict(label_position=0.25)),
    ((0, -VSPACE), (HSPACE, -VSPACE), "_4", dict(label_position=0.4, label_offset_y=-0.8)),
    # Layer 2-3
    ((HSPACE,  VSPACE), (2*HSPACE,  VSPACE), "_5", dict(label_position=0.4)),
    ((HSPACE, -VSPACE), (2*HSPACE,  VSPACE), "_7", dict(label_position=0.25, label_offset_y=-0.8)),
    ((HSPACE,  VSPACE), (2*HSPACE, -VSPACE), "_6", dict(label_position=0.25)),
    ((HSPACE, -VSPACE), (2*HSPACE, -VSPACE), "_8", dict(label_position=0.4, label_offset_y=-0.8)),
    # Layer 3-4
    ((2*HSPACE,  VSPACE), (3*HSPACE, 0), "_9", dict(label_position=0.4)),
    ((2*HSPACE, -VSPACE), (3*HSPACE, 0), "_{10}", dict(label_position=0.4, label_offset_y=-0.8)),
]
for syn_start, syn_end, syn_subscript, syn_opts in labelled_synapses:
    nnfig.draw_synapse(ax, syn_start, syn_end, label=tex(STR_WEIGHT + syn_subscript), **syn_opts)

# Unlabelled segment leaving the output neuron
nnfig.draw_synapse(ax, (3*HSPACE, 0), (3*HSPACE + 2, 0))

# Neurons #####################################

# Layer 1 (input): small empty circles
for input_row in (VSPACE, -VSPACE):
    nnfig.draw_neuron(ax, (0, input_row), 0.5, empty=True)

# Layers 2 and 3 (hidden): sum aggregation + sigmoid-shaped transfer icon
for hidden_layer in (1, 2):
    for hidden_row in (VSPACE, -VSPACE):
        nnfig.draw_neuron(ax, (hidden_layer * HSPACE, hidden_row), 1, ag_func="sum", tr_func="sigmoid")

# Layer 4 (output): sum aggregation + identity transfer
nnfig.draw_neuron(ax, (3*HSPACE, 0), 1, ag_func="sum", tr_func="identity")

# Text ########################################

# (x position, y position, LaTeX fragment), in drawing order
text_labels = [
    # Layer 1 (input)
    (-1.7, VSPACE,      STR_SIGIN + "_1"),
    (-1.7, -VSPACE-0.2, STR_SIGIN + "_2"),
    # Layer 2
    (HSPACE+0.4,  VSPACE+1.5, STR_SIGOUT + "_1"),
    (HSPACE+0.4, -VSPACE-1.8, STR_SIGOUT + "_2"),
    # Layer 3
    (2*HSPACE+0.4,  VSPACE+1.5, STR_SIGOUT + "_3"),
    (2*HSPACE+0.4, -VSPACE-1.8, STR_SIGOUT + "_4"),
    # Network output
    (3*HSPACE+2.5, -0.3, STR_SIGOUT),
]
for text_x, text_y, text_tex in text_labels:
    plt.text(x=text_x, y=text_y, s=tex(text_tex), fontsize=12)

plt.show()

$
\renewcommand{\yone}{\underbrace{\activfunc \left(\weight_1 \feature_1 + \weight_3 \feature_2 \right)}_{\sigout_1}}
\renewcommand{\ytwo}{\underbrace{\activfunc \left(\weight_2 \feature_1 + \weight_4 \feature_2 \right)}_{\sigout_2}}
\renewcommand{\ythree}{\underbrace{\activfunc \left(\weight_5 \yone + \weight_7 \ytwo \right)}_{\sigout_3}}
\renewcommand{\yfour}{\underbrace{\activfunc \left(\weight_6 \yone + \weight_8 \ytwo \right)}_{\sigout_4}}
$

$$
\sigout =
\activfunc \left(
\weight_9 ~ \ythree
+
\weight_{10} ~ \yfour
\right)
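$$

**Remark**: in the PyTorch model defined above, the last layer is a plain `torch.nn.Linear` with no activation after it, so for the output neuron $\activfunc$ is the identity and $\sigout = \weight_9 ~ \sigout_3 + \weight_{10} ~ \sigout_4$.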

In [8]:
# Pull every input and weight out of PyTorch as a plain Python float so the
# forward pass can be redone by hand.
x1, x2 = x.tolist()

# Each weight matrix is stored as (out_features, in_features): row r holds the
# weights entering neuron r of the layer. The figure numbers the weights
# column by column, hence the interleaved names.
(w1, w3), (w2, w4) = model[0].weight.tolist()
(w5, w7), (w6, w8) = model[2].weight.tolist()
((w9, w10),) = model[4].weight.tolist()

# Scalar activation function (works on Python floats, unlike the torch version).
f = math.tanh

def df(x):
    r"""Return the derivative of tanh evaluated at ``x``.

    Uses the identity $\tanh' = \frac{1}{\cosh^{2}} = 1 - \tanh^{2}$.

    The docstring is a raw string so that the LaTeX backslashes are kept
    verbatim instead of being read as escape sequences (``\t``, ``\f``, ...).

    Parameters
    ----------
    x : float
        Point at which the derivative is evaluated (any value accepted by
        ``math.tanh``).

    Returns
    -------
    float
        ``1 - tanh(x) ** 2``.
    """
    y = 1. - math.tanh(x) ** 2
    return y

In [None]:
# Hand-written forward pass, one layer at a time (sigma = potential, y = output).

# Hidden layer 1 -- same values as torch.tanh(model[0].weight @ x)
sigma1, sigma2 = w1 * x1 + w3 * x2, w2 * x1 + w4 * x2
y1, y2 = f(sigma1), f(sigma2)

# Hidden layer 2 -- same values as torch.tanh(model[2].weight @ <layer-1 outputs>)
sigma3, sigma4 = w5 * y1 + w7 * y2, w6 * y1 + w8 * y2
y3, y4 = f(sigma3), f(sigma4)

# Output neuron: weighted sum only, no activation is applied to it
sigma = w9 * y3 + w10 * y4
y_pred = sigma

y_pred

In [10]:
# # Check: each hand-computed value next to its PyTorch counterpart.
# # torch.nn.Linear computes `x @ weight.T`, hence the transposes below.
# print(sigma1, (x @ model[0].weight.T)[0].item())
# print(y1, torch.nn.functional.tanh(x @ model[0].weight.T)[0].item())
# print(sigma2, (x @ model[0].weight.T)[1].item())
# print(y2, torch.nn.functional.tanh(x @ model[0].weight.T)[1].item())

# print(sigma3, (torch.nn.functional.tanh(x @ model[0].weight.T) @ model[2].weight.T)[0].item())
# print(y3, torch.nn.functional.tanh(torch.nn.functional.tanh(x @ model[0].weight.T) @ model[2].weight.T)[0].item())
# print(sigma4, (torch.nn.functional.tanh(x @ model[0].weight.T) @ model[2].weight.T)[1].item())
# print(y4, torch.nn.functional.tanh(torch.nn.functional.tanh(x @ model[0].weight.T) @ model[2].weight.T)[1].item())

# print(sigma, (torch.nn.functional.tanh(torch.nn.functional.tanh(x @ model[0].weight.T) @ model[2].weight.T) @ model[4].weight.T)[0].item())
# print(y_pred, torch.nn.functional.tanh(torch.nn.functional.tanh(x @ model[0].weight.T) @ model[2].weight.T) @ model[4].weight.T)

In [None]:
# Squared error between the hand-computed prediction and the target; with a
# single output value this is also the mean squared error.
manual_squared_error = (y_pred - y_true) ** 2
manual_squared_error

### Let's check with PyTorch

In [12]:
# Variant 1 (kept for reference, disabled): row-vector form WITHOUT transposing
# the two hidden weight matrices. torch.nn.Linear computes `x @ weight.T`, so
# this expression does not reproduce `model(x)` in general.
# torch.nn.functional.tanh(
#     torch.nn.functional.tanh(
#         x @ model[0].weight
#     ) @ model[2].weight
# ) @ model[4].weight.T

In [13]:
# Variant 2 (disabled): row-vector form with every weight matrix transposed,
# i.e. the `x @ weight.T` product that torch.nn.Linear performs (no bias here).
# torch.nn.functional.tanh(
#     torch.nn.functional.tanh(
#         x @ model[0].weight.T
#     ) @ model[2].weight.T
# ) @ model[4].weight.T

In [14]:
# Variant 3 (disabled): column-vector form `weight @ x`; for a 1-D `x` this is
# the same computation as variant 2.
# model[4].weight @ torch.nn.functional.tanh(
#     model[2].weight @
#     torch.nn.functional.tanh(
#         model[0].weight @ x
#     )
# )

In [None]:
# Same forward pass, now performed by PyTorch itself.
# NOTE: this rebinds `y_pred`, which so far held the hand-computed Python
# float, to the tensor returned by the model.
y_pred = model(x)
y_pred

In [None]:
# Loss between the PyTorch prediction and the target, using the MSELoss
# instance created earlier; `backward()` is called on this value further down.
error = loss_fn(y_pred, y_true)
error

## Compute the backward pass

In [None]:
# Overview figure for the backward pass: one path through the network is drawn
# normally while the remaining synapses/neurons are greyed out. This figure
# uses its own local weight numbering (w_1 ... w_5 along the highlighted path).
fig, ax = nnfig.init_figure(size_x=8, size_y=4)

HSPACE = 6   # horizontal gap between two consecutive layers
VSPACE = 4   # vertical offset of the top / bottom neuron rows

# Synapse #####################################

# Layer 1-2
nnfig.draw_synapse(ax, (0,  VSPACE), (HSPACE,  VSPACE), label=tex(STR_WEIGHT + "_1"), label_position=0.4)
nnfig.draw_synapse(ax, (0, -VSPACE), (HSPACE,  VSPACE), color="lightgray")

nnfig.draw_synapse(ax, (0,  VSPACE), (HSPACE, -VSPACE), color="lightgray")
nnfig.draw_synapse(ax, (0, -VSPACE), (HSPACE, -VSPACE), color="lightgray")

# Layer 2-3
nnfig.draw_synapse(ax, (HSPACE,  VSPACE), (2*HSPACE,  VSPACE), label=tex(STR_WEIGHT + "_2"), label_position=0.4)
nnfig.draw_synapse(ax, (HSPACE, -VSPACE), (2*HSPACE,  VSPACE), color="lightgray")

nnfig.draw_synapse(ax, (HSPACE,  VSPACE), (2*HSPACE, -VSPACE), label=tex(STR_WEIGHT + "_3"), label_position=0.4)
nnfig.draw_synapse(ax, (HSPACE, -VSPACE), (2*HSPACE, -VSPACE), color="lightgray")

# Layer 3-4
nnfig.draw_synapse(ax, (2*HSPACE,  VSPACE), (3*HSPACE, 0), label=tex(STR_WEIGHT + "_4"), label_position=0.4)
nnfig.draw_synapse(ax, (2*HSPACE, -VSPACE), (3*HSPACE, 0), label=tex(STR_WEIGHT + "_5"), label_position=0.4, label_offset_y=-0.8)

# Neuron ######################################

# Layer 1 (input)
nnfig.draw_neuron(ax, (0,  VSPACE), 0.5, empty=True)
nnfig.draw_neuron(ax, (0, -VSPACE), 0.5, empty=True, line_color="lightgray")

# Layer 2
nnfig.draw_neuron(ax, (HSPACE,  VSPACE), 1, ag_func="sum", tr_func="sigmoid")
nnfig.draw_neuron(ax, (HSPACE, -VSPACE), 1, ag_func="sum", tr_func="sigmoid", line_color="lightgray")

# Layer 3
nnfig.draw_neuron(ax, (2*HSPACE,  VSPACE), 1, ag_func="sum", tr_func="sigmoid")
nnfig.draw_neuron(ax, (2*HSPACE, -VSPACE), 1, ag_func="sum", tr_func="sigmoid")

# Layer 4
# The model's last layer is a bare Linear (no activation), so the output neuron
# is drawn with the identity transfer function, as in the other figures.
nnfig.draw_neuron(ax, (3*HSPACE, 0), 1, ag_func="sum", tr_func="identity")

# Text ########################################

# Layer 1 (input)
plt.text(x=0.5, y=VSPACE+1, s=tex(STR_SIGOUT + "_i"), fontsize=12)

# Layer 2
plt.text(x=HSPACE-1.25, y=VSPACE+1.5, s=tex(STR_POT + "_1"), fontsize=12)
plt.text(x=HSPACE+0.4,  y=VSPACE+1.5, s=tex(STR_SIGOUT + "_1"), fontsize=12)

# Layer 3
plt.text(x=2*HSPACE-1.25, y=VSPACE+1.5, s=tex(STR_POT + "_2"), fontsize=12)
plt.text(x=2*HSPACE+0.4,  y=VSPACE+1.5, s=tex(STR_SIGOUT + "_2"), fontsize=12)

plt.text(x=2*HSPACE-1.25, y=-VSPACE-1.8, s=tex(STR_POT + "_3"), fontsize=12)
plt.text(x=2*HSPACE+0.4,  y=-VSPACE-1.8, s=tex(STR_SIGOUT + "_3"), fontsize=12)

# Layer 4
plt.text(x=3*HSPACE-1.25, y=1.5, s=tex(STR_POT + "_o"), fontsize=12)
plt.text(x=3*HSPACE+0.4,  y=1.5, s=tex(STR_SIGOUT + "_o"), fontsize=12)

# Error function shown next to the output. The notebook uses MSELoss on a
# single output, i.e. E = (y_o - d_o)^2 with no 1/2 factor, which is what makes
# dE/dy = 2 (y - d) in the derivations that follow.
plt.text(x=3*HSPACE+2,  y=-0.3,
         s=tex(STR_ERRFUNC + " = (" + STR_SIGOUT + "_o - " + STR_SIGOUT_DES + "_o)^2"),
         fontsize=12)

plt.show()

In [18]:
# Back-propagate from the loss; the resulting gradients are read from
# `param.grad` in the next cell.
error.backward()

In [None]:
# Print the gradient stored on every parameter that requires one.
trainable_params = (
    (param_name, param_value)
    for param_name, param_value in model.named_parameters()
    if param_value.requires_grad
)
for name, param in trainable_params:
    print(f"Gradient for {name}: {param.grad}")

### Forward computation of $\frac{\partial \errfunc}{\partial \weight_{10}}$

In [None]:
# Figure for dE/dw_10: only the synapse carrying w_10, the output neuron and the
# quantities used in the chain rule are drawn normally; the rest is greyed out.
fig, ax = nnfig.init_figure(size_x=8, size_y=4)

HSPACE = 6   # horizontal gap between two consecutive layers
VSPACE = 4   # vertical offset of the top / bottom neuron rows

# Synapses ####################################

# Layers 1-2 and 2-3 (w_1 ... w_8): all greyed out and unlabelled.
# Order per layer: both synapses entering the top neuron, then both entering
# the bottom one; within each pair the top source comes first.
for src_layer in (0, 1):
    for dst_row in (VSPACE, -VSPACE):
        for src_row in (VSPACE, -VSPACE):
            nnfig.draw_synapse(
                ax,
                (src_layer * HSPACE, src_row),
                ((src_layer + 1) * HSPACE, dst_row),
                color="lightgray",
            )

# Layer 3-4: w_9 greyed out, w_10 drawn normally with its label
nnfig.draw_synapse(ax, (2*HSPACE,  VSPACE), (3*HSPACE, 0), color="lightgray")
nnfig.draw_synapse(
    ax, (2*HSPACE, -VSPACE), (3*HSPACE, 0),
    label=tex(STR_WEIGHT + "_{10}"), label_position=0.4, label_offset_y=-0.8,
)

# Unlabelled segment leaving the output neuron
nnfig.draw_synapse(ax, (3*HSPACE, 0), (3*HSPACE + 2, 0))

# Neurons #####################################

# Layer 1 (input), greyed out
for input_row in (VSPACE, -VSPACE):
    nnfig.draw_neuron(ax, (0, input_row), 0.5, empty=True, line_color="lightgray")

# Layers 2 and 3 (hidden), greyed out
for hidden_layer in (1, 2):
    for hidden_row in (VSPACE, -VSPACE):
        nnfig.draw_neuron(
            ax, (hidden_layer * HSPACE, hidden_row), 1,
            ag_func="sum", tr_func="sigmoid", line_color="lightgray",
        )

# Layer 4 (output)
nnfig.draw_neuron(ax, (3*HSPACE, 0), 1, ag_func="sum", tr_func="identity")

# Text ########################################

# (x position, y position, LaTeX fragment, extra text options), in drawing order
text_labels = [
    (2*HSPACE+1.,  -VSPACE-1., STR_SIGOUT + "_4", {}),          # output of the neuron feeding w_10
    (3*HSPACE-0.3, -1.8,       STR_POT,           {"color": "green"}),  # potential of the output neuron
    (3*HSPACE+2.5, -0.3,       STR_SIGOUT,        {"color": "red"}),    # network output
]
for text_x, text_y, text_tex, text_opts in text_labels:
    plt.text(x=text_x, y=text_y, s=tex(text_tex), fontsize=12, **text_opts)

plt.show()

#### General case

Using the chain rule:

$$
\frac{\partial \errfunc}{\partial \weight_{10}} =
\frac{\partial \errfunc}{\partial \color{red}{\sigout}}
\frac{\partial \color{red}{\sigout}}{\partial \color{green}{\pot}} ~
\frac{\partial \color{green}{\pot}}{\partial \weight_{10}} ~
$$

knowing that:

$$
\begin{align}
\frac{\partial \errfunc}{\partial \color{red}{\sigout}}              &= 2 (\sigout - \sigoutdes) \\
\frac{\partial \color{red}{\sigout}}{\partial \color{green}{\pot}}   &= f'(\pot) \\
\frac{\partial \color{green}{\pot}}{\partial \weight_{10}}           &= \sigout_4 \\
\end{align}
$$

we can write:

$$
\frac{\partial \errfunc}{\partial \weight_{10}} = 2(\sigout - \sigoutdes) \cdot f'(\pot) \cdot \sigout_4
$$

#### Naive detailed computation

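Let's compute this gradient by hand, factor by factor (the naive, detailed way). The output neuron of our model is linear ($\activfunc$ is the identity there), so $f'(\pot) = 1$ and that factor drops out of the product.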

In [None]:
# dE/dw10 = dE/dy * dy/dsigma * dsigma/dw10
#   dE/dy       = 2 (y - y_true)
#   dy/dsigma   = 1   (the output neuron has no activation)
#   dsigma/dw10 = y4
dE_dy = 2 * (y_pred - y_true)
grad_E_w10 = dE_dy * y4
grad_E_w10

#### Algebraic computation

Let's rewrite the forward computation in a less naive way (using linear algebra).

...

In [23]:
# NOTE(review): the disabled snippet below looks like a leftover from a
# recurrent-network notebook: it reads `model.weight_ih_l0` / `model.weight_hh_l0`
# and an `h0` that the visible part of this notebook never defines, while
# `model` here is a torch.nn.Sequential. TODO: replace it with the
# linear-algebra form of dE/dw10 announced above.
# f = torch.nn.functional.tanh

# h1 = f(x[0] @ model.weight_ih_l0 + h0 @ model.weight_hh_l0)   # hidden state at time step 1
# h2 = f(x[1] @ model.weight_ih_l0 + h1 @ model.weight_hh_l0)   # hidden state at time step 2
# h3 = f(x[2] @ model.weight_ih_l0 + h2 @ model.weight_hh_l0)   # hidden state at time step 3

# print(f"Output for time step 1:\nh1 = \n{ h1 }\n\n")
# print(f"Output for time step 2:\nh2 = \n{ h2 }\n\n")
# print(f"Output for time step 3:\nh3 = \n{ h3 }\n\n")

### Forward computation of $\frac{\partial \errfunc}{\partial \weight_{9}}$

In [None]:
# Figure for dE/dw_9: only the synapse carrying w_9, the output neuron and the
# quantities used in the chain rule are drawn normally; the rest is greyed out.
fig, ax = nnfig.init_figure(size_x=8, size_y=4)

HSPACE = 6   # horizontal gap between two consecutive layers
VSPACE = 4   # vertical offset of the top / bottom neuron rows

# Synapses ####################################

# Layers 1-2 and 2-3 (w_1 ... w_8): all greyed out and unlabelled.
# Order per layer: both synapses entering the top neuron, then both entering
# the bottom one; within each pair the top source comes first.
for src_layer in (0, 1):
    for dst_row in (VSPACE, -VSPACE):
        for src_row in (VSPACE, -VSPACE):
            nnfig.draw_synapse(
                ax,
                (src_layer * HSPACE, src_row),
                ((src_layer + 1) * HSPACE, dst_row),
                color="lightgray",
            )

# Layer 3-4: w_9 drawn normally with its label, w_10 greyed out
nnfig.draw_synapse(
    ax, (2*HSPACE,  VSPACE), (3*HSPACE, 0),
    label=tex(STR_WEIGHT + "_9"), label_position=0.4,
)
nnfig.draw_synapse(ax, (2*HSPACE, -VSPACE), (3*HSPACE, 0), color="lightgray")

# Unlabelled segment leaving the output neuron
nnfig.draw_synapse(ax, (3*HSPACE, 0), (3*HSPACE + 2, 0))

# Neurons #####################################

# Layer 1 (input), greyed out
for input_row in (VSPACE, -VSPACE):
    nnfig.draw_neuron(ax, (0, input_row), 0.5, empty=True, line_color="lightgray")

# Layers 2 and 3 (hidden), greyed out
for hidden_layer in (1, 2):
    for hidden_row in (VSPACE, -VSPACE):
        nnfig.draw_neuron(
            ax, (hidden_layer * HSPACE, hidden_row), 1,
            ag_func="sum", tr_func="sigmoid", line_color="lightgray",
        )

# Layer 4 (output)
nnfig.draw_neuron(ax, (3*HSPACE, 0), 1, ag_func="sum", tr_func="identity")

# Text ########################################

# (x position, y position, LaTeX fragment, extra text options), in drawing order
text_labels = [
    (2*HSPACE+0.4, VSPACE+1.5, STR_SIGOUT + "_3", {}),          # output of the neuron feeding w_9
    (3*HSPACE-0.3, -1.8,       STR_POT,           {"color": "green"}),  # potential of the output neuron
    (3*HSPACE+2.5, -0.3,       STR_SIGOUT,        {"color": "red"}),    # network output
]
for text_x, text_y, text_tex, text_opts in text_labels:
    plt.text(x=text_x, y=text_y, s=tex(text_tex), fontsize=12, **text_opts)

plt.show()

#### General case

Using the chain rule:

$$
\frac{\partial \errfunc}{\partial \weight_{9}} =
\frac{\partial \errfunc}{\partial \color{red}{\sigout}}
\frac{\partial \color{red}{\sigout}}{\partial \color{green}{\pot}} ~
\frac{\partial \color{green}{\pot}}{\partial \weight_{9}} ~
$$

knowing that:

$$
\begin{align}
\frac{\partial \errfunc}{\partial \color{red}{\sigout}}              &= 2 (\sigout - \sigoutdes) \\
\frac{\partial \color{red}{\sigout}}{\partial \color{green}{\pot}}   &= f'(\pot) \\
\frac{\partial \color{green}{\pot}}{\partial \weight_{9}}           &= \sigout_3 \\
\end{align}
$$

we can write:

$$
\frac{\partial \errfunc}{\partial \weight_{9}} = 2(\sigout - \sigoutdes) \cdot f'(\pot) \cdot \sigout_3
$$

#### Naive detailed computation

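Let's compute this gradient by hand, factor by factor (the naive, detailed way). The output neuron of our model is linear ($\activfunc$ is the identity there), so $f'(\pot) = 1$ and that factor drops out of the product.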

In [None]:
# dE/dw9 = dE/dy * dy/dsigma * dsigma/dw9
#   dE/dy      = 2 (y - y_true)
#   dy/dsigma  = 1   (the output neuron has no activation)
#   dsigma/dw9 = y3
dE_dy = 2 * (y_pred - y_true)
grad_E_w9 = dE_dy * y3
grad_E_w9

#### Algebraic computation

Let's rewrite the forward computation in a less naive way (using linear algebra).