# Shallow Neural Network

An example of a simple 2-layer neural network

<img src="imgs/shallow-nn.png" style="width:500px;float:right;" />

## Parameters:

$
\begin{align}
\text{number of layers : }   & L     & = 2 \\
\text{input features : }     & n_0   & = 2 \\
\text{neurons in layer 1 : } & n_1   & = 3 \\
\text{output : }             & n_2   & = 1 \\
\text{training samples : }   & m     & = 4 \\
\end{align}
$

## Forward Propagation : Layer 1

$
X = \begin{bmatrix}
x_1^{(1)} & x_1^{(2)} & x_1^{(3)} & x_1^{(4)} \\
x_2^{(1)} & x_2^{(2)} & x_2^{(3)} & x_2^{(4)}
\end{bmatrix}_{2 \times 4}
\\ \ 
\\
W^{[1]} = \begin{bmatrix}
w_{11}^{[1]} & w_{12}^{[1]} & w_{13}^{[1]} \\
w_{21}^{[1]} & w_{22}^{[1]} & w_{23}^{[1]}
\end{bmatrix}_{2 \times 3}
\\ \ 
\\
b^{[1]} = \begin{bmatrix} b^{[1]}_1 \\ b^{[1]}_2 \\ b^{[1]}_3 \end{bmatrix}
$

$
Z^{[1]} = W^{[1]\ T} X + b^{[1]} = 
\begin{bmatrix}
w_{11}^{[1]} & w_{21}^{[1]} \\
w_{12}^{[1]} & w_{22}^{[1]} \\
w_{13}^{[1]} & w_{23}^{[1]}
\end{bmatrix} \ \ 
\begin{bmatrix}
x_1^{(1)} & x_1^{(2)} & x_1^{(3)} & x_1^{(4)} \\
x_2^{(1)} & x_2^{(2)} & x_2^{(3)} & x_2^{(4)}
\end{bmatrix} \ + \ 
\begin{bmatrix} b^{[1]}_1 \\ b^{[1]}_2 \\ b^{[1]}_3 \end{bmatrix}
$

$
Z^{[1]} =
\begin{bmatrix}
w_{11}^{[1]}x_1^{(1)} + w_{21}^{[1]}x_2^{(1)} + b^{[1]}_1 &
w_{11}^{[1]}x_1^{(2)} + w_{21}^{[1]}x_2^{(2)} + b^{[1]}_1 &
\dots^{(3)} & \dots^{(4)} \\
w_{12}^{[1]}x_1^{(1)} + w_{22}^{[1]}x_2^{(1)} + b^{[1]}_2 &
w_{12}^{[1]}x_1^{(2)} + w_{22}^{[1]}x_2^{(2)} + b^{[1]}_2 &
\dots^{(3)} & \dots^{(4)} \\
w_{13}^{[1]}x_1^{(1)} + w_{23}^{[1]}x_2^{(1)} + b^{[1]}_3 &
w_{13}^{[1]}x_1^{(2)} + w_{23}^{[1]}x_2^{(2)} + b^{[1]}_3 &
\dots^{(3)} & \dots^{(4)} \\
\end{bmatrix}_{3 \times 4}
$

$
A^{[1]} = g^{[1]}\big( Z^{[1]} \big) = 
\begin{bmatrix}
a^{[1](1)}_1 & a^{[1](2)}_1 & a^{[1](3)}_1 & a^{[1](4)}_1 \\
a^{[1](1)}_2 & a^{[1](2)}_2 & a^{[1](3)}_2 & a^{[1](4)}_2 \\
a^{[1](1)}_3 & a^{[1](2)}_3 & a^{[1](3)}_3 & a^{[1](4)}_3 
\end{bmatrix}_{3 \times 4}, \ \ \ 
g^{[1]} = \tanh
$

## Forward Propagation : Layer 2

$
W^{[2]} = \begin{bmatrix}
w_{11}^{[2]} \\ w_{21}^{[2]} \\ w_{31}^{[2]}
\end{bmatrix}_{3 \times 1}
\\ \ 
\\
b^{[2]} \in \mathbb{R}
$

$
Z^{[2]} = W^{[2]\ T} A^{[1]} + b^{[2]} =
\begin{bmatrix}
w_{11}^{[2]} & w_{21}^{[2]} & w_{31}^{[2]}
\end{bmatrix} \ \ 
\begin{bmatrix}
a^{[1](1)}_1 & a^{[1](2)}_1 & a^{[1](3)}_1 & a^{[1](4)}_1 \\
a^{[1](1)}_2 & a^{[1](2)}_2 & a^{[1](3)}_2 & a^{[1](4)}_2 \\
a^{[1](1)}_3 & a^{[1](2)}_3 & a^{[1](3)}_3 & a^{[1](4)}_3 
\end{bmatrix} \ + \ b^{[2]}
$

$
Z^{[2]} = \begin{bmatrix}
w^{[2]}_{11} a^{[1](1)}_1 + w^{[2]}_{21} a^{[1](1)}_2 + w^{[2]}_{31} a^{[1](1)}_3 + b^{[2]} &
w^{[2]}_{11} a^{[1](2)}_1 + w^{[2]}_{21} a^{[1](2)}_2 + w^{[2]}_{31} a^{[1](2)}_3 + b^{[2]} &
\dots^{(3)} & \dots^{(4)}
\end{bmatrix}_{1 \times 4}
$

$
A^{[2]} = g^{[2]} \big( Z^{[2]} \big) = 
\begin{bmatrix} a^{[2](1)} & a^{[2](2)} & a^{[2](3)} & a^{[2](4)} \end{bmatrix},
\ \ g^{[2]} = \sigma
\\
\hat{y}^{(i)} = a^{[2](i)}
$

## Loss &amp; Cost Function

$
\mathcal{L} \big( \hat{y}, y \big) = 
- \Big(
y \log\hat{y} + \big( 1 - y \big) \log \big( 1 - \hat{y} \big)
\Big)
$

$
\mathcal{J} \big( w^{[1]}, b^{[1]}, w^{[2]}, b^{[2]} \big) = 
\frac{1}{m} \sum_{i=1}^m \mathcal{L} \big( \hat{y}^{(i)}, y^{(i)} \big)
$

## Backward Propagation : Layer 2

$
\begin{align}
\text{"da2"} & = \frac{d}{d a^{[2]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \frac{d}{d a^{[2]}} - \Big(
y \log a^{[2]} + \big( 1 - y \big) \log \big( 1 - a^{[2]} \big)
\Big) \\
& = \frac{-y}{a^{[2]}} + \frac{1-y}{1-a^{[2]}}
\end{align}
$

$
\begin{align}
\text{"dz2"} & = \frac{d}{d z^{[2]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \frac{d}{d z^{[2]}} a^{[2]} \ \ \frac{d}{d a^{[2]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \Big( \frac{d}{d z^{[2]}} \ a^{[2]} \Big) \ \Big( \frac{-y}{a^{[2]}} + \frac{1-y}{1-a^{[2]}} \Big) \\
& = \Big( \frac{d}{d z^{[2]}} \ \sigma\big( z^{[2]} \big) \Big) \ \Big( \frac{-y}{a^{[2]}} + \frac{1-y}{1-a^{[2]}} \Big) \\
& = \Big( \sigma( z^{[2]}) \big( 1 - \sigma( z^{[2]} ) \big) \Big) \ \Big( \frac{-y}{a^{[2]}} + \frac{1-y}{1-a^{[2]}} \Big) \\
& = a^{[2]} \big( 1 - a^{[2]} \big) \Big( \frac{-y}{a^{[2]}} + \frac{1-y}{1-a^{[2]}} \Big) \\
& = a^{[2]} - y
\end{align}
$

$
\begin{align}
\text{"dw2"} & = \frac{d}{d w^{[2]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \frac{d}{d w^{[2]}} z^{[2]} \times \frac{d}{d z^{[2]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \Big( \frac{d}{d w^{[2]}} ( w^{[2]T} a^{[1]} + b^{[2]} ) \Big) \times \Big( a^{[2]} - y \Big) \\
& = a^{[1]} \times \Big( a^{[2]} - y \Big)
\end{align}
$

$
\begin{align}
\text{"db2"} & = \frac{d}{d b^{[2]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \frac{d}{d b^{[2]}} z^{[2]} \times \frac{d}{d z^{[2]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \Big( \frac{d}{d b^{[2]}} ( w^{[2]T} a^{[1]} + b^{[2]} ) \Big) \times \Big( a^{[2]} - y \Big) \\
& = 1 \times \Big( a^{[2]} - y \Big) \\
& = a^{[2]} - y
\end{align}
$

## Backward Propagation : Layer 1

$
\begin{align}
\text{"dz1"} & = \frac{d}{d z^{[1]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \frac{d}{d z^{[1]}} z^{[2]} \times \frac{d}{d z^{[2]}} \mathcal{L} \big( a^{[2]}, y \big) \\
& = \frac{d}{d z^{[1]}} \Big[ w^{[2]T} a^{[1]} + b^{[2]} \Big] \times \Big[ a^{[2]} - y \Big] \\
& = \frac{d}{d z^{[1]}} \Big[ w^{[2]T} g^{[1]}(z^{[1]}) + b^{[2]} \Big] \times \Big[ a^{[2]} - y \Big] \\
& = \Big[ w^{[2]} \odot g^{[1]\prime}(z^{[1]}) \Big] \times \Big[ a^{[2]} - y \Big] \\
& = w^{[2]} \  \big( a^{[2]} - y \big) \odot g^{[1]\prime}(z^{[1]}) \\
& = w^{[2]} \  \big( a^{[2]} - y \big) \odot \big( 1 - \tanh^2(z^{[1]}) \big) \\
\end{align}
$

$
\begin{align}
\text{"dw1"} & = \frac{d}{dw^{[1]}} \mathcal{L}\big( a^{[2]}, y \big) \\
& = \frac{d}{dw^{[1]}} z^{[1]} \times \frac{d}{dz^{[1]}} \mathcal{L}\big( a^{[2]}, y \big) \\
& = \frac{d}{dw^{[1]}} \big( w^{[1]T} x + b^{[1]} \big) \times \frac{d}{dz^{[1]}} \mathcal{L}\big( a^{[2]}, y \big) \\
& = x \times \text{"dz1"}
\end{align}
$

$
\begin{align}
\text{"db1"} & = \text{"dz1"}
\end{align}
$