## Importing libs

In [1]:
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
# Default size for every matplotlib figure, given as [width, height] in inches.
plt.rcParams['figure.figsize'] = [2, 2]
# Apply the seaborn defaults (e.g. theme, scaling, color palette) to plotted
# figures, instead of matplotlib's.
sns.set()

## Functions and derivatives

In [3]:
def linear(z):
    """Identity activation: hand the input back untouched (same object)."""
    passthrough = z
    return passthrough

def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    floor = 0
    rectified = np.maximum(floor, z)
    return rectified

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated element-wise.

    The value is computed as ``exp(-log(1 + exp(-z)))`` using ``np.logaddexp``.
    The direct formula overflows inside ``np.exp(-z)`` for large negative ``z``
    (raising a RuntimeWarning); this form stays finite over the whole range.

    Args:
        z: scalar or NumPy array.

    Returns:
        Values in [0, 1] with the same shape as ``z``.
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)), computed stably.
    return np.exp(-np.logaddexp(0, -z))

def softmax(y_pred, axis=-1):
    """Softmax of ``y_pred`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating. Softmax is
    invariant to adding a constant along the normalised axis, so the result is
    mathematically unchanged, but ``np.exp`` can no longer overflow to ``inf``
    (which previously produced ``inf / inf = nan`` for large inputs).

    Args:
        y_pred: array-like of scores.
        axis: axis along which the outputs sum to 1 (default: last axis).

    Returns:
        Array of the same shape as ``y_pred`` whose entries along ``axis`` sum to 1.
    """
    scores = np.asarray(y_pred)
    if scores.size == 0:
        # Nothing to normalise; np.max would raise on a zero-length axis.
        return np.exp(scores)
    # Largest entry becomes 0 after the shift, so every exponent is <= 0.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)

def cross_entropy(y, y_pred, eps=1e-12):
    """Cross-entropy ``-sum(y * log(y_pred))`` summed over all elements.

    Args:
        y: target distribution (e.g. one-hot labels).
        y_pred: predicted probabilities, same shape as ``y``.
        eps: lower bound applied to ``y_pred`` before taking the logarithm, so
            an exact zero probability yields a large finite loss instead of
            ``inf`` (or ``nan`` from ``0 * log(0)``).

    Returns:
        Scalar loss.
    """
    # The original called a bare `log`, which is not defined anywhere; use np.log.
    safe_pred = np.maximum(y_pred, eps)
    return -np.sum(y * np.log(safe_pred))

In [4]:
def grad_linear(z):
    """Derivative of the identity activation: an array of ones shaped like ``z``."""
    dims = z.shape
    ones = np.ones(shape=dims)
    return ones

def grad_relu(z):
    """Derivative of ReLU: 1 where ``z`` is strictly positive, 0 elsewhere."""
    is_positive = z > 0
    return np.where(is_positive, 1, 0)

def grad_sigmoid(z):
    """Evaluate ``z * (1 - z)`` element-wise.

    NOTE(review): this expression equals the sigmoid derivative only when the
    argument is already the sigmoid *output* (the activation), not the
    pre-activation. Confirm against the caller before passing raw weighted sums.
    """
    complement = 1 - z
    return z * complement

def grad_softmax(y_pred):
    """Jacobian of softmax, expressed through the softmax output ``y_pred``.

    Entry (i, j) of the returned matrix is:
        y_pred[i] * (1 - y_pred[j])   if i == j  -->  y_pred[i] - y_pred[i] * y_pred[j]
        -y_pred[i] * y_pred[j]        if i != j  -->      0     - y_pred[i] * y_pred[j]

    ``y_pred`` is flattened first, so the result is (n, n) with n = y_pred.size.
    """
    probs = y_pred.reshape(-1)
    # diag(p) supplies the y_pred[i] term on the diagonal; the outer product
    # supplies y_pred[i] * y_pred[j] everywhere.
    return np.diag(probs) - np.outer(probs, probs)

def grad_cross_entropy(y, y_pred):
    """Return the element-wise difference ``y_pred - y``.

    FIXME (kept from the original): ``y_pred - y`` is the gradient of
    softmax followed by cross-entropy with respect to the softmax *inputs*;
    it is not the derivative of the cross-entropy with respect to ``y_pred``
    itself. NOTE(review): confirm which of the two backprop is meant to use.
    """
    delta = y_pred - y
    return delta

## Neural Network

From a layer $k-1$ to a layer $k$ we have:
- Weights $W^{(k)} \in \mathbb{R}^{n_k \times n_{k-1}}$
- Biases $b^{(k)} \in \mathbb{R}^{n_k}$
- Activations $a^{(k)} = g_k(z^{(k)})$, where $g_k$ is the activation function and $z^{(k)} = W^{(k)} a^{(k-1)} + b^{(k)}$

(Xavier initialization: [[1]](https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/))

For the first layer, the activation is the input itself: $a^{(1)} = x$  
For the middle layers ($2 \leq k < L$), the activation function is the sigmoid: $a^{(k)} = g_k(z^{(k)}) = \text{sigmoid}(z^{(k)})$  
For the last layer, we have the predicted value with softmax activation: $a^{(L)} = g_L(z^{(L)}) = \text{softmax}(z^{(L)})$  
(i.e. the hypothesis function $a^{(L)} = h_{W, b}(x) = y_{\text{pred}} \approx y$)

Note: the number of layers $L$ counts $1$ input layer, $1$ output layer, and $L-2$ hidden layers.

In [6]:
import warnings

# RANDOM_SEED = 886

In [None]:
class Layer:
    """One fully connected layer: weights, biases, activation and cached output."""

    def __init__(self, input_size, output_size,
                 activation_function, activation_function_derivative,
                 weight_initialization='xavier'):
        """Create the layer and randomly initialise its parameters.

        Args:
            input_size: number of units feeding into this layer.
            output_size: number of units in this layer.
            activation_function: callable stored as ``self.g``.
            activation_function_derivative: callable stored as ``self.g_prime``.
            weight_initialization: ``'xavier'`` scales standard-normal draws by
                ``sqrt(1 / input_size)``; ``'xavier_avg'`` scales them by
                ``sqrt(2 / (input_size + output_size))``.

        Raises:
            ValueError: if ``weight_initialization`` is not one of the above.
        """
        # Only the scale differs between the schemes; validate before drawing
        # so an invalid option consumes no random numbers.
        if weight_initialization == 'xavier':
            stddev = np.sqrt(1 / input_size)
        elif weight_initialization == 'xavier_avg':
            stddev = np.sqrt(2 / (input_size + output_size))
        else:
            raise ValueError(f"Invalid weight_initialization value: '{weight_initialization}'")

        # W has shape (output_size, input_size); b is a column vector (output_size, 1).
        self.W = stddev * np.random.randn(output_size, input_size)
        self.b = stddev * np.random.randn(output_size, 1)

        # self.a[i] is the "activation" of the i-th unit in the layer.
        # np.array() has no `shape` keyword (the original call raised TypeError);
        # allocate a zero-filled placeholder instead.
        self.a = np.zeros(shape=(output_size, 1))
        self.g = activation_function
        self.g_prime = activation_function_derivative

class NN:
    """Feed-forward neural network built from an ordered list of layers.

    ``self.layers`` uses zero-based indexing: ``self.layers[0]`` is the input
    layer (only its ``a`` attribute is used) and ``self.layers[-1]`` is the
    output layer. Backpropagation and training are still unimplemented.
    """

    def __init__(self):
        """Start with an empty list of layers."""
        self.layers = []
        # TODO

    def predict(self, x):
        """Propagate ``x`` through the network without touching layer state.

        Args:
            x: input activation; must be compatible with ``layers[1].W @ x``.

        Returns:
            The activation of the last layer (``x`` itself if there are no
            layers beyond the input layer).
        """
        # The original referenced an undefined name `L` here (NameError).
        L = len(self.layers)
        activation = x
        for l in range(1, L):
            layer = self.layers[l]
            z = layer.W @ activation + layer.b
            activation = layer.g(z)
        return activation

    def feed_forward(self, x):
        """Propagate ``x`` through the network, caching each layer's activation in ``a``.

        Args:
            x: input activation, stored as ``self.layers[0].a``.

        Returns:
            The activation of the last layer (the prediction ``y_pred``).
        """
        L = len(self.layers)
        # note that we use zero-based indexing here, so
        # the 1st layer is self.layers[0] and the last is self.layers[L - 1]
        self.layers[0].a = x  # input
        for l in range(1, L):
            z = self.layers[l].W @ self.layers[l - 1].a + self.layers[l].b
            # apply the activation function g to the weighted sum z
            self.layers[l].a = self.layers[l].g(z)
        # The original indexed self.layers[L] (out of range) and would have
        # returned a Layer object; the output is the last layer's activation.
        y_pred = self.layers[L - 1].a
        return y_pred

    def backprop(self):
        """Placeholder: backpropagation is not implemented yet."""
        pass

    def train(self, x, y):
        """Placeholder: runs the forward pass only; the weight update is still TODO."""
        y_pred = self.feed_forward(x)
        # TODO
        pass

    # TODO fit(training data), evaluate(validation data)
        