## Importing libs

In [1]:
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
# Default size used for every figure created in this notebook.
plt.rcParams['figure.figsize'] = [2, 2]
# Apply the seaborn defaults (e.g. theme, scaling, color palette) to plotted figures, instead of matplotlib's.
sns.set()

## Functions and derivatives

In [3]:
def linear(z):
    """Identity activation: hand the pre-activation `z` back untouched."""
    activation = z
    return activation

def relu(z):
    """Rectified linear unit: element-wise max(0, z)."""
    floor = 0
    return np.maximum(floor, z)

def sigmoid(z, limit=500):
    """Logistic sigmoid 1 / (1 + exp(-z)), applied element-wise.

    `z` is clipped to [-limit, limit] first so that exp(-z) cannot overflow;
    pass limit=None to disable the clipping.
    """
    # `is not None` (rather than `!= None`) also keeps an array-valued limit usable:
    # comparing an array against None element-wise has no single truth value
    if limit is not None:
        z = np.clip(z, -limit, limit) # avoid overflow in exp
    return 1 / (1 + np.exp(-z))

def softmax(y_pred, axis=-1):
    """Softmax of `y_pred` along `axis`; the entries along that axis sum to 1.

    The maximum along `axis` is subtracted before exponentiating. This leaves the
    result unchanged (e^{x+c} / sum(e^{x+c}) == e^x / sum(e^x)) but keeps np.exp
    from overflowing to inf (and the division from producing nan) on large inputs.
    """
    shifted = y_pred - np.max(y_pred, axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)

''' y.shape == y_pred.shape == (m, C), where:
    - m is the number of examples
    - C is the number of classes
    Thus, each row of y is a one-hot encoded vector of length C, and the
    matching row of y_pred holds the C predicted class probabilities
'''

def cross_entropy(y, y_pred, axis=-1, eps=1e-12):
    """Cross-entropy between targets `y` and predictions `y_pred`.

    Computes -sum(y * log(y_pred)) along `axis` and divides the result by
    m = y_pred.shape[0]. With the default axis=-1 on 2-D inputs this yields one
    value per row, each already scaled by 1/m.

    `y_pred` is clipped to [eps, 1 - eps] so that log(0) is never evaluated;
    pass eps=None to disable the clipping.
    """
    if eps is not None:
        y_pred = np.clip(y_pred, eps, 1 - eps) # avoid log(0) == -inf
    m = y_pred.shape[0]
    return -np.sum(y * np.log(y_pred), axis=axis) / m

def xent(y, y_pred):
    """Total (un-averaged, un-clipped) cross-entropy: -sum(y * log(y_pred)) over all entries."""
    return -np.sum(y * np.log(y_pred))

In [4]:
def grad_linear(z):
    """Derivative of the identity activation: an array of ones shaped like `z`."""
    target_shape = z.shape
    return np.ones(target_shape)

def grad_relu(z):
    """Derivative of ReLU: 1 where z is strictly positive, 0 everywhere else."""
    is_positive = z > 0
    return np.where(is_positive, 1, 0)

def grad_sigmoid(z):
    """Derivative of the sigmoid at `z`: sigmoid(z) * (1 - sigmoid(z))."""
    s = sigmoid(z)
    complement = 1 - s
    return s * complement

def grad_softmax(y_pred):
    """Jacobian of softmax for a single output vector `y_pred`.

    `y_pred` is flattened into one column of n values and an (n, n) matrix J is returned:
        J[i][j] = y_pred[i] * (1 - y_pred[i])   if i == j  -->  y_pred[i] - y_pred[i] * y_pred[j]
        J[i][j] = -y_pred[i] * y_pred[j]        if i != j  -->      0     - y_pred[i] * y_pred[j]
    i.e. diag(y_pred) minus the outer product of y_pred with itself.
    """
    column = y_pred.reshape(-1, 1)
    outer_product = column @ column.T
    return np.diagflat(column) - outer_product

def grad_cross_entropy(y, y_pred, axis=-1):
    """Return the element-wise difference `y_pred - y`.

    NOTE(review): `axis` is accepted but never used. The expression looks like the
    combined softmax + cross-entropy gradient w.r.t. the pre-softmax values rather
    than the derivative of cross_entropy w.r.t. y_pred -- presumably what the FIXME
    is about; confirm before relying on it.
    """
    difference = y_pred - y # FIXME
    return difference

## Neural Network

From a layer $k-1$ to a layer $k$ we have:
- Weights $W^{(k)} \in \mathbb{R}^{n_k \times n_{k-1}}$
- Biases $b^{(k)} \in \mathbb{R}^{n_k}$
- Activations $a^{(k)} = g_k(z^{(k)}) \in \mathbb{R}^{n_k}$, where $g_k(z^{(k)})$ is the activation function of the $k^{\text{th}}$ layer and $z^{(k)} = W^{(k)} a^{(k-1)} + b^{(k)}$

(Xavier initialization: [[1]](https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/))

For the first layer, the activation is the input itself: $a^{(1)} = x$  
For the middle layers ($2 \leq k < L$), the activation function is the sigmoid: $a^{(k)} = g_k(z^{(k)}) = \text{sigmoid}(z^{(k)})$  
For the last layer, we have the predicted value with softmax activation: $a^{(L)} = g_L(z^{(L)}) = \text{softmax}(z^{(L)})$  
(i.e. the hypothesis function $a^{(L)} = h_{W, b}(x) = y_{\text{pred}} \approx y$)

Note: the number of layers $L$ is counted as $1$ input layer + $1$ output layer + $L-2$ hidden layers

In [6]:
import warnings

# RANDOM_SEED = 886

In [None]:
''' An ActivationFunction is applied to Z to get the output A,
    but its derivative expects the value A, not Z (!):

    A == __call__(Z) and derivative(A) == derivative(__call__(Z)),
    so calling derivative(Z) will often yield WRONG results
'''
class ActivationFunction:
    """Interface for activation functions; subclasses override both methods."""
    def __call__(self, Z):
        """Apply the activation to `Z`, with `Z`.shape=(n_examples, output_size)."""
        raise NotImplementedError
    def derivative(self, A):
        """Derivative given the activation output `A`, with `A`.shape=(n_examples, output_size)."""
        raise NotImplementedError

class Linear(ActivationFunction):
    """Identity activation: A == Z."""
    def __call__(self, Z):
        # nothing to compute, the activation is the input itself
        return Z
    def derivative(self, A):
        # the slope of the identity is 1 everywhere (same shape and dtype as A)
        slope = np.ones_like(A)
        return slope

class Sigmoid(ActivationFunction):
    """Logistic sigmoid activation: A = 1 / (1 + exp(-Z))."""
    def __call__(self, Z):
        # clip like the module-level sigmoid() does (limit=500), so that exp(-Z)
        # cannot overflow for very negative Z
        Z = np.clip(Z, -500, 500)
        return 1 / (1 + np.exp(-Z))
    def derivative(self, A):
        # expects the activation output A, not Z
        return A * (1 - A) # Sigmoid(Z) * (1 - Sigmoid(Z))

class ReLU(ActivationFunction):
    """Rectified linear unit activation: A = max(0, Z), element-wise."""
    def __call__(self, Z):
        floor = 0
        return np.maximum(floor, Z)
    def derivative(self, A):
        # 1 where the activation output is strictly positive, 0 elsewhere
        is_active = A > 0
        return np.where(is_active, 1, 0)

class SoftMax(ActivationFunction):
    """Softmax activation over the last axis of Z."""
    def __call__(self, Z):
        # Softmax is invariant to adding a constant c to every entry of a row:
        # e^{x+c} / sum(e^{x+c}) == (e^x * e^c) / (e^c * sum(e^x)) == e^x / sum(e^x)
        # so the row-wise maximum is subtracted before exponentiating.
        shifted = Z - Z.max(axis=-1, keepdims=True)
        numerators = np.exp(shifted)
        return numerators / np.sum(numerators, axis=-1, keepdims=True)
    def derivative(self, A):
        raise NotImplementedError # FIXME ref.: https://medium.com/@aerinykim/how-to-implement-the-softmax-derivative-independently-from-any-loss-function-ae6d44363a9d


In [None]:
class Layer:
    """Fully connected layer computing a = g(X @ W + b).

    Attributes:
        input_size / output_size: number of input and output units.
        g: the activation function (any callable taking z).
        g_prime: `activation_function.derivative` when the activation object
            provides one (as ActivationFunction subclasses do), otherwise None.
        W: weights, shape=(input_size, output_size).
        b: biases, shape=(output_size,), broadcast over the examples.
        z, a: weighted sum and activation cached by the last feedforward() call
            (None before the first call).
    """
    def __init__(self, input_size, output_size, activation_function,
                 weight_initialization='xavier'):
        ''' `weight_initialization` is one of:
            - 'xavier':     W ~ N(0, 1 / input_size),                  b ~ N(0, 1)
            - 'xavier avg': W ~ N(0, 2 / (input_size + output_size)),  b ~ N(0, 1)
            - '-1 to 1':    W and b drawn uniformly from [-1, 1)
            Any other value raises ValueError.
        '''
        self.input_size  = input_size
        self.output_size = output_size

        self.g = activation_function
        # plain callables have no `derivative`; backprop is unavailable for them
        self.g_prime = getattr(activation_function, 'derivative', None)

        # both have shape=(n_examples, output_size) once feedforward() has run
        self.a = None # self.g(self.z)
        self.z = None # X @ self.W + self.b

        # The bias is kept 1-D, shape=(output_size,), so that it broadcasts across
        # the rows of X @ W; a column of shape (output_size, 1) would not.
        if weight_initialization == 'xavier':
            stddev = np.sqrt(1 / input_size)
            self.W = stddev * np.random.randn(input_size, output_size)
            self.b = np.random.randn(output_size)
        elif weight_initialization == 'xavier avg':
            stddev = np.sqrt(2 / (input_size + output_size))
            self.W = stddev * np.random.randn(input_size, output_size)
            self.b = np.random.randn(output_size)
        elif weight_initialization == '-1 to 1':
            # rand() is uniform on [0, 1), so 2 * rand() - 1 is uniform on [-1, 1)
            self.W = 2 * np.random.rand(input_size, output_size) - 1
            self.b = 2 * np.random.rand(output_size) - 1
        else:
            raise ValueError(f"Invalid weight_initialization value: '{weight_initialization}'")

    @property
    def params_count(self):
        ''' Total number of trainable values (weights + biases) '''
        return self.W.size + self.b.size

    # receives the input of the current layer
    # returns the activation value of the current layer
    def feedforward(self, X):
        ''' `X`.shape=(n_examples, `self.input_size`) '''
        assert X.shape[1] == self.input_size, \
            f"expected {self.input_size} input features, got {X.shape[1]}"
        # (n_examples, output_size) = (n_examples, input_size) @ (input_size, output_size) + (output_size, )
        self.z = X @ self.W + self.b
        self.a = self.g(self.z)
        return self.a

    # receives the derivative of the cost function w.r.t. the activation value of the current layer
    def backprop(self, delta):
        ''' Not implemented yet: always raises NotImplementedError. '''
        # TODO planned first step: delta = delta * self.g_prime(self.a)
        #      a = g(z)
        #      g'(z) = da/dz, which ActivationFunction.derivative computes from a (not from z)
        raise NotImplementedError

class NN:
    """Feed-forward neural network: an ordered sequence of layers plus a cost function.

    Arrays are batch-first, shape=(n_examples, n_features), and every entry of
    `self.layers` is applied in order through its `feedforward` method, so
    `self.layers[0]` consumes the input X directly.

    Attributes:
        layers: list of layer objects, first to last.
        J: the cost function, called as J(y, y_pred).
        J_prime: the derivative of the cost function (stored, not used yet).
    """
    def __init__(self, layers, cost_function, cost_function_derivative):
        # copy into a fresh list so add_layer() never mutates the caller's sequence
        self.layers = list(layers) if layers is not None else []
        self.J = cost_function
        self.J_prime = cost_function_derivative

    def add_layer(self, layer):
        ''' Append `layer` after the current last layer '''
        self.layers.append(layer)

    def predict(self, X):
        ''' Returns the network output for `X`.
            Runs a regular forward pass, so the values cached by each layer are overwritten.
        '''
        return self.feed_forward(X)

    def feed_forward(self, X):
        ''' `X`.shape=(n_examples, input_size of the first layer)
            Returns the activation of the last layer (or `X` itself if there are no layers).
        '''
        activation = X # input
        for layer in self.layers:
            # each layer applies g(activation @ W + b) and caches its own z and a
            activation = layer.feedforward(activation)
        y_pred = activation # output
        return y_pred

    def backprop(self, X, y, learning_rate):
        ''' Runs a forward pass and stores `error` and `delta` on every layer.
            The weights are NOT updated yet (see the TODO below), so `learning_rate` is unused.
            Returns the cost J(y, y_pred) of the forward pass.
        '''
        L = len(self.layers) # number of layers

        y_pred = self.feed_forward(X) # == self.layers[L-1].a
        cost = self.J(y, y_pred)

        self.layers[L-1].error = y_pred - y
        self.layers[L-1].delta = self.layers[L-1].error * self.layers[L-1].g_prime(y_pred)
        for l in reversed(range(L-1)):
            # (n_examples, n_l) = (n_examples, n_{l+1}) @ (n_{l+1}, n_l)
            self.layers[l].error = self.layers[l+1].delta @ (self.layers[l+1].W).T
            self.layers[l].delta = self.layers[l].error * self.layers[l].g_prime(self.layers[l].a)

        # TODO gradient descent

        return cost

    def train(self, x, y):
        #TODO
        pass

    # TODO fit(trainning data), evaluate(validation data)
        