# [1.2.5 Testowanie różnych funkcji aktywacji](http://pages.mini.pw.edu.pl/~karwowskij/mioad/lab-sieci.html#org7777810)

Należy rozszerzyć istniejącą implementację sieci i metody uczącej o możliwość wyboru funkcji aktywacji:

* sigmoid,
* liniowa,
* tanh,
* ReLU.

Porównać szybkość uczenia i skuteczność sieci w zależności od liczby neuronów w poszczególnych warstwach i rodzaju funkcji aktywacji. Należy wziąć pod uwagę fakt, że różne funkcje aktywacji mogą dawać różną skuteczność w zależności od liczby neuronów i liczby warstw. Sprawdzić sieci z jedną, dwiema i trzema warstwami ukrytymi. Podobnie jak w poprzednim tygodniu, trzeba dostosować proces uczenia do pochodnych nowych funkcji aktywacji.

Przeprowadzić testy na zbiorach:

* regresja
    * steps-large,
    * multimodal-large
* klasyfikacja
    * rings5-regular
    * rings3-regular


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics
import copy

## Model

In [51]:

class MLP:
    """Fully connected feed-forward network trained with mini-batch gradient steps.

    The network applies the same hidden activation after every hidden layer
    and a separate output function (softmax or identity) after the last layer.

    Parameters
    ----------
    layers : sequence of int
        Number of units in each layer, input and output layers included.
    weights : list of ndarray
        ``weights[i]`` maps layer ``i`` to layer ``i + 1`` and is used as
        ``activations @ weights[i]``.
    biases : list of ndarray
        ``biases[i]`` is added to the linear combination of layer ``i + 1``.
    activation : {'sigmoid', 'relu', 'tanh', 'linear'}
        Hidden-layer activation function.
    output_function : {'softmax', 'linear'}
        Function applied to the last layer.
    visualize_weights : bool
        When true, ``train`` draws the network every 100 epochs instead of
        printing the per-epoch loss.

    Raises
    ------
    ValueError
        If ``output_function`` or ``activation`` is not one of the names above.
    """

    def __init__(self, layers, weights, biases, activation = 'sigmoid', output_function='softmax', visualize_weights=False):
        # Deep copies so that several networks can be built from the same
        # initial parameters without sharing (and mutating) them.
        self.layers = copy.deepcopy(layers)
        self.weights = copy.deepcopy(weights)
        self.biases = copy.deepcopy(biases)

        self.derivative_w = []
        self.derivative_b = []
        self.visualize_weights = visualize_weights

        output_functions = {
            'softmax': self.softmax,
            'linear': self.linear,
        }
        if output_function not in output_functions:
            raise ValueError(f'No output function named {output_function} available')
        self.function = output_functions[output_function]

        # Each activation is paired with the derivative used in backpropagation.
        activations = {
            'sigmoid': (self.sigmoid, self.sigmoidGradient),
            'relu': (self.relu, self.reluGradient),
            'tanh': (self.tanh, self.tanhGradient),
            'linear': (self.linear, self.linearGradient),
        }
        if activation not in activations:
            # Fail at construction time instead of with an AttributeError
            # on the first forward pass.
            raise ValueError(f'No activation function named {activation} available')
        self.activation_function, self.grad = activations[activation]

    def forward(self, inputs):
        """Run a forward pass and return the output-layer activations.

        The inputs, every layer's activations and every pre-activation value
        are stored in ``self.activations`` / ``self.z_values`` for use by
        ``backpropagation``.
        """
        self.activations = [inputs]
        self.z_values = []

        activations = inputs
        for i in range(len(self.layers) - 2):
            outputs = activations @ self.weights[i] + self.biases[i]
            self.z_values.append(outputs)
            activations = self.activation_function(outputs)
            self.activations.append(activations)

        results = activations @ self.weights[-1] + self.biases[-1]
        self.z_values.append(results)
        activations = self.function(results)
        self.activations.append(activations)
        return activations

    def backpropagation(self, y):
        """Compute per-layer update directions for the most recent forward pass.

        ``y`` holds the targets for the batch last passed to ``forward``.
        The error is defined as ``y - prediction``, so the returned arrays are
        *added* to the parameters by ``train``.

        Returns
        -------
        (derivative_w, derivative_b) : tuple of lists
            One array per layer, averaged over the batch.
        """
        deltas = [None] * len(self.weights)
        output = self.activations[-1]

        if self.function is self.softmax:
            # Row-wise product of the softmax Jacobian (diag(s) - s s^T) with
            # the error: s * e - s * (s . e).
            error = y - output
            deltas[-1] = output * (error - np.sum(output * error, axis=1, keepdims=True))
        elif self.function is self.linear:
            deltas[-1] = y - output

        # Propagate the error backwards through the hidden layers.
        for i in reversed(range(len(deltas) - 1)):
            deltas[i] = (deltas[i + 1] @ self.weights[i + 1].T) * self.grad(self.z_values[i])

        m = y.shape[0]

        derivative_w = [self.activations[i].T @ d / m for i, d in enumerate(deltas)]
        derivative_b = [d.sum(axis=0, keepdims=True) / m for d in deltas]

        return derivative_w, derivative_b

    def train(self, x, y, batch_size=20, epochs=500, alpha=0.1, verbose=False, momentum=False, rmsprop=False,
              lambda_moment=0.5, beta=0.5, epsilon=1e-8):
        """Train the network in place with mini-batch updates.

        Parameters
        ----------
        x, y : ndarray
            Training inputs and targets. With a softmax output ``y`` must hold
            integer class labels; it is one-hot encoded internally.
        batch_size : int
            Number of samples per update.
        epochs : int
            Number of passes over the (once shuffled) data.
        alpha : float
            Learning rate.
        verbose : bool
            Currently unused; progress is printed after every epoch regardless.
        momentum, rmsprop : bool
            Select the update rule; ``momentum`` takes precedence when both
            are set. Plain gradient steps are used when both are false.
        lambda_moment : float
            Decay factor of the momentum accumulator.
        beta : float
            Decay factor of the RMSProp squared-gradient average.
        epsilon : float
            Added to the RMSProp denominator so that an exactly zero gradient
            does not produce 0/0 = NaN.
        """
        p = np.random.permutation(len(y))

        if self.function is self.softmax:
            b = np.zeros((y.size, y.max() + 1))
            b[np.arange(y.size), y.flatten()] = 1
            y = b

        x = x[p]
        y = y[p]

        momentum_w = [np.zeros(w.shape) for w in self.weights]
        momentum_b = [np.zeros(b.shape) for b in self.biases]

        rmsprop_w = [np.zeros(w.shape) for w in self.weights]
        rmsprop_b = [np.zeros(b.shape) for b in self.biases]

        for epoch in range(epochs):
            for start in range(0, len(y), batch_size):
                x_batch = x[start:start + batch_size]
                y_batch = y[start:start + batch_size]
                self.forward(x_batch)
                derivative_w, derivative_b = self.backpropagation(y_batch)

                # Weights and biases follow the same update rule, each with
                # its own momentum / RMSProp state.
                for params, grads, velocity, cache in (
                    (self.weights, derivative_w, momentum_w, rmsprop_w),
                    (self.biases, derivative_b, momentum_b, rmsprop_b),
                ):
                    for j, gradient in enumerate(grads):
                        if momentum:
                            velocity[j] = velocity[j] * lambda_moment + gradient
                            step = velocity[j]
                        elif rmsprop:
                            cache[j] = beta * cache[j] + (1 - beta) * gradient ** 2
                            step = gradient / (np.sqrt(cache[j]) + epsilon)
                        else:
                            step = gradient
                        params[j] = params[j] + alpha * step

            # Progress is reported on the last mini-batch of the epoch only.
            if self.visualize_weights:
                if epoch % 100 == 0:
                    print("Error in epoch {} = {}".format(epoch, np.linalg.norm(self.activations[-1] - y_batch)))
                    # NOTE(review): `visNN` is not imported in the visible part
                    # of the file; it must be available for this branch to run.
                    network_structure = np.asarray(self.layers)
                    network = visNN.DrawNN(network_structure, self.weights)
                    network.draw()
            else:
                if self.function is self.softmax:
                    # Pass the full label set: the last batch may not contain
                    # every class, which log_loss would otherwise reject.
                    batch_labels = np.argmax(y_batch, axis=1)
                    all_labels = np.arange(y_batch.shape[1])
                    print(f"Cross entropy loss in epoch {epoch} = {metrics.log_loss(batch_labels, self.activations[-1], labels=all_labels)}", end='\r')
                else:
                    print(f"Custom error in epoch {epoch} = {np.abs(y_batch - self.activations[-1]).mean()}", end='\r')

    # ------------------------------ ACTIVATIONS ------------------------------ #

    @staticmethod
    def softmax(x):
        """Row-wise softmax of a 2-D array (rows are samples)."""
        # Subtracting the row maximum keeps the exponentials from overflowing.
        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return e_x / e_x.sum(axis=1, keepdims=True)

    @staticmethod
    def relu(x):
        """Element-wise max(x, 0)."""
        return np.maximum(x, 0)

    @staticmethod
    def tanh(x):
        """Element-wise hyperbolic tangent."""
        # np.tanh saturates to +/-1 instead of producing inf/inf = NaN.
        return np.tanh(x)

    @staticmethod
    def linear(x):
        """Identity function."""
        return x

    @staticmethod
    def sigmoid(x):
        """Element-wise logistic function, stable for large |x|."""
        # Only exp of a non-positive number is evaluated, so it cannot
        # overflow: 1 / (1 + e^-x) for x >= 0, e^x / (1 + e^x) otherwise.
        exp_neg_abs = np.exp(-np.abs(x))
        return np.where(x >= 0, 1.0, exp_neg_abs) / (1.0 + exp_neg_abs)

    # ------------------------------- GRADIENTS ------------------------------- #

    @staticmethod
    def sigmoidGradient(x):
        """Derivative of the logistic function at x."""
        s = MLP.sigmoid(x)
        return s * (1 - s)

    @staticmethod
    def reluGradient(x):
        """Derivative of ReLU at x (taken as 0 at x == 0)."""
        return np.where(x > 0, 1, 0)

    @staticmethod
    def tanhGradient(x):
        """Derivative of tanh at x."""
        return 1.0 - np.tanh(x) ** 2

    @staticmethod
    def linearGradient(x):
        """Derivative of the identity: an array of ones shaped like x."""
        return np.ones(x.shape)
    

In [40]:
def generate_weights_and_biases(layers, lower, upper):
    """Draw initial parameters uniformly from [lower, upper) for every layer pair.

    For consecutive layer sizes ``(n_in, n_out)`` a weight matrix of shape
    ``(n_in, n_out)`` is drawn first, followed by a bias row of shape
    ``(1, n_out)``.

    Returns
    -------
    (weights, biases) : tuple of lists of ndarray
        One entry per transition between consecutive layers.
    """
    weights, biases = [], []

    for fan_in, fan_out in zip(layers, layers[1:]):
        weights.append(np.random.uniform(lower, upper, size=(fan_in, fan_out)))
        biases.append(np.random.uniform(lower, upper, size=(1, fan_out)))

    return weights, biases

# Testowanie implementacji
* Na początek będziemy patrzyć na zadanie regresji, a następnie na zadanie klasyfikacji
* Przetestujemy po 3 architektury sieci dla każdego problemu - jedna warstwa, dwie warstwy i trzy warstwy. Aby przeskalować jeszcze bardziej te sieci, pierwsza architektura będzie miała 8 neuronów w warstwie ukrytej, druga 16, a ostatnia 32.
* Aby eksperyment był uczciwy, porównamy również 3 różne inicjalizacje wag z przedziału `[-1,1]` oraz 3 różne wartości `learning rate`: 0.1, 0.01 oraz 0.001.

In [50]:
# Epoch budgets at which the trained networks are compared.
epochs_range = np.array((10, 50, 100, 200, 300, 500, 800, 1000))

In [52]:
# Read the training and test splits; the first CSV column is used as the index.
train_df = pd.read_csv('../data/square-simple-training.csv', index_col=0)
test_df = pd.read_csv('../data/square-simple-test.csv', index_col=0)

# The network expects 2-D inputs, so each column becomes an (n_samples, 1) array.
x = train_df['x'].to_numpy().reshape(-1, 1)
y = train_df['y'].to_numpy().reshape(-1, 1)
x_test = test_df['x'].to_numpy().reshape(-1, 1)

In [None]:
# Mean test MAE per epoch budget, one list per hidden activation
# (l = linear, s = sigmoid, t = tanh, r = relu). Entry k corresponds to
# epochs_range[k].
mean_results_l = []
mean_results_s = []
mean_results_t = []
mean_results_r = []

# Maps the activation name passed to MLP onto the list that collects its results.
mean_results_by_activation = {
    'linear': mean_results_l,
    'sigmoid': mean_results_s,
    'tanh': mean_results_t,
    'relu': mean_results_r,
}

# NOTE(review): `layers` is not defined in the visible part of the notebook;
# it must be set in an earlier cell (e.g. the architecture under test).
for num_epochs in epochs_range:
    # activation name -> {learning rate (as str) -> list of test MAEs}
    results = {name: {} for name in mean_results_by_activation}

    for alpha in [0.1, 0.01, 0.001]:
        for per_alpha in results.values():
            per_alpha[str(alpha)] = []

        for _ in range(3):
            # Every activation starts from the same random initialisation
            # (MLP deep-copies the parameters), so the comparison is fair.
            weights, biases = generate_weights_and_biases(layers, -1, 1)

            for name, per_alpha in results.items():
                mlp = MLP(layers, weights, biases, output_function='linear', activation=name)
                mlp.train(x, y, epochs=num_epochs, alpha=alpha)
                predictions = mlp.forward(x_test)
                per_alpha[str(alpha)].append(metrics.mean_absolute_error(test_df['y'], predictions))

    # One scalar per activation and epoch budget: the mean over all
    # learning rates and initialisations.
    for name, per_alpha in results.items():
        all_errors = [error for errors in per_alpha.values() for error in errors]
        mean_results_by_activation[name].append(np.mean(all_errors))