# [1.2.5 Testowanie różnych funkcji aktywacji](http://pages.mini.pw.edu.pl/~karwowskij/mioad/lab-sieci.html#org7777810)

Należy rozszerzyć istniejącą implementację sieci i metody uczącej o możliwość wyboru funkcji aktywacji:

* sigmoid,
* liniowa,
* tanh,
* ReLU.

Porównać szybkość uczenia i skuteczność sieci w zależności od liczby neuronów w poszczególnych warstwach i rodzaju funkcji aktywacji. Należy wziąć pod uwagę fakt, że różne funkcje aktywacji mogą dawać różną skuteczność w zależności od liczby neuronów i liczby warstw. Sprawdzić sieci z jedną, dwiema i trzema warstwami ukrytymi. Podobnie jak w poprzednim tygodniu, trzeba dostosować proces uczenia do pochodnych nowych funkcji aktywacji.

Przeprowadzić testy na zbiorach:

* regresja
    * steps-large,
    * multimodal-large
* klasyfikacja
    * rings5-regular
    * rings3-regular


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics
import copy

## Model

In [104]:

class MLP:

    def __init__(self, layers, weights, biases, activation = 'sigmoid', output_function='softmax', visualize_weights=False , max_class = None):
        self.layers = copy.deepcopy(layers)
        self.weights = copy.deepcopy(weights)
        self.biases = copy.deepcopy(biases)
        self.max_class = max_class

        self.derivative_w = []
        self.derivative_b = []
        self.visualize_weights = visualize_weights

        if output_function == 'softmax':
            self.function = self.softmax

        elif output_function == 'linear':
            self.function = self.linear
        else : 
            raise ValueError(f'No output function named {output_function} available')
        
        if activation == 'sigmoid':
            self.activation_function = self.sigmoid
            self.grad = self.sigmoidGradient
        elif activation == 'relu' : 
            self.activation_function = self.relu
            self.grad = self.reluGradient
        elif activation =='tanh' : 
            self.activation_function = self.tanh
            self.grad = self.tanhGradient
        elif activation =='linear' : 
            self.activation_function = self.linear
            self.grad = self.linearGradient
            
        

    def forward(self, inputs):
        """
        performs forward propagation
        """
        # activations and linear combinations passed to activation function
        self.activations = []
        self.z_values = []

        activations = inputs
        self.activations.append(activations)
        for i in range(len(self.layers) - 2):
            outputs = activations @ self.weights[i] + self.biases[i]
            self.z_values.append(outputs)
            activations = self.activation_function(outputs)
            self.activations.append(activations)

        self.weights[-1].shape
        results = activations @ self.weights[-1] + self.biases[-1]
        self.z_values.append(results)
        activations = self.function(results)
        self.activations.append(activations)
        return activations

    def backpropagation(self, y):
        deltas = [None] * len(self.weights)

        if self.function == self.softmax:
            out = []
            for elem_x, elem_y in zip(self.activations[-1], y):
                error = elem_y - elem_x
                x = elem_x.reshape(elem_x.shape[0], )
                si_sj = - x * x.reshape(self.layers[-1], 1)
                s_der = np.diag(x) + si_sj
                out.append(s_der @ error)

            out = np.array(out)
            deltas[-1] = copy.deepcopy(out)
            
        elif self.function == self.linear:
            deltas[-1] = y - self.activations[-1]

        for i in reversed(range(len(deltas) - 1)):
            deltas[i] = ((self.weights[i + 1] @ deltas[i + 1].T) * self.grad(self.z_values[i]).T).T

        m = y.shape[0]

        derivative_b = [None] * len(deltas)
        derivative_w = [None] * len(deltas)

        for i, d in enumerate(deltas):
            derivative_w[i] = (d.T @ self.activations[i]).T / m
            derivative_b[i] = (d.T @ np.ones((m, 1))).T / m

        return derivative_w, derivative_b

    def train(self, x, y, batch_size=20, epochs=500, alpha=0.1, verbose=False, momentum=False, rmsprop=False,
              lambda_moment=0.5, beta=0.5):

        p = np.random.permutation(len(y))

        if self.function == self.softmax:
            if self.max_class is not None: 
                
                b = np.zeros((y.size, self.max_class + 1))
                b[np.arange(y.size), y.flatten()] = 1
            else: 
                b = np.zeros((y.size, y.max() + 1))
                b[np.arange(y.size), y.flatten()] = 1
                
            y = b

        x = x[p]
        y = y[p]

        momentum_w = [np.zeros(w.shape) for w in self.weights]
        momentum_b = [np.zeros(b.shape) for b in self.biases]

        rmsprop_w = [np.zeros(w.shape) for w in self.weights]
        rmsprop_b = [np.zeros(b.shape) for b in self.biases]

        for epoch in range(epochs):
            i = 0

            while i < len(y):
                x_batch = x[i:i + batch_size]
                y_batch = y[i:i + batch_size]
                i = i + batch_size
                y_hat = self.forward(x_batch)
                derivative_w, derivative_b = self.backpropagation(y_batch)

                for j, weight in enumerate(self.weights):

                    if momentum:
                        momentum_w[j] = momentum_w[j] * lambda_moment + derivative_w[j]
                        self.weights[j] = weight + alpha * momentum_w[j]

                    elif rmsprop:
                        rmsprop_w[j] = beta * rmsprop_w[j] + (1 - beta) * derivative_w[j] ** 2
                        self.weights[j] = weight + alpha * (derivative_w[j] / np.sqrt(rmsprop_w[j]))

                    else:
                        self.weights[j] = weight + alpha * derivative_w[j]

                for j, bias in enumerate(self.biases):

                    if momentum:
                        momentum_b[j] = momentum_b[j] * lambda_moment + derivative_b[j]
                        self.biases[j] = bias + alpha * momentum_b[j]

                    elif rmsprop:
                        rmsprop_b[j] = beta * rmsprop_b[j] + (1 - beta) * derivative_b[j] ** 2
                        self.biases[j] = bias + alpha * (derivative_b[j] / np.sqrt(rmsprop_b[j]))

                    else:
                        self.biases[j] = bias + alpha * derivative_b[j]

            if self.visualize_weights:
                if epoch % 100 == 0:
                    print("Error in epoch {} = {}".format(epoch, np.linalg.norm(self.activations[-1] - y_batch)))
                    network_structure = np.asarray(layers)
                    network = visNN.DrawNN(network_structure, self.weights)
                    network.draw()
            else:
                if self.function == self.softmax:
                    print(f"Cross entropy loss in epoch {epoch} = {metrics.log_loss(np.argmax(y_batch, axis =1), self.activations[-1])}", end='\r')
                else:
                    print(f"Custom error in epoch {epoch} = { np.abs(y_batch -  self.activations[-1]).mean()}", end='\r')
    
    # ------------------------------ ACTIVATIONS ------------------------------ # 
    
    @staticmethod
    def softmax(x):
        e_x = np.exp(x.T - np.max(x, axis=1).T).T
        return (e_x.T / e_x.sum(axis=1).T).T
    
    @staticmethod
    def relu(x): 
        return np.maximum(x, 0)
    
    @staticmethod
    def tanh(x):
        return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

    @staticmethod
    def linear(x): 
        return x
    
    @staticmethod
    def sigmoid(x):
        return np.exp(x) / (1 + np.exp(x))
     
    # ------------------------------- GRADIENTS ------------------------------- # 
    
    @staticmethod
    def sigmoidGradient(x):
        def sigmoid(x):
            return np.exp(x) / (1 + np.exp(x))
        return sigmoid(x) * (1 - sigmoid(x))
    
    @staticmethod
    def reluGradient(x): 
        return np.where(x > 0, 1, 0)
    
    @staticmethod
    def tanhGradient(x):
        def tanh(x):
            return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
        return 1.0 - tanh(x)**2
    
    @staticmethod
    def linearGradient(x):
        return np.ones(x.shape)
    

In [3]:
def generate_weights_and_biases(layers, lower, upper):

    weights = []
    biases = []

    for i in range(len(layers) - 1):
        weights.append(np.random.uniform(lower, upper, layers[i] * layers[i + 1]).reshape(layers[i], layers[i + 1]))
        biases.append(np.random.uniform(lower, upper, layers[i + 1]).reshape(1, layers[i+1]))

    return weights, biases

# Testowanie implementacji
* Na początek będziemy patrzyć na zadanie regresji, a następnie na zadanie klasyfikacji
* Przetestujemy po 3 architektury sieci dla każdego problemu - jedna warstwa, dwie warstwy i trzy warstwy. Aby przeskalować jeszcze bardziej te sieci pierwsza architektura będzie miała 8 neuronów w warstwie ukrytej, druga 16 a ostatnia 32.
* Jako learning rate wybiorę według mnie optymalne `alpha` dla danego zbioru 

In [4]:
epochs_range = np.array([10, 50, 100, 200, 300, 500, 800, 1000])

In [68]:
train_df = pd.read_csv('../data/steps-large-training.csv', index_col=0)
test_df = pd.read_csv('../data/steps-large-test.csv', index_col=0)

x = np.asarray(train_df['x']).reshape(-1, 1)
y = np.asarray(train_df['y']).reshape(-1, 1)
x_test = np.asarray(test_df['x']).reshape(-1,1)

In [69]:
mean_results_l = {}
mean_results_s = {}
mean_results_t = {}
mean_results_r = {}


architectures = {
    "1":[1,8,1],
    "2":[1,16,16,1],
    "3":[1,32,32,32,1]
}

alpha = 0.0001
num_epochs = 1000

for architecture in ["1","2","3"]:
    mean_results_l[architecture] = []
    mean_results_s[architecture] = []
    mean_results_t[architecture] = []
    mean_results_r[architecture] = []
    
    layers = architectures[architecture]
        
    results_l = []
    results_s = []
    results_t = []
    results_r = []



    weights, biases = generate_weights_and_biases(layers, -1, 1)

    mlp_l = MLP(layers, weights, biases, output_function='linear', activation = 'linear')
    mlp_s = MLP(layers, weights, biases, output_function='linear', activation = 'sigmoid')
    mlp_t = MLP(layers, weights, biases, output_function='linear', activation = 'tanh')
    mlp_r = MLP(layers, weights, biases, output_function='linear', activation = 'relu')

    mlp_l.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_s.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_t.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_r.train(x, y, epochs = num_epochs, alpha=alpha)

    predictions_l = mlp_l.forward(x_test)
    predictions_s = mlp_s.forward(x_test)
    predictions_t = mlp_t.forward(x_test)
    predictions_r = mlp_r.forward(x_test)

    results_l.append(metrics.mean_absolute_error(test_df['y'], predictions_l))
    results_s.append(metrics.mean_absolute_error(test_df['y'], predictions_s))
    results_t.append(metrics.mean_absolute_error(test_df['y'], predictions_t))
    results_r.append(metrics.mean_absolute_error(test_df['y'], predictions_r))

    mean_results_l[architecture].append(np.array(results_l).mean())
    mean_results_s[architecture].append(np.array(results_s).mean())
    mean_results_t[architecture].append(np.array(results_t).mean())
    mean_results_r[architecture].append(np.array(results_r).mean())
    
            

Custom error in epoch 263 = 17.792111729904356

KeyboardInterrupt: 

In [45]:
print(f'1 hidden layer  with  8 neurons:\nlinear:{round(mean_results_l["1"][0], 3)},\tsigmoid:{round(mean_results_s["1"][0], 3)},\ttanh:{round(mean_results_t["1"][0], 3)},\trelu:{round(mean_results_r["1"][0], 3)}')
print(f'2 hidden layers with 16 neurons:\nlinear:{round(mean_results_l["2"][0], 3)},\tsigmoid:{round(mean_results_s["2"][0], 3)},\ttanh:{round(mean_results_t["2"][0], 3)},\trelu:{round(mean_results_r["2"][0], 3)}')
print(f'3 hidden layers with 32 neurons:\nlinear:{round(mean_results_l["3"][0], 3)},\tsigmoid:{round(mean_results_s["3"][0], 3)},\ttanh:{round(mean_results_t["3"][0], 3)},\trelu:{round(mean_results_r["3"][0], 3)}')

1 hidden layer  with  8 neurons:
linear:20.002,	sigmoid:4.563,	tanh:3.104,	relu:16.582
2 hidden layers with 16 neurons:
linear:20.004,	sigmoid:1.444,	tanh:0.567,	relu:2.344
3 hidden layers with 32 neurons:
linear:20.069,	sigmoid:0.756,	tanh:0.485,	relu:0.575


In [81]:
%%time
train_df = pd.read_csv('../data/multimodal-large-training.csv', index_col=0)
test_df = pd.read_csv('../data/multimodal-large-test.csv', index_col=0)

x = np.asarray(train_df['x']).reshape(-1, 1)
y = np.asarray(train_df['y']).reshape(-1, 1)
x_test = np.asarray(test_df['x']).reshape(-1,1)

mean_results_l = {}
mean_results_s = {}
mean_results_t = {}
mean_results_r = {}


architectures = {
    "1":[1,8,1],
    "2":[1,16,16,1],
    "3":[1,32,32,32,1]
}

alpha = 0.0001
num_epochs = 1000

for architecture in ["1","2","3"]:
    mean_results_l[architecture] = []
    mean_results_s[architecture] = []
    mean_results_t[architecture] = []
    mean_results_r[architecture] = []
    
    layers = architectures[architecture]
    

    results_l = []
    results_s = []
    results_t = []
    results_r = []

    weights, biases = generate_weights_and_biases(layers, -1, 1)

    mlp_l = MLP(layers, weights, biases, output_function='linear', activation = 'linear')
    mlp_s = MLP(layers, weights, biases, output_function='linear', activation = 'sigmoid')
    mlp_t = MLP(layers, weights, biases, output_function='linear', activation = 'tanh')
    mlp_r = MLP(layers, weights, biases, output_function='linear', activation = 'relu')

    mlp_l.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_s.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_t.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_r.train(x, y, epochs = num_epochs, alpha=alpha)

    predictions_l = mlp_l.forward(x_test)
    predictions_s = mlp_s.forward(x_test)
    predictions_t = mlp_t.forward(x_test)
    predictions_r = mlp_r.forward(x_test)

    results_l.append(metrics.mean_absolute_error(test_df['y'], predictions_l))
    results_s.append(metrics.mean_absolute_error(test_df['y'], predictions_s))
    results_t.append(metrics.mean_absolute_error(test_df['y'], predictions_t))
    results_r.append(metrics.mean_absolute_error(test_df['y'], predictions_r))

    mean_results_l[architecture].append(np.array(results_l).mean())
    mean_results_s[architecture].append(np.array(results_s).mean())
    mean_results_t[architecture].append(np.array(results_t).mean())
    mean_results_r[architecture].append(np.array(results_r).mean())



CPU times: user 14min 15s, sys: 2.15 s, total: 14min 17s
Wall time: 14min 15s


In [82]:
print(f'1 hidden layer  with  8 neurons:\nlinear:{round(mean_results_l["1"][0], 3)},\tsigmoid:{round(mean_results_s["1"][0], 3)},\ttanh:{round(mean_results_t["1"][0], 3)},\trelu:{round(mean_results_r["1"][0], 3)}')
print(f'2 hidden layers with 16 neurons:\nlinear:{round(mean_results_l["2"][0], 3)},\tsigmoid:{round(mean_results_s["2"][0], 3)},\ttanh:{round(mean_results_t["2"][0], 3)},\trelu:{round(mean_results_r["2"][0], 3)}')
print(f'3 hidden layers with 32 neurons:\nlinear:{round(mean_results_l["3"][0], 3)},\tsigmoid:{round(mean_results_s["3"][0], 3)},\ttanh:{round(mean_results_t["3"][0], 3)},\trelu:{round(mean_results_r["3"][0], 3)}')

1 hidden layer  with  8 neurons:
linear:20.025,	sigmoid:4.174,	tanh:3.21,	relu:15.486
2 hidden layers with 16 neurons:
linear:20.019,	sigmoid:1.812,	tanh:0.832,	relu:4.413
3 hidden layers with 32 neurons:
linear:20.016,	sigmoid:0.985,	tanh:0.856,	relu:0.737


In [85]:
%%time
train_df = pd.read_csv('../data/rings3-regular-training.csv', index_col=0).reset_index()
test_df = pd.read_csv('../data/rings3-regular-test.csv', index_col=0).reset_index()

x = np.asarray(train_df.iloc[:,0:2])
x = (x - np.mean(x, axis = 0))/np.std(x, axis = 0)

x_test = np.asarray(test_df.iloc[:,0:2])
x_test = (x_test - np.mean(x_test, axis = 0))/np.std(x_test, axis = 0)

y = np.asarray(train_df.iloc[:,2]).reshape(-1, 1)

mean_results_l = {}
mean_results_s = {}
mean_results_t = {}
mean_results_r = {}


architectures = {
    "1":[2,8,3],
    "2":[2,16,16,3],
    "3":[2,32,32,32,3]
}

alpha = 0.01
num_epochs = 1000

for architecture in ["1","2","3"]:
    mean_results_l[architecture] = []
    mean_results_s[architecture] = []
    mean_results_t[architecture] = []
    mean_results_r[architecture] = []
    
    layers = architectures[architecture]
            
    results_l = []
    results_s = []
    results_t = []
    results_r = []

    weights, biases = generate_weights_and_biases(layers, -1, 1)

    mlp_l = MLP(layers, weights, biases, output_function='softmax', activation = 'linear')
    mlp_s = MLP(layers, weights, biases, output_function='softmax', activation = 'sigmoid')
    mlp_t = MLP(layers, weights, biases, output_function='softmax', activation = 'tanh')
    mlp_r = MLP(layers, weights, biases, output_function='softmax', activation = 'relu')

    mlp_l.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_s.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_t.train(x, y, epochs = num_epochs, alpha=alpha)
    mlp_r.train(x, y, epochs = num_epochs, alpha=alpha)

    predictions_l = np.argmax(mlp_l.forward(x_test),axis = 1)
    predictions_s = np.argmax(mlp_s.forward(x_test),axis = 1)
    predictions_t = np.argmax(mlp_t.forward(x_test),axis = 1)
    predictions_r = np.argmax(mlp_r.forward(x_test),axis = 1)

    results_l.append(metrics.accuracy_score(test_df['c'], predictions_l))
    results_s.append(metrics.accuracy_score(test_df['c'], predictions_s))
    results_t.append(metrics.accuracy_score(test_df['c'], predictions_t))
    results_r.append(metrics.accuracy_score(test_df['c'], predictions_r))

    mean_results_l[architecture].append(np.array(results_l).mean())
    mean_results_s[architecture].append(np.array(results_s).mean())
    mean_results_t[architecture].append(np.array(results_t).mean())
    mean_results_r[architecture].append(np.array(results_r).mean())
    
            

CPU times: user 6min 7s, sys: 2.22 s, total: 6min 9s45
Wall time: 6min 2s


In [86]:
print(f'1 hidden layer  with  8 neurons:\nlinear:{round(mean_results_l["1"][0], 3)},\tsigmoid:{round(mean_results_s["1"][0], 3)},\ttanh:{round(mean_results_t["1"][0], 3)},\trelu:{round(mean_results_r["1"][0], 3)}')
print(f'2 hidden layers with 16 neurons:\nlinear:{round(mean_results_l["2"][0], 3)},\tsigmoid:{round(mean_results_s["2"][0], 3)},\ttanh:{round(mean_results_t["2"][0], 3)},\trelu:{round(mean_results_r["2"][0], 3)}')
print(f'3 hidden layers with 32 neurons:\nlinear:{round(mean_results_l["3"][0], 3)},\tsigmoid:{round(mean_results_s["3"][0], 3)},\ttanh:{round(mean_results_t["3"][0], 3)},\trelu:{round(mean_results_r["3"][0], 3)}')

1 hidden layer  with  8 neurons:
linear:0.433,	sigmoid:0.616,	tanh:0.828,	relu:0.862
2 hidden layers with 16 neurons:
linear:0.414,	sigmoid:0.59,	tanh:0.921,	relu:0.915
3 hidden layers with 32 neurons:
linear:0.44,	sigmoid:0.82,	tanh:0.928,	relu:0.938


In [108]:
train_df = pd.read_csv('../data/rings5-regular-training.csv', index_col=0).reset_index()
test_df = pd.read_csv('../data/rings5-regular-test.csv', index_col=0).reset_index()

x = np.asarray(train_df.iloc[:,0:2])
x = (x - np.mean(x, axis = 0))/np.std(x, axis = 0)

x_test = np.asarray(test_df.iloc[:,0:2])
x_test = (x_test - np.mean(x_test, axis = 0))/np.std(x_test, axis = 0)

y = np.asarray(train_df.iloc[:,2]).reshape(-1, 1)

mean_results_l = {}
mean_results_s = {}
mean_results_t = {}
mean_results_r = {}


architectures = {
    "1":[2,8,5],
    "2":[2,16,16,5],
    "3":[2,32,32,32,5]
}

alpha = 0.001
num_epochs = 1000

for architecture in ["1","2","3"]:
    mean_results_l[architecture] = []
    mean_results_s[architecture] = []
    mean_results_t[architecture] = []
    mean_results_r[architecture] = []
    
    layers = architectures[architecture]
            
    results_l = []
    results_s = []
    results_t = []
    results_r = []

    weights, biases = generate_weights_and_biases(layers, -1, 1)

    mlp_l = MLP(layers, weights, biases, output_function='softmax', activation = 'linear', max_class = 4)
    mlp_s = MLP(layers, weights, biases, output_function='softmax', activation = 'sigmoid', max_class = 4)
    mlp_t = MLP(layers, weights, biases, output_function='softmax', activation = 'tanh', max_class = 4)
    mlp_r = MLP(layers, weights, biases, output_function='softmax', activation = 'relu', max_class = 4)

    mlp_l.train(x, y, epochs = num_epochs, alpha=alpha, batch_size = 100)
    mlp_s.train(x, y, epochs = num_epochs, alpha=alpha, batch_size = 100)
    mlp_t.train(x, y, epochs = num_epochs, alpha=alpha, batch_size = 100)
    mlp_r.train(x, y, epochs = num_epochs, alpha=alpha, batch_size = 100)

    predictions_l = np.argmax(mlp_l.forward(x_test),axis = 1)
    predictions_s = np.argmax(mlp_s.forward(x_test),axis = 1)
    predictions_t = np.argmax(mlp_t.forward(x_test),axis = 1)
    predictions_r = np.argmax(mlp_r.forward(x_test),axis = 1)

    results_l.append(metrics.accuracy_score(test_df['c'], predictions_l))
    results_s.append(metrics.accuracy_score(test_df['c'], predictions_s))
    results_t.append(metrics.accuracy_score(test_df['c'], predictions_t))
    results_r.append(metrics.accuracy_score(test_df['c'], predictions_r))

    mean_results_l[architecture].append(np.array(results_l).mean())
    mean_results_s[architecture].append(np.array(results_s).mean())
    mean_results_t[architecture].append(np.array(results_t).mean())
    mean_results_r[architecture].append(np.array(results_r).mean())
    

Cross entropy loss in epoch 999 = 15.034523475532992

In [109]:
print(f'1 hidden layer  with  8 neurons:\nlinear:{round(mean_results_l["1"][0], 3)},\tsigmoid:{round(mean_results_s["1"][0], 3)},\ttanh:{round(mean_results_t["1"][0], 3)},\trelu:{round(mean_results_r["1"][0], 3)}')
print(f'2 hidden layers with 16 neurons:\nlinear:{round(mean_results_l["2"][0], 3)},\tsigmoid:{round(mean_results_s["2"][0], 3)},\ttanh:{round(mean_results_t["2"][0], 3)},\trelu:{round(mean_results_r["2"][0], 3)}')
print(f'3 hidden layers with 32 neurons:\nlinear:{round(mean_results_l["3"][0], 3)},\tsigmoid:{round(mean_results_s["3"][0], 3)},\ttanh:{round(mean_results_t["3"][0], 3)},\trelu:{round(mean_results_r["3"][0], 3)}')

1 hidden layer  with  8 neurons:
linear:0.334,	sigmoid:0.244,	tanh:0.362,	relu:0.214
2 hidden layers with 16 neurons:
linear:0.342,	sigmoid:0.218,	tanh:0.555,	relu:0.35
3 hidden layers with 32 neurons:
linear:0.414,	sigmoid:0.335,	tanh:0.565,	relu:0.449


# Wnioski

Najlepsze funkcje aktywacji to `tanh` i `relu`. Dla większych sieci zazwyczaj lepiej sprawdza się `relu`, aczkolwiek w ostatnim zbiorze, który okazał się problematyczny dla architektur najlepsze był `tanh`. Z tego co zauważyłem podczas treningu, to `relu` zbiegało w bardziej równomiernym tempie, być może skorzystałoby z większej ilości `epoch`.  