In [None]:
import numpy as np
import random
from typing import NoReturn

class Network(object):

    def __init__(self, sizes: list) -> NoReturn:
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    
    def feedforward(self, a: np.ndarray) -> np.ndarray:
        for b, w in zip(self.biases, self.weights):
            a = self.sigmoid(np.dot(w, a)+b)
        return a

    
    # Sigmoid activation function
    def sigmoid(z: np.ndarray) -> np.ndarray:
        return 1.0/(1.0+np.exp(-z))
    

    def SGD(self, training_data: list, epochs: int, mini_batch_size: int, eta: float, test_data: list = None) -> NoReturn:
        if test_data: n_test = len(test_data)
        n = len(training_data)

        if test_data:
            test_data = list(test_data)
            n_test = len(test_data)

        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print(f"Epoch {j}: {self.evaluate(test_data)} / {n_test}")
            else:
                print(f"Epoch {j} complete")


    def backprop(
            self,
            x: np.ndarray,
            y: np.ndarray
    ) -> tuple:
        """
        Retorna uma tupla (nabla_b, nabla_w) representando o
        gradiente para a função de custo C_x. nabla_b e nabla_w
        são listas de arrays numpy, semelhantes a self.biases
        e self.weights.

        Args:
            x (np.ndarray): Vetor de entrada
            y (np.ndarray): Vetor de saída
        
        Returns:
            tuple: Tupla contendo os vetores de bias e pesos.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Feedforward
        activation = x

        # Lista para armazenar todas as ativações, camada por camada
        activations = [x]

        # Lista para armazenar todos os vetores z, camada por camada
        zs = []

        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)

            activation = self.sigmoid(z)
            activations.append(activation)

        # Backward pass
        delta = self.cost_derivative(activations[-1], y) * self.sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        # Aqui, l = 1 significa a última camada de neurônios, l = 2 é a segunda e assim por diante.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = self.sigmoid_prime(z)

            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        
        return (nabla_b, nabla_w)

    
    def update_mini_batch(self, mini_batch, eta):
        nabla_b = [np.zeros(b.shape) for b in self.biases] # Inicializa os vetores de bias
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] # Atualiza os vetores de bias
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        
        self.weights = [w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]