In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import expit

In [None]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``.

    Accepts anything ``np.maximum`` accepts (scalars or arrays) and returns
    the result of ``np.maximum(0, x)``.
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified


def der_expit(x=None, act=None):
    """Derivative of the logistic sigmoid (``scipy.special.expit``).

    Exactly one of the two arguments must be supplied:

    x   -- pre-activation value(s) ``z``; the sigmoid is evaluated here.
    act -- already computed activation value(s) ``expit(z)``.

    Returns ``a * (1 - a)`` with ``a = expit(x)`` or ``a = act``.

    Raises ValueError if both or neither of ``x`` and ``act`` are given.
    (Previously the "neither" case failed with a TypeError from ``None``
    arithmetic instead of the intended ValueError.)
    """
    if (x is None) == (act is None):
        raise ValueError('must include either x or act')
    if act is None:
        # Evaluate the sigmoid once and reuse it for both factors.
        act = expit(x)
    return act*(1-act)

def der_tanh(x = None, act = None):
    """Derivative of the hyperbolic tangent.

    Exactly one of the two arguments must be supplied:

    x   -- pre-activation value(s) ``z``; ``np.tanh`` is evaluated here.
    act -- already computed activation value(s) ``tanh(z)``.

    Returns ``1 - a**2`` with ``a = np.tanh(x)`` or ``a = act``.

    Raises ValueError if both or neither of ``x`` and ``act`` are given.
    (Previously the "neither" case failed with a TypeError from ``None``
    arithmetic instead of the intended ValueError.)
    """
    if (x is None) == (act is None):
        raise ValueError('must include either x or act')
    if act is None:
        act = np.tanh(x)
    return 1 - act**2


# Lookup tables keyed by activation name: the activation function itself and
# the matching derivative helper.
activation_choices = dict(sigmoid=expit, tanh=np.tanh)
act_derivatives = dict(sigmoid=der_expit, tanh=der_tanh)

# Activation selected for the network in this notebook.
activation = 'tanh'
act_func, act_deriv = activation_choices[activation], act_derivatives[activation]

In [None]:
def gen_data(n):
    """Draw ``n`` random spins (+1 or -1) and return their outer product.

    The result is an ``(n, n)`` ``int8`` array whose entry ``[i, j]`` is
    ``spins[i] * spins[j]``. Uses the global ``np.random`` state.
    """
    spin_values = np.array([-1, 1], dtype=np.int8)
    drawn = np.random.choice(spin_values, size=n)
    pairwise = np.einsum('i,j->ij', drawn, drawn, dtype=np.int8)
    return pairwise


# Seed the global NumPy RNG so the generated samples (and everything drawn
# from np.random afterwards) are the same on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

L = 40     # number of spins per sample; each sample is an L x L matrix
n = L*L    # number of entries in one sample matrix
N = 100    # number of samples to generate

data_input = [gen_data(L) for _ in range(N)]
print(data_input[0].size)
# Target of each sample: the sum over all entries of its matrix.
data_targets = [np.sum(inp) for inp in data_input]

In [None]:
data_targets[:10]  # preview only; the full list has one entry per sample

In [None]:
from sklearn.model_selection import train_test_split

# train_test_split returns, for each array passed in, its train part followed
# by its test part: (inputs_train, inputs_test, targets_train, targets_test).
# The targets were previously unpacked in the opposite order, which paired the
# training inputs with the held-out targets.
train_inp, test_inp, train_targets, test_targets = train_test_split(data_input, data_targets, test_size = 0.33)
assert len(train_inp) == len(train_targets) and len(test_inp) == len(test_targets)
len(train_inp),len(train_targets),len(test_inp),len(test_targets)


In [None]:
def forward(inp):
    """Run one sample through the network and record every layer's values.

    Reads the module-level ``n_nodes``, ``weights`` and ``act_func``.
    The activation function is applied to the raw input first. Then, for
    each of the ``len(n_nodes) - 1`` layers, the current activation is
    flattened, extended with a constant 1 (so the last column of
    ``weights[k]`` acts as the bias) and multiplied by ``weights[k]``.

    Returns ``(outs, activations)``: two lists with one entry per layer,
    holding the pre-activation values ``z`` and ``act_func(z)`` respectively.
    The activation of the input itself is not stored.
    """
    pre_activations = []
    post_activations = []
    current = act_func(inp)

    for layer in range(len(n_nodes) - 1):
        with_bias = np.append(current, 1)  # np.append flattens and adds the bias input
        z = weights[layer] @ with_bias
        current = act_func(z)
        pre_activations.append(z)
        post_activations.append(current)

    return pre_activations, post_activations
    
# Layer sizes: input, hidden, ..., hidden, output.
# forward() flattens each sample before the first matrix product, so the input
# layer needs one node per entry of a sample. The former hard-coded 20 did not
# match the samples and made `weights[0] @ ...` fail with a shape error.
n_nodes = [np.size(train_inp[0]), 10, 5, 1]
N_layers = len(n_nodes) - 1  # number of weight matrices (3 for this layout)

small = 1e-2  # scale of the random initial parameters
# weights[l] has shape (n_out, n_in + 1): the extra last column is the bias,
# multiplied by the constant 1 that forward() appends to each activation.
weights = [small*np.random.random(size = (n_out, n_in+1)) for n_out, n_in in zip(n_nodes[1:], n_nodes[:-1])]
# Not read by forward() or by the loop below (the bias lives in `weights`);
# kept so the name stays defined.
biases = [small*np.random.random(size = n_out) for n_out in n_nodes[1:]]

eta = 1e-2  # learning rate
# Index of the output layer in weights / outs / activations. Uses its own name
# so the global `L` (side length of the sample matrices) is not overwritten.
last = len(n_nodes) - 2

for inp, target in zip(train_inp, train_targets):
    outs, activations = forward(inp)

    # Output-layer delta: f'(z^L) * dC/da^L, with dC/da^L = a^L - t.
    deltas = [np.zeros(size) for size in n_nodes[1:]]
    deltas[last] = act_deriv(outs[-1])*(activations[-1] - target)

    # Propagate the deltas backwards; the bias column carries no delta to the
    # previous layer, hence the [:, :-1] slice.
    for l in range(last - 1, -1, -1):
        deltas[l] = np.einsum('k,kj,j->j', deltas[l+1], weights[l+1][:,:-1], act_deriv(outs[l]))

    # Gradient step for every layer. The first layer used to be skipped, so
    # weights[0] was never trained. Its input is the activated, flattened
    # sample, which forward() computes but does not return.
    input_act = np.ravel(act_func(inp))
    for l in range(last, -1, -1):
        prev_act = activations[l-1] if l > 0 else input_act
        weights[l] = weights[l] - eta * np.outer(deltas[l], np.append(prev_act, 1))

Find the delta of the last (output) layer $L$:
$$ \delta_j^L = f'(z_j^L) \frac{\partial \mathcal{C}}{\partial a_j^L} $$

Propagate the deltas backwards through each earlier layer $l$:
$$ \delta_j^l = \sum_k \delta_k^{l+1} w_{kj}^{l+1} f'(z_j^l)$$

Update the weights and biases with learning rate $\eta$:
$$w_{jk}^l \leftarrow w_{jk}^l - \eta \delta_j^l a_k^{l-1} $$
$$b_{j}^l \leftarrow b_{j}^l - \eta \delta_j^l  $$



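For the quadratic cost $\mathcal{C} = \frac{1}{2}\sum_j (a_j^L - t_j)^2$ we have

$$ \frac{\partial \mathcal{C}}{\partial a_j^L} = a_j^L - t_j, $$
and, for the sigmoid activation,
$$ f'(z_j^l) = f(z_j^l) \left(1 - f(z_j^l)\right), $$
while for the $\tanh$ activation $f'(z_j^l) = 1 - f(z_j^l)^2$.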

In [None]:
# NOTE(review): leftover interactive help lookup for expit; consider removing before sharing.
?expit

In [None]:
# NOTE(review): bare expression left over from exploration; it is not used by any cell shown here.
np.c_