In [140]:
import random
import numpy as np

In [141]:
def sigmoid(z):
    print('z', z)
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    return sigmoid(z)*(1-sigmoid(z))

In [168]:
class Network(object):
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = []
        for i in sizes[1:]:
            bias = []
            for j in range(i):
                bias.append(random.normalvariate(0,1)) # using a guassian distribution with mean 0 and variance 1
            self.biases.append(bias)
        self.weights = []
        weight = []
        for x, y in zip(sizes[:-1], sizes[1:]):
            row = []
            for i in range(y):
                col = []
                for j in range(x):
                    col.append(random.normalvariate(0, 1))
                row.append(col)
            self.weights.append(row)
            
    def feedforward(self, a):
        for b, w in zip(self.biases, self.weight):
            # load storage for activation
            activations = []
            for i in range(len(b)):
                activation = []
                for j in range(len(a[0][:])):
                    activation.append(0)
                activations.append(activation)
            
            # 
            for i, w_col in enumerate(w):
                for a_col in range(len(a[0][:])):
                    plus_sum = 0
                    for a_row in range(len(w_col)):
                        plus_sum += w_col[a_row]*a[a_row][a_col]
                        
                    activations[i][a_col] += plus_sum
                    
            for i in range(len(activations[:])):
                for j in range(len(activations[0][:])):
                    activations[i][j] = sigmoid(activations[i][j])
                    
            return activations
    
    def SGD(self, training_data, epoches, mini_batch_size, eta):
        for j in range(epoches):
            random.shuffle(training_data)
            for mini_batch in training_data:
                self.update_mini_batch(mini_batch, eta)
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        
        print('mini_batch:', mini_batch)
        
        # load storage for nabla_b nabla_w
        nabla_b = []
        nabla_w = []
        for b in self.biases:
            bias = []
            for i in range(len(b)):
                bias.append(0)
            nabla_b.append(bias)
        for w in self.weights:
            row = []
            for w_row in range(len(w)):
                col = []
                for w_col in range(len(w[0])):
                    col.append(0)
                row.append(col)
            nabla_w.append(row)
            
        # bp compute 
        x = mini_batch[0]
        y = mini_batch[1]
        
        print('update mini batch nabla_b', nabla_b)
        print('update mini batch nabla_w', nabla_w)
        delta_nabla_b, delta_nabla_w = self.backprop(x, y)
        for i, b in enumerate(delta_nabla_b):
            for j in range(len(b)):
                nabla_b[i][j] += delta_nabla_b[i][j]
                    
        for i, delta_w in enumerate(delta_nabla_w):
            for j in range(len(delta_w)):
                for k in range(len(delta_w[0])):
                    nabla_w[i][j][k] += delta_w[j][k]
        
        # update w and b
        for i, b in enumerate(delta_nabla_b):
            for j in range(len(b)):
                self.biases[i][j] = self.biases[i][j] - (eta)*delta_nabla_b[i][j]
                
        for i, delta_w in enumerate(delta_nabla_w):
            for j in range(len(delta_w)):
                for k in range(len(delta_w[0])):
                    self.weights[i][j][k] = self.weights[i][j][k] - (eta)*delta_nabla_w[i][j][k]
        
    
    def backprop(self, x, y):
        # load storage for nabla_b nabla_w
        
        print('x', x)
        print('y', y)
        
        nabla_b = []
        nabla_w = []
        for b in self.biases:
            bias = []
            for i in range(len(b)):
                bias.append(0)
            nabla_b.append(bias)
        for w in self.weights:
            row = []
            for w_row in range(len(w)):
                col = []
                for w_col in range(len(w[0])):
                    col.append(0)
                row.append(col)
            nabla_w.append(row)
        
        print('nabla_b:', nabla_b)
        print('nabla_w:', nabla_w)
        
        
        # feedforward
        activation = x # 
        activations = [x] # 
        zs = [] # all of z list
        
        print('activation', activation)
        print('activations', activations)
        print('zs', zs)
        
        for b, w in zip(self.biases, self.weights):
            z = []
            acti = []
            for w_row in range(len(w)):
                z_sum = 0
                for w_col in range(len(w[0])):
                    z_sum += w[w_row][w_col]*activation[w_col]  
                    
                acti.append(sigmoid(z_sum))
                z.append(z_sum)
            activations.append(acti)
            activation = acti
            zs.append(z)
        
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1][0])
        nabla_b[-1][0] = delta
        for i in range(len(activations[-2])):
            print('delta:', delta)
            print('activations', activations[-2][i])
            nabla_w[-1][0][i] = delta*activations[-2][i]
            
        print('activations:',activations)
        print('zs:', zs)
        
        activation = activations[0]
        z = zs[0]
        
        for w_row in range(len(nabla_w[0])):
            sp = sigmoid_prime(z[w_row])
            delta = delta*w[-1][w_row]*sp
            nabla_b[0][w_row] = delta
            for w_col in range(len(nabla_w[0][0])):
                nabla_w[0][w_row][w_col] = delta * activation[w_col]
        
        print('nabla_b', nabla_b)
        print('nabla_w', nabla_w)
        
        return nabla_b,nabla_w        
        
        
    def cost_derivative(self, output_activations, y):
        print(output_activations, y)
        return output_activations[0] - y[0]
            
        

In [169]:
network = Network([2,3,1])
print('num_lyers', network.num_layers)
print('sizes', network.sizes)
print('biases', network.biases)
print('weight', network.weights)

num_lyers 3
sizes [2, 3, 1]
biases [[0.6393612066450781, -1.2260215145435769, -0.16736575038717155], [0.05502540330479216]]
weight [[[-0.7549925509029479, 0.9508715882739265], [-0.5837652477495254, -0.8250155845426961], [-2.400216839962009, -0.2769280655639904]], [[-1.3990625734394588, -0.3693009708193786, -0.03230296079620962]]]


In [170]:
network.SGD([[[2,3], [1]], [[1,2], [5]], [[2,1], [4]]], 10, 2, 0.0001)

mini_batch: [[2, 1], [4]]
update mini batch nabla_b [[0, 0, 0], [0]]
update mini batch nabla_w [[[0, 0], [0, 0], [0, 0]], [[0, 0, 0]]]
x [2, 1]
y [4]
nabla_b: [[0, 0, 0], [0]]
nabla_w: [[[0, 0], [0, 0], [0, 0]], [[0, 0, 0]]]
activation [2, 1]
activations [[2, 1]]
zs []
z -0.5591135135319694
z -1.992546080041747
z -5.077361745488009
z -0.553424449116635
[0.36507027617373344] [4]
z -0.553424449116635
z -0.553424449116635
delta: -0.8425547900051086
activations 0.363752600382779
delta: -0.8425547900051086
activations 0.11998775987804186
delta: -0.8425547900051086
activations 0.006197689483288214
activations: [[2, 1], [0.363752600382779, 0.11998775987804186, 0.006197689483288214], [0.36507027617373344]]
zs: [[-0.5591135135319694, -1.992546080041747, -5.077361745488009], [-0.553424449116635]]
z -0.5591135135319694
z -0.5591135135319694
z -1.992546080041747
z -1.992546080041747
z -5.077361745488009
z -5.077361745488009
nabla_b [[0.27281448029730687, -0.010638331649029236, 2.1166335278965665e-