In [229]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [508]:
data = np.loadtxt(open('cercles.txt','r'))
x = np.matrix(data[:,:-1]).T # here x is row vector, n*d, so w1x+b （dh*d*d*1+dh*1）-->> x*w1T+b(1*d*d*dh+1*dh)
y = np.matrix(data[:,-1])

In [333]:
print(np.shape(x))
print(np.shape(x[:,0]))
print(np.shape(y))

(2, 1100)
(2, 1)
(1, 1100)


## 1. As a beginning, start with an implementation that computes the gradients for a single example, and check that the gradient is correct using the finite difference method described above.

### onehot, rect, softmax

In [465]:
def onehot(m,y): # y can not be a number
    result = []
    y = y.T
    for i in range(len(y)):
        result.append(np.eye(m)[int(y[i])])
    result = np.matrix(result).T
    return result

def rect(x): # x must be a matrix
    result=[]
    for i in range(np.shape(x)[1]):
        temp=[]
        for j in range(np.shape(x)[0]):
            temp.append(max(0,x[j,i]))
        result.append(temp)
    result = np.matrix(result).T
    return result

def softmax(x): # x must be a matrix
    result = []
    m,n = np.shape(x)
    for i in range(n):
        temp = []
        sum = np.sum(np.exp(x[:,i]))
#        print(sum)
        for j in range(m):
            temp.append(np.exp(x[j,i])/sum)
        result.append(temp)
    result = np.matrix(result).T
    return result 

 

### w and b

In [335]:
#initial the parameters

dh = 2
d = 2
m = 2

w1 = np.matrix((np.random.uniform(-1/d**0.5,1/d**0.5,d*dh)).reshape(dh,d))
print('w1:\n',w1)
b1 = np.matrix([0]*dh).T
w2 = np.matrix((np.random.uniform(-1/dh*0.5,1/dh*0.5,dh*m)).reshape(m,dh))
print('w2:\n',w2)
b2 = np.matrix([0]*m).T

w1:
 [[-0.67578067 -0.25028273]
 [ 0.58216531 -0.52019275]]
w2:
 [[-0.10792255 -0.04123022]
 [ 0.03137159 -0.23308715]]


### fprop

In [653]:
#calculate x,ha,hs,oa,os

def fprop(x,y,w1,b1,w2,b2):
#    mean = np.mean(x)
#    s = np.std(x)
#    x = (x-mean)/s
    ha = w1*x+b1
#    print(w1,'\n',x[:,0])
#    print('ha:\n',np.shape(ha))
    hs = rect(ha)
#    print('hs:\n',np.shape(hs))
    oa = w2*hs+b2
#    print('oa:\n',np.shape(oa))
    os = softmax(oa)
#    print('os:\n',np.shape(os))
    L = -np.log(os[int(y)])
    return(ha,hs,oa,os,L)



In [665]:
ha,hs,oa,os,L=fprop(x[:,0],y[0,0],w1,b1,w2,b2)
print('ha:',ha,'\nhs:',hs,'\noa:',oa,'\nos:',os,'\nlabel:',y[0,0],'\nL:',L)

ha: [[-0.57558934]
 [ 0.26053306]] 
hs: [[0.        ]
 [0.26053306]] 
oa: [[-0.01074184]
 [-0.06072691]] 
os: [[0.51249367]
 [0.48750633]] 
label: 1.0 
L: [[0.718452]]


### bprop

In [844]:
def bprop(m,x,y,w1,b1,w2,b2):
    ha = fprop(x,y,w1,b1,w2,b2)[0]
    hs = fprop(x,y,w1,b1,w2,b2)[1]
    os = fprop(x,y,w1,b1,w2,b2)[3]
    grad_oa = os-onehot(m,y)
    grad_w2 = grad_oa*hs.T
    grad_b2 = grad_oa
    grad_hs = w2.T*grad_oa
    grad_ha=[]
    for i in range(dh):
        if (ha[i,0] <= 0) : grad_ha.append(0)
        else: grad_ha.append(grad_hs[i,0])
    grad_ha = np.matrix(grad_ha).T
    grad_w1 = grad_ha*x.T 
    grad_b1 = grad_ha
    return(grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1, grad_b1)



In [845]:
[goa,gw2,gb2,ghs,gha,gw1,gb1] = bprop(m,x[:,0],np.matrix(y[0,0]),w1,b1,w2,b2)
print('goa:',goa,'\ngw2:',gw2,'\ngb2:',gb2,'\ngw1:',gw1,'\ngb1:',gb1,'\ngha:',gha,'\nghs:',ghs)

goa: [[ 0.51249367]
 [-0.51249367]] 
gw2: [[ 0.          0.13352154]
 [-0.         -0.13352154]] 
gb2: [[ 0.51249367]
 [-0.51249367]] 
gw1: [[0.         0.        ]
 [0.0721014  0.03144585]] 
gb1: [[0.        ]
 [0.09832546]] 
gha: [[0.        ]
 [0.09832546]] 
ghs: [[-0.07138737]
 [ 0.09832546]]



## 2. Display the gradients for both methods (direct computation and finite difference) for a small network (e.g. d = 2 and dh = 2) with random weights and for a single example.

In [709]:
# verify for w2[0,0] 
# dL/d(w2[0,0])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[1][0,0]
# (fprop(x[:,0],y[0,0],w1,b1,w2',b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b22))/epsilon where w2'=w2+[[epsilon,0],[0,0]]
epsilon = (10)**(-5)
w2_new = np.zeros([2,2])
w2_new[0,0] = epsilon
w2_new += w2
diff_gw2 = (fprop(x[:,0],y[0,0],w1,b1,w2_new,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_gw2 == gw2[0,0])

[[ True]]


In [718]:
# verify for w2[0,1] 
# dL/d(w2[0,1])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[1][0,1]
# (fprop(x[:,0],y[0,0],w1,b1,w2',b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where w2'=w2+[[0,epsilon],[0,0]]
w2_new = np.zeros([2,2])
w2_new[0,1] = epsilon
w2_new += w2
diff_gw2 = (fprop(x[:,0],y[0,0],w1,b1,w2_new,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_gw2/gw2[0,1] )

[[1.00000064]]


In [670]:
# verify for w2[1,0] 
# dL/d(w2[1,0])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[1][1,0]
# (fprop(x[:,0],y[0,0],w1,b1,w2',b2)-fprop(x,w1,b1,w2,b2))/epsilon where w2'=w2+[[0,0],[epsilon,0]]
w2_new = np.zeros([2,2])
w2_new[1,0] = epsilon
w2_new += w2
diff_gw2 = (fprop(x[:,0],y[0,0],w1,b1,w2_new,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_gw2 == gw2[1,0] )

[[ True]]


In [671]:
# verify for w2[1,1] 
# dL/d(w2[1,1])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[1][1,1]
# (fprop(x[:,0],y[0,0],w1,b1,w2',b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where w2'=w2+[[0,0],[0,epsilon]]
w2_new = np.zeros([2,2])
w2_new[1,1] = epsilon
w2_new += w2
diff_gw2 = (fprop(x[:,0],y[0,0],w1,b1,w2_new,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_gw2 / gw2[1,1] )

[[0.99999937]]


In [672]:
# verify for b2[0,0] 
# dL/d(b2[0,0])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[2][0,0]
# (fprop(x[:,0],y[0,0],w1,b1,w2,b2')-fprop(x,w1,b1,w2,b2))/epsilon where b2'=b2+[epsilon,0]
b2_new = np.zeros([1,2]).T
b2_new[0,0] = epsilon
b2_new += b2
diff_b2 = (fprop(x[:,0],y[0,0],w1,b1,w2,b2_new)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_b2 / gb2[0,0] )

[[1.00000244]]


In [673]:
# verify for b2[1,0] 
# dL/d(b2[1,0])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[2][1,0]
# (fprop(x[:,0],y[0,0],w1,b1,w2,b2')-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where b2'=b2+[0,epsilon]
b2_new = np.zeros([1,2]).T
b2_new[1,0] = epsilon
b2_new += b2
diff_b2 = (fprop(x[:,0],y[0,0],w1,b1,w2,b2_new)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_b2 / gb2[1,0] )

[[0.99999756]]


In [816]:
# verify for w1[0,0] 
# dL/d(w2[0,0])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[3][0,0]
# (fprop(x[:,0],y[0,0],w1',b1,w2,b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where w1'=w1+[[epsilon,0],[0,0]]
epsilon = (10)**(-5)
w1_new = np.zeros([2,2])
w1_new[0,0] = epsilon
w1_new += w1
diff_gw1 = (fprop(x[:,0],y[0,0],w1_new,b1,w2,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_gw1 == gw1[0,0] )

[[ True]]


In [817]:
# verify for w1[0,1] 
# dL/d(w1[0,1])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[3][0,1]
# (fprop(x[:,0],y[0,0],w1',b1,w2,b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where w1'=w1+[[0,epsilon],[0,0]]
epsilon = (10)**(-5)
w1_new = np.zeros([2,2])
w1_new[0,1] = epsilon
w1_new += w1
diff_gw1 = (fprop(x[:,0],y[0,0],w1_new,b1,w2,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_gw1 == gw1[0,1] )

[[ True]]


In [818]:
# verify for w1[1,0] 
# dL/d(w1[1,0])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[3][1,0]
# (fprop(x[:,0],y[0,0],w1',b1,w2,b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where w1'=w1+[[0,0],[epsilon,0]]
epsilon = (10)**(-5)
w1_new = np.zeros([2,2])
w1_new[1,0] = epsilon
w1_new += w1
diff_gw1 = (fprop(x[:,0],y[0,0],w1_new,b1,w2,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_gw1 / gw1[1,0] )

[[1.00000034]]


In [819]:
# verify for w1[1,1] 
# dL/d(w1[1,1])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[3][1,1]
# (fprop(x[:,0],y[0,0],w1',b1,w2,b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where w1'=w1+[[0,0],[0,epsilon]]
epsilon = (10)**(-5)
w1_new = np.zeros([2,2])
w1_new[1,1] = epsilon
w1_new += w1
diff_gw1 = (fprop(x[:,0],y[0,0],w1_new,b1,w2,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_gw1 / gw1[1,1] )

[[1.00000015]]


In [820]:
# verify for b1[0,0] 
# dL/d(b1[0,0])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[4][0,0]
# (fprop(x[:,0],y[0,0],w1,b1',w2,b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where b1'=b1+[epsilon,0]
b1_new = np.zeros([1,2]).T
b1_new[0,0] = epsilon
b1_new += b1
diff_b1 = (fprop(x[:,0],y[0,0],w1,b1_new,w2,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_b1 == gb1[0,0])

[[ True]]


In [821]:
# verify for b1[1,0] 
# dL/d(b1[0,0])=bprop(m,x[:,0],np.matrix(y[0,0]),os)[4][1,0]
# (fprop(x[:,0],y[0,0],w1,b1',w2,b2)-fprop(x[:,0],y[0,0],w1,b1,w2,b2))/epsilon where b1'=b1+[0,epsilon]
b1_new = np.zeros([1,2]).T
b1_new[1,0] = epsilon
b1_new += b1
diff_b1 = (fprop(x[:,0],y[0,0],w1,b1_new,w2,b2)[-1]-fprop(x[:,0],y[0,0],w1,b1,w2,b2)[-1])/epsilon
print (diff_b1 / gb1[1,0])

[[1.00000047]]


## 3. Add a hyperparameter for the minibatch size K to allow compute the gradients on a minibatch of K examples (in a matrix), by looping over the K examples (this is a small addition to your previous code).


In [848]:
'''
# fprop in part3.1

def fprop(x,y,w1,b1,w2,b2):

    ha = w1*x+b1
    hs = rect(ha)
    oa = w2*hs+b2
    os = softmax(oa)
    L = -np.log(os[int(y)])
    return(ha,hs,oa,os,L)
'''

def fprop_k_loop(x,y,w1,b1,w2,b2,k):
    sum_L = 0
    for i in k:
        sum_L += fprop(x[:,i],y[0,i],w1,b1,w2,b2)[-1]  
    return np.linalg.det(sum_L/len(k))

'''
# bprop in part3.1

def bprop(m,x,y,w1,b1,w2,b2):
    ha = fprop(x,y,w1,b1,w2,b2)[0]
    hs = fprop(x,y,w1,b1,w2,b2)[1]
    os = fprop(x,y,w1,b1,w2,b2)[3]
    grad_oa = os-onehot(m,y)
    grad_w2 = grad_oa*hs.T
    grad_b2 = grad_oa
    grad_hs = w2.T*grad_oa
    grad_ha=[]
    for i in range(dh):
        if (ha[i,0] <= 0) : grad_ha.append(0)
        else: grad_ha.append(grad_hs[i,0])
    grad_ha = np.matrix(grad_ha).T
    grad_w1 = grad_ha*x.T 
    grad_b1 = grad_ha
    return(grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1, grad_b1)



'''


def bprop_k_loop(m,x,y,w1,b1,w2,b2,k,dh):
    grad_oa = np.zeros([m,1])
    grad_w2 = np.zeros([m,dh])
    grad_b2 = np.zeros([m,1])
    grad_hs = np.zeros([dh,1])
    grad_ha = np.zeros([dh,1])
    grad_w1 = np.zeros([dh,d])
    grad_b1 = np.zeros([dh,1])
    for i in k :
        [grad_oai, grad_w2i, grad_b2i, grad_hsi, grad_hai, grad_w1i, grad_b1i]= bprop(m,x[:,i],np.matrix(y[0,i]),w1,b1,w2,b2)
        grad_oa += grad_oai
        grad_w2 += grad_w2i
        grad_b2 += grad_b2i
        grad_hs += grad_hsi
        grad_ha += grad_hai
        grad_w1 += grad_w1i
        grad_b1 += grad_b1i       
    grad_oa /= len(k)
    grad_w2 /= len(k)
    grad_b2 /= len(k)
    grad_hs /= len(k)
    grad_ha /= len(k)
    grad_w1 /= len(k)
    grad_b1 /= len(k)
    return (grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1,grad_b1)




## 4. Display the gradients for both methods (direct computation and finite difference) for a small network (e.g. d = 2 and dh = 2) with random weights and for a minibatch with 10 examples (you can use examples from both classes from the two circles dataset).

In [858]:
# verify for each parameter on a minibatch k = [0,1,2,...,10]
k=np.arange(10)
grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1,grad_b1 = bprop_k_loop(m,x,y,w1,b1,w2,b2,k,dh)
print('grad_oa:\n',grad_oa,'\ngrad_w2:\n',grad_w2,'\ngrad_b2:\n',grad_b2,'\ngrad_w1:\n',grad_w1,'\ngrad_b1:\n',grad_b1)

grad_oa:
 [[-0.19718862]
 [ 0.19718862]] 
grad_w2:
 [[-0.09208699 -0.0628827 ]
 [ 0.09208699  0.0628827 ]] 
grad_b2:
 [[-0.19718862]
 [ 0.19718862]] 
grad_w1:
 [[-0.01890475 -0.00020662]
 [-0.02551785 -0.00536556]] 
grad_b1:
 [[ 0.02175094]
 [-0.02705901]]


In [859]:
# verify for w2[0,0]
# dL/d(w2[0,0])= grad_w2[0,0] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1,w2',b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where w2'=w2+[[epsilon,0][0,0]]
epsilon = (10)**(-5)
w2_new = np.zeros([2,2])
w2_new[0,0] = epsilon
w2_new += w2
diff_gw2 = (fprop_k_loop(x,y,w1,b1,w2_new,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_gw2 / grad_w2[0,0])

0.9999985538216867


In [860]:
# verify for w2[0,1]
# dL/d(w2[0,1])= grad_w2[0,1] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1,w2',b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where w2'=w2+[[0,epsilon],[0,0]]
epsilon = (10)**(-5)
w2_new = np.zeros([2,2])
w2_new[0,1] = epsilon
w2_new += w2
diff_gw2 = (fprop_k_loop(x,y,w1,b1,w2_new,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_gw2 / grad_w2[0,1])

0.9999981057717863


In [861]:
# verify for w2[1,0]
# dL/d(w2[1,0])= grad_w2[1,0] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1,w2',b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where w2'=w2+[[0,0],[epsilon,0]
epsilon = (10)**(-5)
w2_new = np.zeros([2,2])
w2_new[1,0] = epsilon
w2_new += w2
diff_gw2 = (fprop_k_loop(x,y,w1,b1,w2_new,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_gw2 / grad_w2[1,0])

1.0000014461139695


In [862]:
# verify for w2[1,1]
# dL/d(w2[1,1])= grad_w2[0,1] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1,w2',b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where w2'=w2+[[0,0],[0,epsilon]]
epsilon = (10)**(-5)
w2_new = np.zeros([2,2])
w2_new[1,1] = epsilon
w2_new += w2
diff_gw2 = (fprop_k_loop(x,y,w1,b1,w2_new,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_gw2 / grad_w2[1,1])

1.000001894280663


In [863]:
# verify for b2[0,0]
# dL/d(b2[0,0])= grad_b2[0,0] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1,w2,b2',k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where b2'=b2+[[epsilon],[0]]
epsilon = (10)**(-5)
b2_new = np.zeros([2,1])
b2_new[0,0] = epsilon
b2_new += b2
diff_b2 = (fprop_k_loop(x,y,w1,b1,w2,b2_new,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_b2 / grad_b2[0,0])

0.9999936697076376


In [864]:
# verify for b2[1,0]
# dL/d(b2[1,0])= grad_b2[1,0] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1,w2,b2',k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where b2'=b2+[[0],[epsilon]]
epsilon = (10)**(-5)
b2_new = np.zeros([2,1])
b2_new[1,0] = epsilon
b2_new += b2
diff_b2 = (fprop_k_loop(x,y,w1,b1,w2,b2_new,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_b2 / grad_b2[1,0])

1.0000063303590017


In [865]:
# verify for w1[0,0]
# dL/d(w1[0,0])= grad_w1[0,0] we have got it in part3.3
# (fprop_k_loop(x,y,w1',b1,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where w1'=w1+[[epsilon,0],[0,0]]
epsilon = (10)**(-5)
w1_new = np.zeros([2,2])
w1_new[0,0] = epsilon
w1_new += w1
diff_gw1 = (fprop_k_loop(x,y,w1_new,b1,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_gw1 / grad_w1[0,0])

0.9999997059476512


In [866]:
# verify for w1[0,1]
# dL/d(w1[0,1])= grad_w1[0,1] we have got it in part3.3
# (fprop_k_loop(x,y,w1',b1,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where w1'=w1+[[0,epsilon],[0,0]]
epsilon = (10)**(-5)
w1_new = np.zeros([2,2])
w1_new[0,1] = epsilon
w1_new += w1
diff_gw1 = (fprop_k_loop(x,y,w1_new,b1,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_gw1 / grad_w1[0,1])

0.9999916654829426


In [868]:
# verify for w1[1,0]
# dL/d(w1[1,0])= grad_w1[1,0] we have got it in part3.3
# (fprop_k_loop(x,y,w1',b1,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where w1'=w1+[[0,0][epsilon,0]]
epsilon = (10)**(-5)
w1_new = np.zeros([2,2])
w1_new[1,0] = epsilon
w1_new += w1
diff_gw1 = (fprop_k_loop(x,y,w1_new,b1,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_gw1 / grad_w1[1,0])

0.9999993291806428


In [869]:
# verify for w2[1,1]
# dL/d(w2[1,1])= grad_w2[1,1] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1,w2',b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where w2'=w2+[[0,0][0,epsilon]]
epsilon = (10)**(-5)
w1_new = np.zeros([2,2])
w1_new[1,1] = epsilon
w1_new += w1
diff_gw1 = (fprop_k_loop(x,y,w1_new,b1,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_gw1 / grad_w1[1,1])

0.9999992191666146


In [870]:
# verify for b1[0,0]
# dL/d(b1[0,0])= grad_b1[0,0] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1',w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where b1'=b1+[[epsilon],[0]]
epsilon = (10)**(-5)
b1_new = np.zeros([2,1])
b1_new[0,0] = epsilon
b1_new += b1
diff_b1 = (fprop_k_loop(x,y,w1,b1_new,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_b1 / grad_b1[0,0])

1.0000003333662255


In [871]:
# verify for b1[1,0]
# dL/d(b1[1,0])= grad_b1[1,0] we have got it in part3.3
# (fprop_k_loop(x,y,w1,b1',w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon where b1'=b1+[[0],[epsilon]]
epsilon = (10)**(-5)
b1_new = np.zeros([2,1])
b1_new[1,0] = epsilon
b1_new += b1
diff_b1 = (fprop_k_loop(x,y,w1,b1_new,w2,b2,k)-fprop_k_loop(x,y,w1,b1,w2,b2,k))/epsilon
print (diff_b1 / grad_b1[1,0])

0.9999991512086401


## 5. Train your neural network using gradient descent on the two circles dataset. Plot the decision regions for several different values of the hyperparameters (weight decay, number of hidden units, early stopping) so as to illustrate their effect on the capacity of the model.

In [911]:
n = np.shape(x)[1]
inds = np.arange(n)
np.random.shuffle(inds)
train_inds = inds[:800]
test_inds = inds[800:]
x_train = x[:,train_inds]
y_train = y[0,train_inds]
x_test = x[:, test_inds]
y_test = y[0, test_inds]

In [1185]:
def L_theta(lamda,w1,w2,dh,m,d):
    if len(lamda) != 4 : return "Lamda is error!"
    else:
        w1=w1.reshape(1,dh*d)
        w11 = np.sum(abs(w1))
        w12 = np.linalg.det(w1*w1.T)
        w2=w2.reshape(1,dh*m)
        w21 = np.sum(abs(w2))
        w22 = np.linalg.det(w2*w2.T)
    W = [w11,w12,w21,w22]
    return np.linalg.det(lamda*np.matrix(W).T)

def train(x,y,w1,b1,w2,b2,num,lamda,dh,d,m,eta):
    m, n = np.shape(x)
    sum_loss = 0
    Rmin = 10000
    R = 0
    itera = 0
    while(itera < 100):
        for i in range(int(n/num)):
            k = np.arange(i*num,(i+1)*num)
            sum_loss = fprop_k_loop(x,y,w1,b1,w2,b2,k)    
        R = (1/(n/num))*sum_loss+L_theta(lamda,w1,w2,dh,m,d)
        if R < Rmin:
            Rmin = R
            w1min = w1
            w2min = w2
            b1min = b1
            b2min = b2
            for i in range(int(n/num)):
                k = np.arange(i*num,(i+1)*num)
                grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1,grad_b1 = bprop_k_loop(m,x,y,w1,b1,w2,b2,k,dh)
                w1 -= eta*grad_w1
                b1 -= eta*grad_b1
                w2 -= eta*grad_w2
                b2 -= eta*grad_b2
            itera += 1
        else: 
            break
    if itera == 10000 :
        print(' not finish yet...')
    return (Rmin, w1min, w2min, b1min, b2min)

In [1186]:
b1 = np.matrix([0.0]*dh).T
b2 = np.matrix([0.0]*dh).T
Rmin, w1min, w2min, b1min, b2min = train(x_train,y_train,w1,b1,w2,b2,20,lamda,dh,d,m,0.01)

In [1187]:
print(Rmin, w1min, w2min, b1min, b2min)

2.8184499918000228 [[-0.11729274  0.31165869]
 [-0.70208503 -0.27719362]] [[-0.15544452 -0.14049621]
 [-0.17804743 -0.14055413]] [[-0.00015906]
 [-0.00185901]] [[ 0.0129351]
 [-0.0129351]]


## 6. As a second step, copy your existing implementation to modify it to a new implementation that will use matrix calculus (instead of a loop) on batches of size K to improve efficiency. Take the matrix expressions in numpy derived in the first part, and adapt them for a minibatch of size K. Show in your report what you have modified (describe the former and new expressions with the shapes of each matrices).

In [945]:
def prol_k(x,k): #x is a matrix of 1 row, then prolong x to a matrix of k row, each row equal to x
    result=[]
    for i in range(k):
        temp=[]
        for j in range(np.shape(x)[1]):  
            temp.append(x[0,j])
        result.append(temp)
    result=np.matrix(result)
    return result

In [948]:
prol_k(hs.T,10) # hs.T  1*2 -->> 10*2 
print(np.shape(b1.T)[1])
print('b1:',b1)
b1_m = prol_k(b1.T,10).T # b d*1 -->> d*k
b1_m.T

2
b1: [[-1.75631058e-07]
 [ 3.46704925e-05]]


matrix([[-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05],
        [-1.75631058e-07,  3.46704925e-05]])

In [1018]:
# calculate x,ha,hs,oa,os,l,l_sum in matrix

# fprop in part3.1
'''
def fprop(x,y,w1,b1,w2,b2):

    ha = w1*x+b1
    hs = rect(ha)
    oa = w2*hs+b2
    os = softmax(oa)
    L = -np.log(os[int(y)])
    return(ha,hs,oa,os,L)
'''

def fprop_k_m(x,y,w1,b1,w2,b2,k,i): # add k and i: k is the size of minibatch, and i is the index of starting point.
    b1_m = prol_k(b1.T,k).T # change b1 from dh*1 to dh*k
    ha = w1*x[:,i:i+k]+b1 # x from a vectot of d*1 change to a matrix of d*k, so b1 must change to dh*k to match
    hs = rect(ha)
    b2 = prol_k(b2.T,k).T #change b2 from m*1 to m*k
    oa = w2*hs+b2 # hs from a vectot of dh*1 change to a matrix of dh*k, so b2 must change to m*k to match
    os = softmax(oa)
    lable = []  # lable and temp are used for choosing the value of os to calculate loss
    temp = []
    for j in range(k):
        lable.append(int(y[0,i+j]))
        temp.append(j)
    L = -np.log(os[lable,temp])
    L_sum = np.sum(L)/n

    return(ha, hs, oa, os, L, L_sum)





##### the shapes of each matrices

In [1036]:
# Here, b1 is dh * k, w1 is dh * d, b2 is m * k, w2 is m * dh, ha and hs are dh * k, oa and os are m * k

In [1063]:
ha,hs,oa,os,L,L_sum = fprop_k_m(x,y,w1,b1,w2,b2,10,1)
print('ha:',ha,'\nhs:',hs,'\noa:',oa,'\nos:',os,'\nlabel:',y[0,0],'\nL:',L,'\nL_sum:',L_sum)


ha: [[-0.16624397 -0.19824557 -0.24601582  0.16635281 -0.25053435  0.23020405
  -0.16478758 -0.25627299  0.24394943  0.08339416]
 [-0.03686463  0.26330663  0.04083425 -0.27705736  0.19761423 -0.23473652
  -0.03941345  0.08181374 -0.0343577  -0.27921174]] 
hs: [[0.         0.         0.         0.16635281 0.         0.23020405
  0.         0.         0.24394943 0.08339416]
 [0.         0.26330663 0.04083425 0.         0.19761423 0.
  0.         0.08181374 0.         0.        ]] 
oa: [[ 0.00049939 -0.00343811 -0.00011125 -0.00608446 -0.00245574 -0.00861154
   0.00049939 -0.00072406 -0.00915555 -0.00280116]
 [-0.00049939 -0.02300026 -0.00398888  0.00142324 -0.01738651  0.0021612
  -0.00049939 -0.00749078  0.00232006  0.00046444]] 
os: [[0.50024969 0.50489038 0.50096941 0.49812308 0.50373262 0.49730684
  0.50024969 0.50169167 0.49713113 0.4991836 ]
 [0.49975031 0.49510962 0.49903059 0.50187692 0.49626738 0.50269316
  0.49975031 0.49830833 0.50286887 0.5008164 ]] 
label: 1.0 
L: [[0.693646

In [1056]:
# bprop in part3.1
'''
def bprop(m,x,y,w1,b1,w2,b2):
    ha = fprop(x,y,w1,b1,w2,b2)[0]
    hs = fprop(x,y,w1,b1,w2,b2)[1]
    os = fprop(x,y,w1,b1,w2,b2)[3]
    grad_oa = os-onehot(m,y)
    grad_w2 = grad_oa*hs.T
    grad_b2 = grad_oa
    grad_hs = w2.T*grad_oa
    grad_ha=[]
    for i in range(dh):
        if (ha[i,0] <= 0) : grad_ha.append(0)
        else: grad_ha.append(grad_hs[i,0])
    grad_ha = np.matrix(grad_ha).T
    grad_w1 = grad_ha*x.T 
    grad_b1 = grad_ha
    return(grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1, grad_b1)
'''
def bprop_k_m(m,x,y,w1,b1,w2,b2,k,i,dh): # add k and i: k is the size of minibatch, and i is the index of starting point.
    ha = fprop_k_m(x,y,w1,b1,w2,b2,k,i)[0]
    hs = fprop_k_m(x,y,w1,b1,w2,b2,k,i)[1]
    os = fprop_k_m(x,y,w1,b1,w2,b2,k,i)[3] 
    grad_oa = os-onehot(m,y[0,i:i+k])
    grad_w2 = grad_oa*hs.T
    grad_b2 = grad_oa
    grad_hs = w2.T*grad_oa
    grad_ha = []
    for j in range(k):
        temp = []
        for m in range(dh):
            if (ha[m,j] <= 0) : temp.append(0)
            else: 
                temp.append(grad_hs[m,j])
        grad_ha.append(temp)
    grad_ha = np.matrix(grad_ha).T
    grad_w1 = grad_ha*x[:,i:i+k].T 
    grad_b1 = grad_ha
    return(np.sum(grad_oa,axis=1)/k, grad_w2, np.sum(grad_b2,axis=1)/k, np.sum(grad_hs,axis=1)/k, 
           np.sum(grad_ha,axis=1)/k, grad_w1, np.sum(grad_b1,axis=1)/k)



##### the shapes of each matrices

In [1041]:
# Here, grad_b1 is dh * 1, grad_w1 is dh * d, grad_b2 is m * 1, grad_w2 is m * dh, 
# grad_ha and grad_hs are dh * 1, grad_oa and grad_os are m * 1

In [1035]:
grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1, grad_b1 = bprop_k_m(m,x,y,w1,b1,w2,b2,1,10)
print('grad_oa:\n',grad_oa,'\ngrad_w2:\n',grad_w2,'\ngrad_b2:\n',grad_b2,'\ngrad_w1:\n',grad_w1,'\ngrad_b1:\n',grad_b1)

grad_oa:
 [[-0.5008164]
 [ 0.5008164]] 
grad_w2:
 [[-0.04176517 -0.        ]
 [ 0.04176517  0.        ]] 
grad_b2:
 [[-0.5008164]
 [ 0.5008164]] 
grad_w1:
 [[-0.01609751  0.0199175 ]
 [-0.          0.        ]] 
grad_b1:
 [[0.02560931]
 [0.        ]]


## 7. Compare both implementations (with a loop and with matrix calculus) to check that they both give the same values for the gradients on the parameters, first for K = 1, then for K = 10. Display the gradients for both methods.

#### K = 1

In [1060]:
# with a loop
k=np.arange(1)
grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1,grad_b1 
= bprop_k_loop(m,x,y,w1,b1,w2,b2,k,dh)
print('grad_oa:\n',grad_oa,'\ngrad_w2:\n',grad_w2,'\ngrad_b2:\n',grad_b2,
      '\ngrad_w1:\n',grad_w1,'\ngrad_b1:\n',grad_b1)

grad_oa:
 [[ 0.50191452]
 [-0.50191452]] 
grad_w2:
 [[ 0.          0.04740948]
 [ 0.         -0.04740948]] 
grad_b2:
 [[ 0.50191452]
 [-0.50191452]] 
grad_w1:
 [[0.         0.        ]
 [0.02594793 0.01131677]] 
grad_b1:
 [[0.        ]
 [0.03538547]]


In [1061]:
# with matrix calculus
grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1, grad_b1 
= bprop_k_m(m,x,y,w1,b1,w2,b2,1,0)
print('grad_oa:\n',grad_oa,'\ngrad_w2:\n',grad_w2,'\ngrad_b2:\n',grad_b2,
      '\ngrad_w1:\n',grad_w1,'\ngrad_b1:\n',grad_b1)

grad_oa:
 [[ 0.50191452]
 [-0.50191452]] 
grad_w2:
 [[ 0.          0.04740948]
 [-0.         -0.04740948]] 
grad_b2:
 [[ 0.50191452]
 [-0.50191452]] 
grad_w1:
 [[0.         0.        ]
 [0.02594793 0.01131677]] 
grad_b1:
 [[0.        ]
 [0.03538547]]


#### K = 10

In [1058]:
# with a loop
k=np.arange(10)
grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1,grad_b1 
= bprop_k_loop(m,x,y,w1,b1,w2,b2,k,dh)
print('grad_oa:\n',grad_oa,'\ngrad_w2:\n',grad_w2,'\ngrad_b2:\n',grad_b2,
      '\ngrad_w1:\n',grad_w1,'\ngrad_b1:\n',grad_b1)

grad_oa:
 [[-0.1993741]
 [ 0.1993741]] 
grad_w2:
 [[-0.03218852 -0.02421717]
 [ 0.03218852  0.02421717]] 
grad_b2:
 [[-0.1993741]
 [ 0.1993741]] 
grad_w1:
 [[-6.70019602e-03 -5.63076924e-05]
 [-9.86571300e-03 -1.98683298e-03]] 
grad_b1:
 [[ 0.00770831]
 [-0.0104821 ]]


In [1059]:
# with matrix calculus
grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1, grad_b1 
= bprop_k_m(m,x,y,w1,b1,w2,b2,10,0)
print('grad_oa:\n',grad_oa,'\ngrad_w2:\n',grad_w2,'\ngrad_b2:\n',grad_b2,
      '\ngrad_w1:\n',grad_w1,'\ngrad_b1:\n',grad_b1)

grad_oa:
 [[-0.1993741]
 [ 0.1993741]] 
grad_w2:
 [[-0.32188522 -0.24217167]
 [ 0.32188522  0.24217167]] 
grad_b2:
 [[-0.1993741]
 [ 0.1993741]] 
grad_w1:
 [[-0.06700196 -0.00056308]
 [-0.09865713 -0.01986833]] 
grad_b1:
 [[ 0.00770831]
 [-0.0104821 ]]


## 8. Time how long takes an epoch on fashion MNIST (1 epoch = 1 full traversal through the whole training set) for K = 100 for both versions (loop over a minibatch and matrix calculus).

In [1132]:
import utils.mnist_reader as mnist_reader
from utils import mnist_reader
XM_train, YM_train = mnist_reader.load_mnist('data/fashion', kind='train')
XM_test, YM_test = mnist_reader.load_mnist('data/fashion', kind='t10k')

In [1133]:
m = 10
dh = 50
train_data = np.matrix(XM_train).T
label = np.matrix(YM_train)
d = np.shape(train_data)[0]
w1 = np.matrix((np.random.uniform(-1/d**0.5,1/d**0.5,d*dh)).reshape(dh,d))
print('w1:\n',w1)
b1 = np.matrix([0]*dh).T
w2 = np.matrix((np.random.uniform(-1/dh*0.5,1/dh*0.5,dh*m)).reshape(m,dh))
print('w2:\n',w2)
b2 = np.matrix([0]*m).T

w1:
 [[ 0.02645721 -0.03125051  0.01103898 ... -0.02781237  0.00446634
   0.01281825]
 [-0.01978466  0.00860278  0.01146674 ...  0.02790571  0.014303
   0.00416316]
 [ 0.02661945 -0.02637931 -0.01687022 ... -0.00319377  0.03562851
   0.01669136]
 ...
 [ 0.01851909  0.02458761  0.01521257 ...  0.0355784   0.01752228
   0.03024079]
 [ 0.02550762 -0.0141292   0.02581921 ... -0.01821978  0.0072019
   0.01949766]
 [ 0.01845047 -0.00039195 -0.01958084 ...  0.03209723 -0.01975679
   0.01429275]]
w2:
 [[ 3.68380004e-03 -4.23990514e-03 -1.33587984e-03  1.46790578e-03
   4.24069070e-03 -4.54284809e-03  3.20926730e-03  1.31413371e-03
   5.41085061e-03 -2.96235425e-03 -4.50725994e-03 -4.59360179e-03
   5.85360522e-03  9.71599497e-03 -9.28450804e-03 -7.85119126e-03
   2.68005146e-03  9.80412704e-04 -9.05699188e-03 -2.24669076e-04
  -8.71509463e-03 -7.25935391e-03  8.04167992e-03 -1.51156573e-03
   2.09905922e-03 -1.94510304e-03  3.02757618e-03 -6.20199877e-04
   5.41397954e-03  1.79552196e-03  5.68

In [1139]:
import time
def mnist_loop_all(x, y, w1, b1, w2, b2, dh, m):
    start = time.time()
    num = int(np.shape(XM_train)[1]/100)
    for i in range(num):
        k = np.arange(i*100, (i+1)*100)
        fprop_k_loop(x,y,w1,b1,w2,b2,k)
        goa_m, gw2_m, gb2_m, ghs_m, gha_m, gw1_m,gb1_m = bprop_k_loop(m,x,y,w1,b1,w2,b2,k,dh)
    end = time.time()
    running_time = end-start
    return running_time


In [1141]:
print('The time for looping all the data by minibatch:',
      mnist_loop_all(train_data, label, w1, b1, w2, b2, dh, m))

The time for loopint all the data by minibatch: 0.6707990169525146


In [1142]:
def mnist_matrix_all(x, y, w1, b1, w2, b2, dh, m):
    start = time.time()
    num = int(np.shape(XM_train)[1]/100)
    for i in range(num):
        fprop_k_m(x,y,w1,b1,w2,b2,100,i*100)
        bprop_k_m(m,x,y,w1,b1,w2,b2,100,i*100)
    end = time.time()
    running_time = end-start
    return running_time    
    
    

In [1143]:
print('The time for using matrix to calculate all the data by minibatch:',
      mnist_loop_all(train_data, label, w1, b1, w2, b2, dh, m))

The time for using matrix to calculate all the data by minibatch: 0.6026718616485596


## 9. Adapt your code to compute the error (proportion of misclassified examples) on the training set as well as the total loss on the training set during each epoch of the training procedure, and at the end of each epoch, it computes the error and average loss on the validation set and the test set. Display the 6 corresponding figures (error and average loss on train/valid/test), and write them in a log file.

In [1147]:
data = np.loadtxt(open('cercles.txt','r'))
n = np.shape(data)[0]
inds = np.arange(n)
np.random.shuffle(inds)
train_inds = inds[:700]
valid_inds = inds[700:900]
test_inds = inds[900:1100]
x_train = np.matrix(data[train_inds,:-1]).T
y_train = np.matrix(data[train_inds,-1])
x_valid = np.matrix(data[valid_inds,:-1]).T
y_valid = np.matrix(data[valid_inds,-1])
x_test = np.matrix(data[test_inds,:-1]).T
y_test = np.matrix(data[test_inds,-1])

In [1202]:
dh = 2
d = 2
m = 2
np.random.seed(1)
w1 = np.matrix((np.random.uniform(-1/d**0.5,1/d**0.5,d*dh)).reshape(dh,d))
b1 = np.matrix([0.0]*dh).T
w2 = np.matrix((np.random.uniform(-1/dh*0.5,1/dh*0.5,dh*m)).reshape(m,dh))
b2 = np.matrix([0.0]*m).T

In [1184]:
# here we choose matrix calculate, because it's faster than looping
# modifier fprop to add error
def fprop_error(x,y,w1,b1,w2,b2,k,i): # add k and i: k is the size of minibatch, and i is the index of starting point.
    b1_m = prol_k(b1.T,k).T # change b1 from dh*1 to dh*k
    ha = w1*x[:,i:i+k]+b1 # x from a vectot of d*1 change to a matrix of d*k, so b1 must change to dh*k to match
    hs = rect(ha)
    b2 = prol_k(b2.T,k).T #change b2 from m*1 to m*k
    oa = w2*hs+b2 # hs from a vectot of dh*1 change to a matrix of dh*k, so b2 must change to m*k to match
    os = softmax(oa)
    err = 0
    for j in range(k):
        if (os[0,j] >= os[1,j]):
            pre_y = 0
        else:
            pre_y = 1
        if pre_y != y[0,i+j]:
            err += 1
    lable = []  # lable and temp are used for choosing the value of os to calculate loss
    temp = []
    for j in range(k):
        lable.append(int(y[0,i+j]))
        temp.append(j)
    L = -np.log(os[lable,temp])
    L_sum = np.sum(L)/k
    return(ha, hs, oa, os, L, L_sum,err)

In [1208]:
def L_theta(lamda,w1,w2,dh,m,d):
    if len(lamda) != 4 : return "Lamda is error!"
    else:
        w1=w1.reshape(1,dh*d)
        w11 = np.sum(abs(w1))
        w12 = np.linalg.det(w1*w1.T)
        w2=w2.reshape(1,dh*m)
        w21 = np.sum(abs(w2))
        w22 = np.linalg.det(w2*w2.T)
    W = [w11,w12,w21,w22]
    return np.linalg.det(lamda*np.matrix(W).T)

def train(x,y,w1,b1,w2,b2,k,lamda,dh,d,m,eta,epsilon): # k is size of minibatch, eta is hyperparameter
    m, n = np.shape(x)
    sum_loss = 0
    err_sum = 0
    Rmin = 10000
    R = 0
    itera = 0
    err_prop = 0
    grad_b2 = 1
    num = int(n/k)
    while((itera < 1000 )& (np.sum(grad_b2) > epsilon)):
        for i in range(num):
            ha, hs, oa, os, L, loss, err = fprop_error(x,y,w1,b1,w2,b2,k,i*k)
            sum_loss += loss*k
            err_sum += err   
        R = (1/n)*sum_loss+L_theta(lamda,w1,w2,dh,m,d)
        if R < Rmin:
            Rmin = R
            w1min = w1
            w2min = w2
            b1min = b1
            b2min = b2
            sum_loss_result = sum_loss
            err_result = err_sum
            for i in range(num):
                grad_oa, grad_w2, grad_b2, grad_hs, grad_ha, grad_w1, grad_b1 = bprop_k_m(m,x,y,w1,b1,w2,b2,k,i*k)
                w1 -= eta*grad_w1
                b1 -= eta*grad_b1
                w2 -= eta*grad_w2
                b2 -= eta*grad_b2
            itera += 1
        else: 
            break
    if itera == 1000 :
        print(' not finish yet...')
    return (Rmin, w1min, w2min, b1min, b2min, sum_loss_result/n, err_result/n)

In [1209]:
Rmin, w1min, w2min, b1min, b2min, loss_avg, err_prop = train(x_train,y_train,w1,b1,w2,b2,100,[0.01,0.01,0.01,0.01],dh,d,m,0.001,0.001)

In [1210]:
print(loss_avg)
print(err_prop)

0.694757623537982
0.5042857142857143


In [1211]:
Rmin, w1min, w2min, b1min, b2min, loss_avg, err_prop = train(x_valid,y_valid,w1,b1,w2,b2,100,[0.01,0.01,0.01,0.01],dh,d,m,0.001,0.001)
print(loss_avg)
print(err_prop)

0.6945979647817484
0.53


In [1212]:
Rmin, w1min, w2min, b1min, b2min, loss_avg, err_prop = train(x_test,y_test,w1,b1,w2,b2,100,[0.01,0.01,0.01,0.01],dh,d,m,0.001,0.001)
print(loss_avg)
print(err_prop)

0.6934102248402416
0.48
