In [1]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
def sigmoid(Z):
    """Apply the logistic sigmoid ``1 / (1 + exp(-Z))`` element-wise.

    Args:
        Z: pre-activation values (a numeric scalar or NumPy array).

    Returns:
        A tuple ``(A, cache)``. ``A`` is the activation; ``cache`` is ``Z``
        itself (the same object), kept for use in the backward pass.
    """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator, Z
def sigmoid_backward(dA, cache):
    """Backward pass through a sigmoid activation.

    Args:
        dA: gradient with respect to the sigmoid's output.
        cache: the pre-activation ``Z`` stored by ``sigmoid``.

    Returns:
        ``dZ``, the gradient with respect to ``Z``.

    Raises:
        AssertionError: if ``dZ`` does not have the same shape as ``Z``.
    """
    pre_activation = cache
    sig = 1 / (1 + np.exp(-pre_activation))
    # Chain rule: the sigmoid's derivative is sig * (1 - sig).
    grad = dA * sig * (1 - sig)
    assert grad.shape == pre_activation.shape
    return grad
def relu(Z):
    """Apply the rectified linear unit ``max(0, Z)`` element-wise.

    Args:
        Z: pre-activation array (must expose ``.shape``).

    Returns:
        A tuple ``(A, cache)``. ``A`` is the activation; ``cache`` is ``Z``
        itself (the same object), kept for use in the backward pass.

    Raises:
        AssertionError: if the result's shape differs from ``Z``'s.
    """
    rectified = np.maximum(0, Z)
    assert rectified.shape == Z.shape
    return rectified, Z
def relu_backward(dA, cache):
    """Backward pass through a ReLU activation.

    Args:
        dA: gradient with respect to the ReLU's output.
        cache: the pre-activation ``Z`` stored by ``relu``.

    Returns:
        ``dZ``: a copy of ``dA`` with entries zeroed wherever ``Z <= 0``.

    Raises:
        AssertionError: if ``dZ`` does not have the same shape as ``Z``.
    """
    pre_activation = cache
    # Work on a copy so the caller's dA is not modified by the masking below.
    grad = np.array(dA, copy=True)
    # The gradient does not flow through units whose input was <= 0.
    inactive = pre_activation <= 0
    grad[inactive] = 0
    assert grad.shape == pre_activation.shape
    return grad
def load_dataset():
    """Load the train and test sets from the ``catvnoncat`` HDF5 files.

    Reads ``datasets/train_catvnoncat.h5`` and ``datasets/test_catvnoncat.h5``
    (paths are relative to the current working directory).

    Returns:
        A tuple ``(train_set_x_orig, train_set_y_orig, test_set_x_orig,
        test_set_y_orig, classes)``. The two label arrays are reshaped to a
        single row of shape ``(1, n)``; ``classes`` is read from the test
        file's ``list_classes`` dataset.
    """
    # Each file is closed as soon as its datasets have been copied into
    # in-memory arrays, instead of leaving the handles open.
    with h5py.File('datasets/train_catvnoncat.h5', "r") as train_dataset:
        train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # train set features
        train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # train set labels

    with h5py.File('datasets/test_catvnoncat.h5', "r") as test_dataset:
        test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # test set features
        test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # test set labels
        classes = np.array(test_dataset["list_classes"][:])  # the list of classes

    # Turn the flat label vectors into row vectors of shape (1, n).
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))

    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes
    

In [53]:
# Parameter initialisation (two-layer network)
def initialize_parameters(n_x, n_h, n_y):
    """Create the weights and biases of a two-layer network.

    Weights are drawn from ``np.random.randn`` and scaled by 0.01; biases
    start at zero.

    Args:
        n_x: size of the input layer.
        n_h: size of the hidden layer.
        n_y: size of the output layer.

    Returns:
        dict with keys ``"w1"`` (n_h, n_x), ``"b1"`` (n_h, 1),
        ``"w2"`` (n_y, n_h) and ``"b2"`` (n_y, 1).
    """
    # w1 is drawn before w2 so results under a fixed seed stay the same.
    hidden_weights = np.random.randn(n_h, n_x) * 0.01
    output_weights = np.random.randn(n_y, n_h) * 0.01
    return {
        "w1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "w2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }
# Parameter initialisation for a deep network
def initialize_parameters_deep(layers_dims):
    """Create weights and biases for every layer of a deep network.

    The global NumPy RNG is reseeded with 3 on every call. Each weight
    matrix is drawn from ``np.random.randn`` and divided by the square root
    of the previous layer's size; biases start at zero.

    Args:
        layers_dims: sequence of layer sizes, input layer first.

    Returns:
        dict with keys ``"w1"``, ``"b1"``, ..., one pair per layer after the
        input layer; ``w<i>`` has shape ``(layers_dims[i], layers_dims[i-1])``
        and ``b<i>`` has shape ``(layers_dims[i], 1)``.
    """
    np.random.seed(3)
    parameters = {}
    # Pair each layer size with the size of the layer that feeds it.
    size_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_in, n_out) in enumerate(size_pairs, start=1):
        parameters['w' + str(layer)] = np.random.randn(n_out, n_in) / np.sqrt(n_in)
        parameters['b' + str(layer)] = np.zeros(shape=(n_out, 1))
    return parameters

In [54]:
# Forward propagation
# Linear step: compute z
def linear_forward(A, w, b):
    """Compute the linear part of a layer, ``z = w . A + b``.

    Args:
        A: activations fed into the layer.
        w: the layer's weight matrix.
        b: the layer's bias.

    Returns:
        A tuple ``(z, cache)`` where ``cache`` is ``(A, w, b)``, kept for the
        backward pass.
    """
    pre_activation = np.dot(w, A) + b
    return pre_activation, (A, w, b)
# Compute A: linear step followed by the activation
def linear_activation_forward(A_pre, w, b, activation):
    """Run one layer forward: linear step, then the chosen activation.

    Args:
        A_pre: activations from the previous layer (or the input data).
        w: the layer's weight matrix.
        b: the layer's bias.
        activation: ``'relu'`` or ``'sigmoid'``.

    Returns:
        A tuple ``(A, cache)`` where ``cache`` is
        ``(linear_cache, activation_cache)``, kept for the backward pass.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    # Validate the activation first, so an unsupported name fails with a
    # clear message instead of an UnboundLocalError further down.
    if activation == 'relu':
        activation_fn = relu
    elif activation == 'sigmoid':
        activation_fn = sigmoid
    else:
        raise ValueError(
            "activation must be 'relu' or 'sigmoid', got %r" % (activation,)
        )

    z, linear_cache = linear_forward(A_pre, w, b)
    A, activation_cache = activation_fn(z)
    cache = (linear_cache, activation_cache)
    return A, cache
# Forward pass of the deep model
def L_model_forward(X, parameters):
    """Run the full forward pass: (L-1) ReLU layers, then one sigmoid layer.

    Args:
        X: input data fed to the first layer.
        parameters: dict holding ``'w1'``, ``'b1'``, ..., ``'wL'``, ``'bL'``;
            the number of layers L is taken as ``len(parameters) // 2``.

    Returns:
        A tuple ``(A_L, caches)``: the output-layer activation and the list
        of per-layer caches in layer order (index 0 is layer 1).
    """
    L = len(parameters) // 2
    A = X
    caches = []
    # Hidden layers 1 .. L-1 use ReLU; each layer consumes the previous output.
    for l in range(1, L):
        A, cache = linear_activation_forward(A, parameters['w'+str(l)], parameters['b'+str(l)], 'relu')
        caches.append(cache)
    # Output layer: feed the last hidden activation through layer L's own
    # weights (not the previous layer's input and weights).
    A_L, cache = linear_activation_forward(A, parameters['w'+str(L)], parameters['b'+str(L)], 'sigmoid')
    caches.append(cache)
    return A_L, caches
# Cost computation
def compute_cost(A_L, y):
    """Compute the cross-entropy cost averaged over ``y.shape[1]`` examples.

    Args:
        A_L: predicted probabilities from the output layer.
        y: true labels; the number of examples is read from ``y.shape[1]``.

    Returns:
        The cost ``-sum(y*log(A_L) + (1-y)*log(1-A_L)) / m``, passed through
        ``np.squeeze``.
    """
    m = y.shape[1]
    positive_term = np.multiply(np.log(A_L), y)
    negative_term = np.multiply(np.log(1 - A_L), 1 - y)
    total = np.sum(positive_term + negative_term)
    return np.squeeze(-total / m)

In [55]:
# Backward pass through the linear part
def linear_backward(dz, cache):
    """Backpropagate through the linear step of one layer.

    Args:
        dz: gradient of the cost with respect to this layer's linear output.
        cache: the ``(A_pre, w, b)`` tuple stored by ``linear_forward``.

    Returns:
        A tuple ``(dA_pre, dw, db)``: gradients with respect to the layer's
        input activations, its weights and its bias. ``dw`` and ``db`` are
        scaled by ``1 / m`` with ``m = A_pre.shape[1]``.
    """
    A_pre, w, _ = cache
    scale = 1 / A_pre.shape[1]
    dw = scale * np.dot(dz, A_pre.T)
    db = scale * np.sum(dz, axis=1, keepdims=True)
    dA_pre = np.dot(w.T, dz)
    return dA_pre, dw, db
# Backward pass through the activation, then the linear part
def linear_activation_backward(dA, cache, activation = 'relu'):
    """Backpropagate through one layer (activation, then linear step).

    Args:
        dA: gradient with respect to this layer's activation output.
        cache: the ``(linear_cache, activation_cache)`` tuple recorded for
            this layer during the forward pass.
        activation: ``'relu'`` (default) or ``'sigmoid'``.

    Returns:
        A tuple ``(dA_pre, dw, db)`` as produced by ``linear_backward``.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    # Validate the activation first, so an unsupported name fails with a
    # clear message instead of an UnboundLocalError further down.
    if activation == 'relu':
        activation_backward = relu_backward
    elif activation == 'sigmoid':
        activation_backward = sigmoid_backward
    else:
        raise ValueError(
            "activation must be 'relu' or 'sigmoid', got %r" % (activation,)
        )

    linear_cache, activation_cache = cache
    dz = activation_backward(dA, activation_cache)
    dA_pre, dw, db = linear_backward(dz, linear_cache)
    return dA_pre, dw, db

def L_model_backward(A_L,y,caches):
    """Run the full backward pass: one sigmoid layer, then (L-1) ReLU layers.

    Args:
        A_L: output-layer activation from the forward pass.
        y: true labels, broadcast-compatible with ``A_L``.
        caches: list of per-layer caches from the forward pass, in layer
            order (index 0 is layer 1).

    Returns:
        dict of gradients with keys ``'dw<k>'``, ``'db<k>'`` and ``'dA<k>'``
        for k = 1..L. ``'dA<k>'`` holds the ``dA_pre`` returned by layer k's
        backward step, i.e. the gradient with respect to layer k's input.
    """
    grads = {}
    L = len(caches)
    # Gradient of the per-example cross-entropy loss with respect to A_L.
    dAL = -np.divide(y,A_L) + np.divide(1-y,1-A_L)

    # Output layer (sigmoid).
    current_cache = caches[L-1]
    grads['dA'+str(L)],grads['dw' + str(L)],grads['db'+str(L)] = linear_activation_backward(dAL,current_cache,'sigmoid')
    # Hidden layers (ReLU), from layer L-1 down to layer 1. Layer l+1 takes
    # the gradient stored under 'dA<l+2>' by the layer above it.
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_temp,dw_temp,db_temp = linear_activation_backward(grads['dA'+ str(l+2)],current_cache,'relu')
        grads['dA'+str(l+1)] = dA_temp
        grads['dw'+str(l+1)] = dw_temp
        grads['db'+str(l+1)] = db_temp
    return grads
# Parameter update
def update_parameters(parameters,grads,learning_rate = 0.1):
    """Apply one gradient-descent step to every layer's weights and biases.

    The arrays stored in ``parameters`` are updated in place with ``-=``.

    Args:
        parameters: dict holding ``'w<k>'`` / ``'b<k>'`` for each layer; the
            layer count is taken as ``len(parameters) // 2``.
        grads: dict holding the matching ``'dw<k>'`` / ``'db<k>'`` gradients.
        learning_rate: step size (default 0.1).

    Returns:
        The same ``parameters`` dict that was passed in.
    """
    layer_count = len(parameters) // 2
    for layer in range(1, layer_count + 1):
        suffix = str(layer)
        for name in ('w', 'b'):
            parameters[name + suffix] -= learning_rate * grads['d' + name + suffix]
    return parameters

In [56]:
# Build and train the two-layer model
def two_layer_model(X,y,layers_dims,learning_rate = 0.0075,num_iterations = 1000,print_cost = False):
    """Train a two-layer network (ReLU hidden layer, sigmoid output).

    The global NumPy RNG is reseeded with 1 on every call. The cost is
    printed every 100 iterations.

    Args:
        X: input data fed to the first layer.
        y: true labels.
        layers_dims: three sizes ``(input, hidden, output)``.
        learning_rate: gradient-descent step size.
        num_iterations: number of training iterations.
        print_cost: accepted for interface compatibility.
            NOTE(review): this flag is not consulted; the cost is printed
            regardless of its value, which is the behaviour existing calls
            rely on. Confirm before gating the print on it.

    Returns:
        The trained ``parameters`` dict with keys ``'w1'``, ``'b1'``,
        ``'w2'`` and ``'b2'``.
    """
    np.random.seed(1)
    grads = {}
    input_layers,hidden_layers,output_layers = layers_dims
    # Initialisation
    parameters = initialize_parameters(input_layers,hidden_layers,output_layers)

    for step in range(num_iterations):
        w1,b1 = parameters['w1'],parameters['b1']
        w2,b2 = parameters['w2'],parameters['b2']

        # Forward pass
        A1,cache1 = linear_activation_forward(X,w1,b1,'relu')
        A2,cache2 = linear_activation_forward(A1,w2,b2,'sigmoid')

        # Cost (measured before this iteration's update)
        cost = compute_cost(A2,y)

        # Backward pass, starting from the loss gradient with respect to A2
        dA2 = -np.divide(y,A2)+np.divide(1-y,1-A2)

        dA1,dw2,db2 = linear_activation_backward(dA2,cache2,'sigmoid')
        dA0,dw1,db1 = linear_activation_backward(dA1,cache1,'relu')
        grads['dw1'] = dw1
        grads['dw2'] = dw2
        grads['db1'] = db1
        grads['db2'] = db2

        parameters = update_parameters(parameters,grads,learning_rate = learning_rate)

        if step % 100 == 0:
            print(cost)

    # Hand the trained weights back so the caller can use the model.
    return parameters

In [57]:
# Load the raw image arrays and label rows, and report their shapes.
train_x, train_y, test_x, test_y, classes = load_dataset()
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

# Flatten each example into one column and divide by 255
# (presumably 8-bit pixel values, giving a [0, 1] range - confirm).
train_x = train_x.reshape(len(train_x), -1).T / 255
test_x = test_x.reshape(len(test_x), -1).T / 255
print(train_x.shape, test_x.shape)

(209, 64, 64, 3) (50, 64, 64, 3) (1, 209) (1, 50)
(12288, 209) (12288, 50)


In [58]:
# Layer sizes: one input unit per row of train_x, 7 hidden units,
# and one output unit per row of train_y.
input_layers, hidden_layers, output_layers = train_x.shape[0], 7, train_y.shape[0]
two_layer_model(X=train_x, y=train_y, layers_dims=[input_layers, hidden_layers, output_layers])

0.6930497356599891
0.6464320953428849
0.6325140647912677
0.6015024920354665
0.5601966311605748
0.515830477276473
0.47549013139433266
0.4339163151225749
0.400797753620389
0.35807050113237987
