In [1]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from tools import *
from PIL import Image
from scipy import ndimage
from lr_utils import load_dataset
from testCases_v3 import *

plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'


np.random.seed(1)
"""
nx
sizes_of_layers = [nx, ...., 1]
"""




'\nnx\nsizes_of_layers = [nx, ...., 1]\n'

In [2]:
def relu(Z):
    
    A = np.maximum(0, Z)
    assert(A.shape == Z.shape)

    cache = Z
    return A

def sigmoid(Z):
    
    A = 1/(1 + np.exp(-Z))
    assert(A.shape == Z.shape)

    cache = Z
    return A

#* dZ = dA* g'(Z)
def back_relu(dA, Z):
    
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0 ] = 0

    return dZ

def back_sigmoid(dA, Z):

    s = 1/(1 + np.exp(- Z))
    dZ = dA * s * (1 - s)

    return dZ

## The cache of forward prop and backward prop:

Forward prop: cache for lth layer = $[( \text{A}^{[l]}, \text{W}^{[l]}, \text{b}^{[l]}, \text{Z}^{[l]}), \dots, ]$

Backward prop: grads for lth layer = $\{ \text{dA}^{[l - 1]}, \text{dW}^{[l]}, \text{db}^{[l]} \}$



In [3]:

#* initialize the parameters: W and b

def initialize_parameter(sizes_of_layers):
    np.random.seed(1)
    parameters = {}
    #* only concern about the layer 2 to layer L, layer 1 is the input layer
    for i in range(1, len(sizes_of_layers)):
        parameters["W" + str(i)] = np.random.rand(sizes_of_layers[i], sizes_of_layers[i - 1]) / np.sqrt(sizes_of_layers[i - 1])
        parameters["b" + str(i)] = np.zeros((sizes_of_layers[i], 1))

    return parameters

**Notice:**

For the function which returns several variables, if you assign the function to single variable, then this variable will be Tulpe, namely `(variable1 , variable2 , ...)`

**The structure of caches:**
```python
    caches[l] = (Al, Wl, bl, Zl, A_prev)
```

Which means it contians the input layer.

In [4]:

#* forward
def forward_porpogate(X, parameters, activation):

    caches = []

    activation_dict = {"relu": relu,
                       "sigmoid": sigmoid}
    
    activation_fun = activation_dict[activation]

    L = len(parameters) // 2 # the number of layers, input layer doesn't contain

    A = X
    for i in range(1, L):
        A_prev = A
        W = parameters["W" + str(i)]
        b = parameters["b" + str(i)]
        Z = np.dot(W, A_prev) + b
        A = activation_fun(Z)
        caches.append((A, W, b, Z, A_prev))
        
        '''
        print ("A_prev" + str(A_prev.shape))
        print ("W" + str(W.shape))
        print ("b" + str(b.shape))
        print ("Z" + str(Z.shape))
        print ("A" + str(A.shape))
        '''


    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    ZL = np.dot(WL, A) + bL
    AL = sigmoid(ZL)
    
    '''
    print ("A" + str(A.shape))
    print ("WL" + str(WL.shape))
    print ("bL" + str(bL.shape))
    print ("ZL" + str(ZL.shape))
    print ("AL" + str(AL.shape))
    '''

    caches.append((AL, WL, bL, ZL, A))

    return caches


In [5]:
X, parameters = L_model_forward_test_case_2hidden()
caches = forward_porpogate(X, parameters, "relu")
AL, WL, bL, ZL, A = caches[len(caches) - 1]
print("AL = " + str(AL))
print("Length of caches list = " + str(len(caches)))

AL = [[0.03921668 0.70498921 0.19734387 0.04728177]]
Length of caches list = 3


In [6]:

#* cost function
def cost_fun(A, Y):
    m = Y.shape[1]

    cost = -(1./m) * ( np.dot(Y, np.log(A.T)) + np.dot( 1 - Y , np.log( 1 - A ).T ) )
    cost = np.squeeze(cost)
    return cost

#*backward
def back_propogate(Y, caches, activation):

    activation_dict={"relu": back_relu,
                     "sigmoid": back_sigmoid}
    
    activation_fun = activation_dict[activation]


    #* caches - AL WL bL ZL A_prev
    grads = {}

    m = Y.shape[1]

    L = len(caches) # the number of layers, input layer doesn't contain
    
    AL, WL, bL, ZL, AL_prev = caches[L - 1]

    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    dZL = back_sigmoid(dAL, ZL)
    dWL = (1/m) * np.dot(dZL, AL_prev.T)
    dbL = (1/m) * np.sum(dZL, axis = 1, keepdims = True)
    dAL_next = np.dot(WL.T, dZL)

    grads["dW" + str(L)] = dWL
    grads["dA" + str(L)] = dAL_next #! it denots dA^l-1 !!!
    grads["db" + str(L)] = dbL

    for i in reversed(range(L - 1 )):
        A, W, b, Z, A_prev = caches[i]

        dA_current = grads["dA" + str(i + 2)]
        dZ = activation_fun(dA_current, Z)
        dW = (1/m) * np.dot(dZ, A_prev.T)
        db = (1/m) * np.sum(dZ, axis = 1, keepdims = True)
        dA_next = np.dot(W.T, dZ)

        grads["dW" + str(i + 1)] = dW
        grads["dA" + str(i + 1)] = dA_next #! it denots dA^l-1 !!!
        grads["db" + str(i + 1)] = db

    return grads

In [7]:
Y, caches = L_model_backward_test_case()
grads = back_propogate(Y, caches, "relu")
print_grads(grads)

dW1 = [[0.41010002 0.07807203 0.13798444 0.10502167]
 [0.         0.         0.         0.        ]
 [0.05283652 0.01005865 0.01777766 0.0135308 ]]
db1 = [[-0.22007063]
 [ 0.        ]
 [-0.02835349]]
dA1 = [[ 0.12913162 -0.44014127]
 [-0.14175655  0.48317296]
 [ 0.01663708 -0.05670698]]


In [8]:
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2

    for i in range(1 , L + 1):
        parameters["W" + str(i)] = parameters["W" + str(i)] - learning_rate * grads["dW" + str(i)]
        parameters["b" + str(i)] = parameters["b" + str(i)] - learning_rate * grads["db" + str(i)]

    return parameters

In [9]:
parameters, grads = update_parameters_test_case()
parameters = update_parameters(parameters, grads, 0.1)

print ("W1 = "+ str(parameters["W1"]))
print ("b1 = "+ str(parameters["b1"]))
print ("W2 = "+ str(parameters["W2"]))
print ("b2 = "+ str(parameters["b2"]))

W1 = [[-0.59562069 -0.09991781 -2.14584584  1.82662008]
 [-1.76569676 -0.80627147  0.51115557 -1.18258802]
 [-1.0535704  -0.86128581  0.68284052  2.20374577]]
b1 = [[-0.04659241]
 [-1.28888275]
 [ 0.53405496]]
W2 = [[-0.55569196  0.0354055   1.32964895]]
b2 = [[-0.84610769]]


## The Data Normalization:

Nomalizse the data, for RGB matrix, just divide it by 255.

In [10]:
train_set_x_orig, train_set_y, test_set_x_orig, test_set_y, classes = load_dataset()

x_set_train_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
x_set_test_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

#* normoliaze
train_x =x_set_train_flatten/255.
test_x = x_set_test_flatten/255.

print ("sb")

sb


In [11]:
def L_layer_model_training(X, Y, sizes_of_layers, activation, learning_rate, iteration_num, print_cost = True):

    #* check whether the size of the first and last layer are set correctly.
    assert sizes_of_layers[0] == X.shape[0], "the size of the first layer does not match the input layer."
    assert sizes_of_layers[-1] == 1, "the output is a value not array!"

    parameters = initialize_parameter(sizes_of_layers)

    cost_list = []

    for i in range(0, iteration_num ):
        caches = forward_porpogate(X, parameters, activation)
        grads = back_propogate(Y, caches, activation)
        parameters = update_parameters(parameters, grads, learning_rate)

        AL, WL, bL, ZL, A = caches[-1]
        cost = cost_fun(AL, Y)

        if print_cost and i % 100 == 0:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
            print (grads)
        if print_cost and i % 100 == 0:
            cost_list.append(cost)

    return parameters


In [12]:
activation = "relu"
learning_rate = 0.0075
sizes_of_layers = [12288, 20, 7, 5, 1]

parameters = initialize_parameter(sizes_of_layers)
print(parameters)
caches = forward_porpogate(train_x, parameters, activation)
grads = back_propogate(train_set_y, caches, activation)
parameters = update_parameters(parameters, grads, learning_rate)

AL, WL, bL, ZL, A = caches[-1]
cost = cost_fun(AL, train_set_y)

#print(AL)
#print(cost)
#print(grads)

{'W1': array([[3.76199635e-03, 6.49811782e-03, 1.03178643e-06, ...,
        3.04518816e-03, 1.01299851e-03, 4.84475558e-03],
       [6.38634454e-03, 3.04665939e-03, 6.38652384e-03, ...,
        3.80727150e-03, 7.02376566e-03, 3.15617564e-03],
       [8.13106245e-03, 7.17297128e-05, 9.36182785e-04, ...,
        2.51329999e-03, 4.56298881e-04, 7.45890390e-03],
       ...,
       [3.35517384e-03, 1.33177746e-03, 6.85927115e-03, ...,
        5.37530908e-03, 1.06717863e-03, 5.89110510e-03],
       [6.56081722e-03, 6.35846413e-03, 2.18256253e-03, ...,
        5.03250890e-04, 8.44477676e-03, 7.23351825e-03],
       [8.30425485e-03, 3.05953392e-03, 7.87889731e-03, ...,
        6.20811200e-03, 8.15059858e-03, 4.41814872e-04]]), 'b1': array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]]), 'W2

  dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
  dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
  dZ = dA * s * (1 - s)
  cost = -(1./m) * ( np.dot(Y, np.log(A.T)) + np.dot( 1 - Y , np.log( 1 - A ).T ) )


In [13]:
'''sizes_of_layers = [12288, 20, 7, 5, 1]
iteration_num = 3000
learning_rate = 0.0075
activation = "relu"

print(train_x.shape[1])

L_layer_model_training(train_x, train_set_y, sizes_of_layers, activation, learning_rate, iteration_num, print_cost = True)'''

'sizes_of_layers = [12288, 20, 7, 5, 1]\niteration_num = 3000\nlearning_rate = 0.0075\nactivation = "relu"\n\nprint(train_x.shape[1])\n\nL_layer_model_training(train_x, train_set_y, sizes_of_layers, activation, learning_rate, iteration_num, print_cost = True)'