In [1]:
def normalize_photo(X):
    """
    each pixel has range [0, 255]. Normalize it to [-1, 1]
    
    X:   Input photo
    """
    return (X - 128.0 / 128.0)

In [2]:
def initilize_parameters(layer_dims, activations):
    """
    layer_dims: [n_x, n_1, n_2, ..., n_y]
    activations: [not_used, relu, relu, ..., sigmoid]
    """
    L = len(layer_dims) - 1
    
    W = [l for l in range(L + 1)] # W[0] - W[L], W[0] is not used
    b = [l for l in range(L + 1)] # b[0] - b[L], b[0] is not used
    
    # Initialize parameters
    for l in range(1, L + 1): # 1 - L
        if activations[l] == "relu":        
            norm = np.sqrt(2.0 / layer_dims[l-1]) # He Initialization, He et al., 2015
        elif activations[l] == "tanh":
            norm = 0.01
        elif activations[l] == "sigmoid":
            norm = 0.01
        else:
            norm = 1
        if 'debug' in globals() and debug:            
            print("layer", l, "[", layer_dims[l], layer_dims[l-1], "]", activations[l], norm)
            
        W[l] = np.random.randn(layer_dims[l], layer_dims[l-1]) * norm # (n[l], n[l-1])
        b[l] = np.zeros((layer_dims[l], 1))
        
    return W, b

In [3]:
import numpy as np 
np.random.seed(1)
debug=True
W, b = initilize_parameters([4, 2, 3], ["placeholder", "relu", "sigmoid"])
W

layer 1 [ 2 4 ] relu 0.707106781187
layer 2 [ 3 2 ] sigmoid 0.01


[0, array([[ 1.14858562, -0.43257711, -0.37347383, -0.75870339],
        [ 0.6119356 , -1.62743362,  1.23376823, -0.53825456]]), array([[ 0.00319039, -0.0024937 ],
        [ 0.01462108, -0.02060141],
        [-0.00322417, -0.00384054]])]

In [None]:
b

[0, array([[ 0.],
        [ 0.]]), array([[ 0.],
        [ 0.],
        [ 0.]])]

In [5]:
def forward_propagation(X, W, b, activations, iter_i=-1):
    """
    X: Input
    W: [not_used, W1, W2, ..., WL]
    b: [not_used, b1, b2, ..., bL]
    activations: [not_used, relu, relu, ..., sigmoid]    
    """
    L = len(W) - 1
    
    Z  = [l for l in range(L+1)]
    A  = [l for l in range(L+1)]
    
    A[0] = X
    
    for l in range(1, L+1):
        Z[l] = np.dot(W[l], A[l - 1]) + b[l] # (n[l], m) <= (n[l], n[l-1]) . (n[l-1], m) + (n[l], 1)
        
        
        A[l] = np.tanh(Z[l])
        if activations[l] == "relu":        
            A[l] = np.max(0, Z[l])
        elif activations[l] == "tanh":
            A[l] = np.tanh(Z[l])
        elif activations[l] == "sigmoid":
            A[l] = 1.0 / (1.0 + np.exp(-Z[l]))
        else:
            raise Exception("activation " + activations[l] + "not supported")
            
        if iter_i == 0 and 'debug' in globals() and debug:
            shape_info = "Z[l] = np.dot(W[l], A[l - 1]) + b[l] {shape1} <= {shape2} . {shape3} + {shape4}".format(
                shape1=Z[l].shape, shape2=W[l].shape, shape3=A[l-1].shape, shape4=b[l].shape)
            print(l, shape_info)
            
    return A, Z

In [6]:
def backward_propagation(X, A, Y, W, b, activations, iter_i=-1, lambd=0):
    n_x, m = X.shape
    L = len(W) - 1

    dA = [l for l in range(L + 1)] # index 0 not used
    dZ = [l for l in range(L + 1)]
    dW = [l for l in range(L + 1)]
    db = [l for l in range(L + 1)]

    
    # Backward propagation for the last layer
    if activations[L] == "sigmoid":
        dA[L] = -(np.divide(Y, A[L]) - np.divide(1 - Y, 1 - A[L]))
        # The way to calculate dZL is different than other layers, due to different activation function
        dZ[L] = A[L] - Y
    else:
        raise Exception("activation " + activations[L] + "not supported")
        
    dW[L] = np.dot(dZ[L], A[L-1].T) / m  + lambd * W[L] / m
    db[L] = np.sum(dZ[L], axis=1, keepdims=True)

    # Backward propagation for other layers
    for l in reversed(range(1, L)):
        if iter_i == 0 and 'debug' in globals() and debug:
            print(l, "W[l].shape", W[l].shape, "dZ[l+1].shape", dZ[l+1])

        if activations[l] == "tanh":
            dgZl = 1 - np.power(A[l], 2)
            dZ[l] = np.multiply(np.dot(W[l+1].T, dZ[l+1]), dgZl)
#         elif activations[l] == "relu":
        else:
            raise Exception("activation " + activations[L] + "not supported")
            
        dW[l] = np.dot(dZ[l], A[l-1].T) / m + lambd * W[l] / m
        db[l] = np.sum(dZ[l], axis=1, keepdims=True)
        
    return dW, db

In [7]:
def calculate_cost(W, AL, Y, lambd=0):
    n_y, m = Y.shape
        
    # calculate cost
    
    cross_entropy_cost = -(np.dot(Y, np.log(AL.T)) + np.dot(1-Y, np.log(1-AL.T))) / m
    
    regulation = 0
    if lambd > 0:
        for l in range(1, len(W)):
            regulation += np.sum(W[l])
            
    return np.squeeze(cross_entropy_cost) + regulation / m

In [13]:
def neural_network(X, Y, test_set_x, test_set_y, hidden_layer_dims, activations, 
                   num_iterations=10, learning_rate=0.01, early_stop_cost=0., lambd=0):
    """
    X:                 train input
    Y:                 train labels
    hidden_layer_dims: all hidden layer units
    """
    n_x, m = X.shape
    n_y, _ = Y.shape

    # And input layer, and output layer
    layer_dims = [n_x] + hidden_layer_dims + [n_y] 
    L = len(layer_dims) - 1

    activations.insert(0, "not_used")
        
    W, b = initilize_parameters(layer_dims, activations)


    
    # print one param for debug
    if 'debug' in globals() and debug:
        print("init weights", W[1][0][0], W[2][0][0])
    
    costs = []
    for i in range(num_iterations):
        A, Z = forward_propagation(X, W, b, activations)
        dW, db = backward_propagation(X, A, Y, W, b, activations)
            
        if i == 0 and 'debug' in globals() and debug:
            print(i, " derivitive", dW[1][0][0], dW[2][0][0])
            print(i, " derivitive", db[1][0][0], db[2][0][0])

        # update parameters
        for l in range(1, L + 1):
            W[l] = W[l] - learning_rate * dW[l]
            b[l] = b[l] - learning_rate * db[l]
            
        if i == 0 and 'debug' in globals() and debug:        
            print(i, "weights", W[1][0][0])

        cost = calculate_cost(W, A[L], Y, lambd)
        costs.append(cost)
        
        if cost < early_stop_cost:
            break
            
        if i % 100 == 0:
            train_predict, train_accuracy, test_predict, test_accuracy = cat_utils.accuracy_n_layers(
                W, b, test_set_x, test_set_y, A[L], train_set_y)
                
            print("epoch", i, "cost", cost, "train accuracy", train_accuracy, "test accuracy", test_accuracy)

    print(i, cost)
    
    return W, b, A, i, costs

In [14]:
import time
import cat_utils
import numpy as np

train_set_x, train_set_y, test_set_x, test_set_y, classes = cat_utils.load_normalized_dataset()
start = time.time()

np.random.seed(1)
W, b, A, i, costs = neural_network(train_set_x, train_set_y, test_set_x, test_set_y, hidden_layer_dims=[1000, 20],
                                activations=["tanh", "tanh", "sigmoid"],
                                num_iterations = 20001, learning_rate = 0.01, early_stop_cost=0.1, lambd=0.1)

print("total training time", time.time() - start)

layer 1 [ 1000 12288 ] tanh 0.01
layer 2 [ 20 1000 ] tanh 0.01
layer 3 [ 1 20 ] sigmoid 0.01
init weights 0.0162434536366 0.00802711095375
0  derivitive -7.87063390191e-06 -0.000868145506998
0  derivitive -0.00651203787262 -0.352698735691
0 weights 0.016243532343
epoch 0 cost 0.760397016032 train accuracy 0.650717703349 test accuracy 0.34
epoch 100 cost 0.710480814536 train accuracy 0.655502392344 test accuracy 0.34
epoch 200 cost 0.709297703292 train accuracy 0.655502392344 test accuracy 0.34
epoch 300 cost 0.70759692335 train accuracy 0.655502392344 test accuracy 0.34
epoch 400 cost 0.704188997037 train accuracy 0.655502392344 test accuracy 0.34
epoch 500 cost 0.696928691728 train accuracy 0.655502392344 test accuracy 0.34
epoch 600 cost 0.682709396277 train accuracy 0.655502392344 test accuracy 0.34
epoch 700 cost 0.658332287044 train accuracy 0.655502392344 test accuracy 0.34
epoch 800 cost 0.620683752621 train accuracy 0.66028708134 test accuracy 0.4
epoch 900 cost 0.566373932742 

In [17]:
test_set_x.shape

(12288, 50)

In [15]:
import time
import cat_utils
import numpy as np

train_set_x, train_set_y, test_set_x, test_set_y, classes = cat_utils.load_normalized_dataset()
start = time.time()

np.random.seed(1)
W, b, A, i, costs = neural_network(train_set_x, train_set_y, test_set_x, test_set_y, hidden_layer_dims=[1000, 20],
                                activations=["tanh", "tanh", "sigmoid"],
                                num_iterations = 20001, learning_rate = 0.01, early_stop_cost=0.1, lambd=0.3)

print("total training time", time.time() - start)

layer 1 [ 1000 12288 ] tanh 0.01
layer 2 [ 20 1000 ] tanh 0.01
layer 3 [ 1 20 ] sigmoid 0.01
init weights 0.0162434536366 0.00802711095375
0  derivitive -7.87063390191e-06 -0.000868145506998
0  derivitive -0.00651203787262 -0.352698735691
0 weights 0.016243532343
epoch 0 cost 0.760397016032 train accuracy 0.650717703349 test accuracy 0.34
epoch 100 cost 0.710480814536 train accuracy 0.655502392344 test accuracy 0.34
epoch 200 cost 0.709297703292 train accuracy 0.655502392344 test accuracy 0.34
epoch 300 cost 0.70759692335 train accuracy 0.655502392344 test accuracy 0.34
epoch 400 cost 0.704188997037 train accuracy 0.655502392344 test accuracy 0.34
epoch 500 cost 0.696928691728 train accuracy 0.655502392344 test accuracy 0.34
epoch 600 cost 0.682709396277 train accuracy 0.655502392344 test accuracy 0.34
epoch 700 cost 0.658332287044 train accuracy 0.655502392344 test accuracy 0.34
epoch 800 cost 0.620683752621 train accuracy 0.66028708134 test accuracy 0.4
epoch 900 cost 0.566373932742 

In [146]:
train_predict, train_accuracy, test_predict, test_accuracy = cat_utils.accuracy_n_layers(
    W, b, test_set_x, test_set_y, A[len(W)-1], train_set_y)

train accuracy 0.995215311005
test accuracy 0.72


In [148]:
train_predict, train_accuracy, test_predict, test_accuracy = cat_utils.accuracy_n_layers(
    W, b, test_set_x, test_set_y, A[len(W)-1], train_set_y)

train accuracy 0.885167464115
test accuracy 0.54


In [17]:
rn=np.random.randn(3, 4)

In [18]:
r=np.random.rand(3, 4)

In [19]:
np.linalg.norm(r)

2.0338995437889325

In [20]:
a = np.arange(0,27,3).reshape(3,3)

result = a / np.linalg.norm(a, axis=-1)[:, np.newaxis]

In [21]:
result

array([[ 0.        ,  0.4472136 ,  0.89442719],
       [ 0.42426407,  0.56568542,  0.70710678],
       [ 0.49153915,  0.57346234,  0.65538554]])

In [22]:
a

array([[ 0,  3,  6],
       [ 9, 12, 15],
       [18, 21, 24]])

In [23]:
np.linalg.norm(a, axis=-1)

array([  6.70820393,  21.21320344,  36.61966685])

In [27]:
np.sum(np.power(a[:, 0], 2))

405

In [28]:
np.linalg.norm(a)

42.848570571257099

In [29]:
train_set_x

array([[ 0.06666667,  0.76862745,  0.32156863, ...,  0.56078431,
         0.08627451,  0.03137255],
       [ 0.12156863,  0.75294118,  0.27843137, ...,  0.60784314,
         0.09411765,  0.10980392],
       [ 0.21960784,  0.74509804,  0.26666667, ...,  0.64705882,
         0.09019608,  0.20784314],
       ..., 
       [ 0.        ,  0.32156863,  0.54117647, ...,  0.33333333,
         0.01568627,  0.        ],
       [ 0.        ,  0.31372549,  0.55294118, ...,  0.41960784,
         0.01960784,  0.        ],
       [ 0.        ,  0.31764706,  0.55686275, ...,  0.58431373,
         0.        ,  0.        ]])

In [30]:
train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes = cat_utils.load_dataset()

In [34]:
train_set_x_orig.shape

(209, 64, 64, 3)

In [35]:
train_set_x_orig[0][0][0]

array([17, 31, 56], dtype=uint8)

In [40]:
np.mean(train_set_x_orig, axis=0).shape

(64, 64, 3)

In [47]:
mean = np.mean(train_set_x_orig, axis=0)[0][0]

In [48]:
sigma = np.sum(np.power(train_set_x_orig, 2), axis=0) / train_set_x_orig.shape[0]

In [50]:
normed = (train_set_x_orig - mean)/sigma

In [51]:
normed.shape

(209, 64, 64, 3)

In [56]:
normed[1][10][2]

array([ 0.58989828,  0.16329855,  0.40641656])