In [None]:
from numpy import (exp, log, mean, std, pi, dot, sqrt, random, arange, array, 
                   unique, zeros, argmax, append, pad, eye, max, squeeze, 
                   multiply, sum, divide)
### Q1: your code starts here. 

# Define model structure
# each image is 2D, 62x47, with each element defining pixel intensity (0, 255)
# since image isn't nxn, we can apply padding to the cols - if needed
# image is grayscale input -> depth = 1, input = 62*47 values per image once flattened

# output is # classes or unique y_values

# apply(convolve) filter (n x n block) to image -> feature map
# loop until the cost function stops decreasing -> each forward propagation yields the current loss

### padding if needed
def padding(x):
    """Zero-pad a stack of images so that every image becomes square.

    Parameters
    ----------
    x : ndarray
        Array indexed as (image, row, column); only ``x.shape[1]`` and
        ``x.shape[2]`` are inspected.

    Returns
    -------
    ndarray
        ``x`` itself when the row and column counts already match; otherwise
        a new array whose shorter image axis is extended with trailing zeros
        (extra columns on the right, or extra rows at the bottom).
    """
    n_rows, n_cols = x.shape[1], x.shape[2]
    if n_rows == n_cols:
        return x

    # A conditional is used instead of max(): this notebook imports numpy's
    # ``max``, whose second positional argument is an axis, not a value.
    side = n_rows if n_rows > n_cols else n_cols

    # Pad only at the end of each image axis; the leading (image) axis is untouched.
    return pad(x, [(0, 0), (0, side - n_rows), (0, side - n_cols)], mode='constant')

### activation/cost functions
#prob norm-dist
def p(x, y):
    """Evaluate a normal density at ``x`` using the mean and std of ``y``.

    The mean and standard deviation are estimated from ``y`` with numpy's
    ``mean`` and ``std``; the density with those parameters is then evaluated
    at ``x`` (scalar or array).
    """
    mu, sig = mean(y), std(y)
    normaliser = 1.0 / (sig * (2 * pi) ** (0.5))
    z_score = (x - mu) / sig
    return normaliser * exp(-0.5 * z_score ** 2)
 
def logloss(y):
    """Average log loss of ``y`` against the normal density fitted to ``y``.

    For every element ``y[i]`` the "probability" is ``p(y[i], y)`` and the
    summand is ``y[i]*log(prob) + (1 - y[i])*log(1 - prob)``; the sum is
    negated and divided by ``len(y)``.

    Changes from the earlier version:
    * the loop body is indented so the function parses;
    * iteration starts at index 0, so the first sample is no longer skipped;
    * ``(1 - y[i])`` is parenthesised; previously precedence turned the
      second term into ``1 - y[i]*log(...)``.

    NOTE(review): ``p`` returns a density value, which may exceed 1 and make
    ``log(1 - prob)`` undefined — confirm this is the intended probability.
    """
    N = len(y)
    total = 0
    for i in range(N):
        prob = p(y[i], y)
        total = total + y[i] * log(prob) + (1 - y[i]) * log(1 - prob)
    return (1.0 / -N) * total

def sigmoid(z, derivative=False):
    """Logistic sigmoid of ``z`` or its derivative.

    Returns
    -------
    * ``derivative=False``: the pair ``(sigmoid(z), z)`` — the activation
      together with the untouched input.
    * ``derivative=True``: a single value, ``sigmoid(z) * (1 - sigmoid(z))``.

    Note the two modes return differently shaped results (tuple vs. value).
    """
    activation = 1. / (1. + exp(-z))
    if not derivative:
        return activation, z
    return activation * (1 - activation)

def Square_loss(A, Y, derivative=False):
    """Compute the square loss or its derivative.

    Parameters
    ----------
    A, Y : array-like
        Prediction and target. The cost uses ``dot(error, error)``, which is
        the sum of squared errors when both are 1-D vectors.
    derivative : bool
        When true, return the error ``A - Y`` (the gradient of the cost with
        respect to ``A``) instead of the cost.

    Returns
    -------
    ``A - Y`` when ``derivative`` is true, otherwise ``0.5 * dot(A - Y, A - Y)``.
    """
    # The body previously had inconsistent indentation and did not parse.
    error = A - Y
    if derivative:
        return error

    cost = 1 / 2 * dot(error, error)
    return cost

def Logistic_loss(A, Y, derivative=False):
    """Compute the logistic (cross-entropy) loss or its derivative.

    Parameters
    ----------
    A, Y : array-like
        Prediction and target. The cost is built from ``dot`` products, so
        it is a scalar when both are 1-D vectors.
    derivative : bool
        When true, return ``(A - Y) / (A * (1 - A))`` element-wise instead of
        the cost.

    Returns
    -------
    The derivative described above, or
    ``-dot(log(A), Y) - dot(log(1 - A), 1 - Y)``.
    """
    # The body previously had inconsistent indentation and did not parse.
    if derivative:
        num = A - Y
        dem = A * (1 - A)  # for a sigmoid output this equals g'(Z)
        return num / dem

    cost = -dot(log(A), Y) - dot(log(1 - A), 1 - Y)
    return cost

def onehot(y):
    """Return the one-hot encoding of integer labels ``y``.

    The number of classes is taken as ``max(y) + 1``, so labels are assumed
    to be non-negative integers (they are used to index rows of an identity
    matrix). Singleton axes are squeezed out and the result is transposed,
    so for a ``(1, n_samples)`` label row the output has one row per class
    and one column per sample.
    """
    # The docstring used to sit at column 0, which made the definition unparsable.
    n_classes = int(max(y)) + 1
    encoded = eye(n_classes)[y]  # row i of the identity is the code for class i
    return squeeze(encoded).T


In [None]:
### Initialize model's parameters
def init_params(layers_dims):
    """Create the initial weights and biases for every layer.

    Parameters
    ----------
    layers_dims : sequence of int
        Layer sizes, input layer first.

    Returns
    -------
    dict
        ``"W<l>"`` holds a ``(layers_dims[l], layers_dims[l-1])`` matrix of
        standard-normal draws scaled by 0.01, and ``"b<l>"`` a
        ``(layers_dims[l], 1)`` column of zeros, for ``l = 1 .. len - 1``.

    The numpy global RNG is re-seeded with 3 on every call, so repeated calls
    return identical weights.
    """
    random.seed(3)
    parameters = {}

    fan_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_in, n_out) in enumerate(fan_pairs, start=1):
        weights = random.randn(n_out, n_in) * 0.01
        biases = zeros((n_out, 1))
        parameters[f"W{layer}"] = weights
        parameters[f"b{layer}"] = biases
        assert parameters[f"W{layer}"].shape == (n_out, n_in)
        assert parameters[f"b{layer}"].shape == (n_out, 1)

    return parameters


def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    For each layer ``l`` (``len(parameters) // 2`` layers in total) the
    entries ``"W<l>"`` and ``"b<l>"`` are replaced by
    ``value - learning_rate * grads["d" + key]``.

    The ``parameters`` dict is updated in place and also returned.
    """
    n_layers = len(parameters) // 2
    for layer in range(1, n_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            step = learning_rate * grads["d" + key]
            parameters[key] = parameters[key] - step
    return parameters

In [None]:
def linear_forward(A_prev, W, b):
    """Affine step of one layer: ``Z = dot(W, A_prev) + b``.

    Returns ``Z`` together with the tuple ``(A_prev, W, b)`` so the backward
    pass can reuse the inputs.
    """
    inputs = (A_prev, W, b)
    pre_activation = dot(W, A_prev) + b
    return pre_activation, inputs


def linear_activation_forward(A_prev, W, b, activation_fn=sigmoid):
    """Forward pass through one layer: affine step followed by sigmoid.

    Parameters
    ----------
    A_prev, W, b
        Passed straight to ``linear_forward``.
    activation_fn
        Accepted for interface compatibility but not consulted — ``sigmoid``
        is always applied.

    Returns
    -------
    (A, cache)
        ``A`` is the activated output; ``cache`` is
        ``(linear_cache, activation_cache)`` where the second item is
        whatever ``sigmoid`` returns alongside the activation.
    """
    pre_activation, linear_cache = linear_forward(A_prev, W, b)
    activated, activation_cache = sigmoid(pre_activation)

    # One output row per unit of this layer, one column per input column.
    expected_shape = (W.shape[0], A_prev.shape[1])
    assert activated.shape == expected_shape

    return activated, (linear_cache, activation_cache)


def L_model_forward(X, parameters, activation=sigmoid):
    """Run the forward pass through every layer of the network.

    Parameters
    ----------
    X : ndarray
        Network input; ``X.shape[1]`` is used as the number of columns the
        output must have.
    parameters : dict
        Holds ``"W<l>"`` / ``"b<l>"`` for ``l = 1 .. L`` with
        ``L = len(parameters) // 2``.
    activation
        Forwarded as ``activation_fn`` for every layer except the last; the
        last layer is always called with ``sigmoid``.

    Returns
    -------
    (AL, caches)
        ``AL`` is the output of the last layer and ``caches`` the list of
        per-layer caches returned by ``linear_activation_forward``.

    The output-shape check is derived from the last weight matrix rather
    than a hard-coded 5, so networks with any number of output units pass.
    """
    A = X
    caches = []
    L = len(parameters) // 2

    for l in range(1, L + 1):
        layer_activation = sigmoid if l == L else activation
        A, cache = linear_activation_forward(
            A, parameters["W" + str(l)], parameters["b" + str(l)],
            activation_fn=layer_activation)
        caches.append(cache)

    AL = A
    n_outputs = parameters["W" + str(L)].shape[0]
    assert AL.shape == (n_outputs, X.shape[1])
    return AL, caches


def forward_prop(x, params, activation=sigmoid):
    """Forward pass that caches ``(layer input, Z)`` for every layer.

    Parameters
    ----------
    x : ndarray
        Input to the first layer.
    params : dict
        Holds ``"W<l>"`` / ``"b<l>"`` for ``l = 1 .. len(params) // 2``.
    activation : callable
        Applied to every layer's ``Z``. It may return either the activation
        alone or a tuple whose first item is the activation (as ``sigmoid``
        does); only the activation is propagated.

    Returns
    -------
    (A, caches)
        Output of the last layer and a list with one ``(A_prev, Z)`` tuple
        per layer, in layer order.

    Changes from the earlier version: the per-layer work is inside the loop
    again, the loop reaches the output layer (``L`` inclusive), the tuple
    returned by ``sigmoid`` is unpacked, and the debug ``print`` is removed.
    """
    A = x  # input to first layer, i.e. the training data
    caches = []
    L = len(params) // 2

    for l in range(1, L + 1):
        A_prev = A
        W, b = params['W' + str(l)], params['b' + str(l)]

        # Linear transformation
        linear = dot(W, A_prev)
        if linear.ndim == 1:
            # Single 1-D sample: flatten an (n, 1) bias column so it adds
            # element-wise instead of broadcasting the sum to (n, n).
            b = b.reshape(-1)
        Z = linear + b

        # Store what the backward pass needs for this layer
        caches.append((A_prev, Z))

        # Apply the activation; keep only the activation value
        result = activation(Z)
        A = result[0] if isinstance(result, tuple) else result

    return A, caches

In [None]:
# Backward Propagation

def sigmoid_gradient(dA, Z):
    """Back-propagate ``dA`` through a sigmoid evaluated at ``Z``.

    Returns ``dA * s * (1 - s)`` where ``s`` is the first item returned by
    ``sigmoid(Z)``.
    """
    activation, _ = sigmoid(Z)
    return dA * activation * (1 - activation)

def linear_backword(dZ, cache):
    """Gradients of the affine step ``Z = dot(W, A_prev) + b``.

    Parameters
    ----------
    dZ : ndarray
        Gradient of the cost with respect to ``Z``.
    cache : tuple
        ``(A_prev, W, b)``; ``A_prev.shape[1]`` is used as the sample count
        ``m`` by which ``dW`` and ``db`` are averaged.

    Returns
    -------
    (dA_prev, dW, db)
        Each has the same shape as ``A_prev``, ``W`` and ``b`` respectively
        (enforced by assertions).
    """
    A_prev, W, b = cache
    scale = 1 / A_prev.shape[1]

    grad_W = scale * dot(dZ, A_prev.T)
    grad_b = scale * sum(dZ, axis=1, keepdims=True)
    grad_A_prev = dot(W.T, dZ)

    # Every gradient must mirror the shape of the quantity it belongs to.
    for gradient, reference in ((grad_A_prev, A_prev), (grad_W, W), (grad_b, b)):
        assert gradient.shape == reference.shape

    return grad_A_prev, grad_W, grad_b



def linear_activation_backward(dA, cache, activation_fn):
    """Backward pass through one layer (sigmoid, then affine step).

    ``cache`` is the ``(linear_cache, activation_cache)`` pair; ``dA`` is
    first pushed through ``sigmoid_gradient`` and the result through
    ``linear_backword``.

    ``activation_fn`` is accepted but not consulted — the sigmoid gradient
    is always used.

    Returns ``(dA_prev, dW, db)`` exactly as ``linear_backword`` produces them.
    """
    linear_cache, activation_cache = cache
    pre_activation_grad = sigmoid_gradient(dA, activation_cache)
    return linear_backword(pre_activation_grad, linear_cache)


def L_model_backward(AL, y, caches, activation=sigmoid):
    """Back-propagate through all layers and collect the gradients.

    Parameters
    ----------
    AL : ndarray
        Output of the forward pass.
    y : ndarray
        Targets; reshaped to ``AL.shape`` before use.
    caches : list
        One cache per layer, in layer order.
    activation
        Forwarded to ``linear_activation_backward`` for every layer except
        the last, which is always called with ``sigmoid``.

    Returns
    -------
    dict
        ``"dW<l>"`` and ``"db<l>"`` for ``l = 1 .. len(caches)``, plus
        ``"dA<l-1>"`` (the gradient handed to the layer below).
    """
    targets = y.reshape(AL.shape)
    n_layers = len(caches)
    grads = {}

    # Gradient of the cross-entropy cost with respect to the network output.
    output_grad = divide(AL - targets, multiply(AL, 1 - AL))

    # Output layer.
    dA_prev, dW, db = linear_activation_backward(
        output_grad, caches[n_layers - 1], sigmoid)
    grads["dA" + str(n_layers - 1)] = dA_prev
    grads["dW" + str(n_layers)] = dW
    grads["db" + str(n_layers)] = db

    # Remaining layers, from the last hidden layer down to the first.
    for layer in reversed(range(1, n_layers)):
        dA_prev, dW, db = linear_activation_backward(
            grads["dA" + str(layer)], caches[layer - 1], activation)
        grads["dA" + str(layer - 1)] = dA_prev
        grads["dW" + str(layer)] = dW
        grads["db" + str(layer)] = db

    return grads


def backward_prop(AL, Y, caches, params, loss_function=Square_loss, activation=sigmoid):
    """Back-propagation for a single sample using ``(A_prev, Z)`` caches.

    Parameters
    ----------
    AL, Y
        Network output and target, handed to ``loss_function``.
    caches : list
        One ``(A_prev, Z)`` tuple per layer, in layer order (``caches[0]``
        belongs to layer 1).
    params : dict
        Holds the weights ``"W<l>"``.
    loss_function, activation : callable
        Both are called with ``derivative=True``.

    Returns
    -------
    dict
        ``"db<l>"`` (the layer's delta) and ``"dW<l>"`` (outer product of the
        delta with the layer input) for every layer.

    The outer products are built with ``array([v]).T @ array([u])``, which
    presumably expects 1-D vectors — confirm against the caller.

    Changes from the earlier version: the running delta starts from the
    output-layer delta instead of reading an unassigned ``delta_prev``; the
    loop walks cache indices ``L-2 .. 0`` so that ``W<l+2>`` is always the
    layer above the one being processed (the old range asked for
    ``W<L+1>``); and ``return`` now follows the loop, so every layer gets
    gradients and a one-layer network no longer returns ``None``.
    """
    grads = {}
    L = len(caches)

    # Output layer: delta_L = dLoss/dA * g'(Z_L)
    A_prev, Z_last = caches[L - 1]
    delta = loss_function(AL, Y, derivative=True) * activation(Z_last, derivative=True)
    grads['db' + str(L)] = delta
    grads['dW' + str(L)] = array([delta]).T @ array([A_prev])

    # Hidden layers: cache index l corresponds to layer l + 1.
    for l in range(L - 2, -1, -1):
        A_prev, Z_curr = caches[l]
        W_next = params['W' + str(l + 2)]  # weights of the layer above
        delta = dot(W_next.T, delta) * activation(Z_curr, derivative=True)
        grads['db' + str(l + 1)] = delta
        grads['dW' + str(l + 1)] = array([delta]).T @ array([A_prev])

    return grads

In [None]:
# Compute cross-entropy cost
def compute_cost(AL, y):
    """Cross-entropy cost averaged over the columns of ``y``.

    Computes ``-(1/m) * sum(y*log(AL) + (1-y)*log(1-AL))`` where
    ``m = y.shape[1]`` and the sum runs over every element.
    """
    n_samples = y.shape[1]
    hit_term = multiply(y, log(AL))
    miss_term = multiply(1 - y, log(1 - AL))
    return - (1 / n_samples) * sum(hit_term + miss_term)

In [None]:
# training

def train_NN(Xtrain, Ytrain_vec, epochs, layer_dim,
             learning_rate, cost_function, activation_function):
    """Train the network with one full-batch gradient step per epoch.

    Parameters
    ----------
    Xtrain, Ytrain_vec
        Inputs and targets, passed whole to ``L_model_forward``,
        ``compute_cost`` and ``L_model_backward`` on every epoch.
    epochs : int
        Number of gradient steps.
    layer_dim : sequence of int
        Layer sizes handed to ``init_params``.
    learning_rate : sequence of float
        Step size per epoch; ``learning_rate[i]`` is used in epoch ``i``, so
        it needs at least ``epochs`` entries.
    cost_function
        Accepted for interface compatibility; the cost is always computed
        with ``compute_cost``.
    activation_function
        Forwarded as ``activation`` to the forward and backward passes.

    Returns
    -------
    (params, cost_span)
        The trained parameters and an array holding the cost measured at the
        start of every epoch (before that epoch's update).

    Changes from the earlier version: the training step used to sit outside
    the epoch loop, so only the last learning rate was used, the same
    full-batch step was repeated once per training sample, and ``cost_span``
    was never filled (it stayed all zeros). The unused per-sample index
    loop, shuffle, and cost accumulator were removed.
    """
    # Step 2: initialise the parameters
    params = init_params(layer_dim)
    cost_span = zeros(epochs)

    # Step 3: train the neural network
    for i in range(epochs):
        l_rate = learning_rate[i]

        # sub step 1: forward pass and current cost
        Y_hat, caches = L_model_forward(Xtrain, params,
                                        activation=activation_function)
        cost = compute_cost(Y_hat, Ytrain_vec)

        # sub step 2: gradients
        grads = L_model_backward(Y_hat, Ytrain_vec, caches,
                                 activation=activation_function)

        # sub step 3: gradient-descent update
        params = update_parameters(params, grads, l_rate)

        cost_span[i] = cost
        if (i + 1) % 100 == 0:
            print(f"The cost after {i + 1} iterations is: {cost:.4f}")

    return params, cost_span

def prediction(params, Xtest, activation=sigmoid):
    """Predict a class index for every row of ``Xtest``.

    Each ``Xtest[j]`` is pushed through ``forward_prop`` and the index of the
    largest output is stored.

    Parameters
    ----------
    params : dict
        Network parameters as consumed by ``forward_prop``.
    Xtest : ndarray
        One sample per entry along axis 0.
    activation : callable, optional
        Activation handed to ``forward_prop``. This replaces the previous
        lookup of a notebook-level ``activation_function`` variable, which
        made the function fail unless that global had been defined first.

    Returns
    -------
    ndarray
        Float array of length ``Xtest.shape[0]`` with the predicted indices.
    """
    N = Xtest.shape[0]
    Ypred = zeros(N)
    for j in range(N):
        Y_hat, _ = forward_prop(Xtest[j], params, activation=activation)
        Ypred[j] = argmax(Y_hat)
    return Ypred

In [None]:
# Flatten the training images and assemble the network configuration.
# X_train / y_train are not defined in the cells shown here; they must
# already exist in the kernel when this cell runs.
n_class = unique(y_train)  # distinct label values (not referenced again in the cells shown)
N_train = X_train.shape[0]
X_train1 = X_train.reshape(N_train, -1).T # num_feat x num_images
y_train1 = y_train.reshape(-1, N_train)  # rows of N_train labels — presumably shape (1, N_train); confirm

INPUT_LAYER = X_train1.shape[0]
# NOTE(review): onehot sizes its output from max(label) + 1, which matches
# this count only if the labels are exactly 0 .. n_classes - 1 — confirm.
OUTPUT_LAYER = len(unique(y_train1))
# Despite their names, DEPTH and NEURONS are the sizes of the first and
# second hidden layers respectively (see layer_dim below).
DEPTH = 5
NEURONS = 5
layer_dim = [INPUT_LAYER, DEPTH, NEURONS, OUTPUT_LAYER]
Y_train_vec = onehot(y_train1) # onehot transposes its result: presumably num_classes x num_images — confirm

# cost_function is handed to train_NN, which currently computes the cost with compute_cost instead.
cost_function       = Logistic_loss #Logistic_loss
activation_function = sigmoid

epochs = 100
epochs_span = arange(epochs)
# One step size per epoch, decaying as (epoch + 1) ** -0.1.
learning_rate = 1/((epochs_span+1)**0.1)

# Only used to list the parameter names below; train_NN builds its own parameters.
params = init_params(layer_dim)
# X_train.shape
print(params.keys())

In [None]:
# Sanity check of the training array layouts. Both shapes are returned as one
# tuple because a notebook cell only displays the value of its last expression.
X_train1.shape, Y_train_vec.shape

In [None]:
# Train the network on the flattened training set. Returns the fitted
# parameters and the per-epoch cost array produced by train_NN.
params_Log, cost_span_Log = train_NN(X_train1,Y_train_vec, epochs, layer_dim, 
                                     learning_rate,cost_function, 
                                     activation_function)


In [None]:
# Show both training results. The two names used to be on separate lines with
# a stray trailing comma, so only cost_span_Log was ever displayed.
params_Log, cost_span_Log

In [None]:
def accuracy(X, parameters, y, activation_fn="relu"):
    """Report the percentage of samples the network classifies correctly.

    Parameters
    ----------
    X, parameters
        Passed to ``L_model_forward``.
    y : ndarray
        True labels: either class indices (any shape holding one value per
        sample) or, when its shape equals that of the network output, a
        one-hot matrix.
    activation_fn
        Forwarded unchanged as the third argument of ``L_model_forward``.

    Returns
    -------
    str
        ``"The accuracy rate is: <pct>%."`` with two decimals.

    With a single output row the output is thresholded at 0.5, as before.
    With several output rows the predicted class is the row with the largest
    output per column. The earlier version thresholded every row and
    compared the resulting 0/1 matrix to the labels by broadcasting, which
    does not measure classification accuracy.
    """
    probs, _ = L_model_forward(X, parameters, activation_fn)

    if probs.shape[0] == 1:
        # Binary case: one output unit, threshold at 0.5.
        predicted = (probs >= 0.5) * 1
        truth = y
    else:
        # Multi-class case: pick the most activated output unit per sample.
        predicted = argmax(probs, axis=0)
        truth = argmax(y, axis=0) if y.shape == probs.shape else y.reshape(-1)

    score = mean(predicted == truth) * 100
    return f"The accuracy rate is: {score:.2f}%."

# Flatten the test images the same way as the training images and score the
# trained network. X_test / y_test are not defined in the cells shown here.
N_test = X_test.shape[0]

X_test1 = X_test.reshape(N_test, -1).T # num_feat x num_images
y_test1 = y_test.reshape(-1, N_test)  # rows of N_test labels — presumably shape (1, N_test); confirm

# activation_fn is given as the string "sigmoid"; accuracy forwards it to L_model_forward as-is.
accuracy(X_test1, params_Log, y_test1, activation_fn="sigmoid")