In [7]:
import numpy as np

### 1. Numpy implimentation of Gradient Descent

- Let's say our cost function is sum of squared error devided by 2(for calculation convience): $$E = \frac{1}{2}\sum_\mu(y^{\mu}-\hat{y}^{\mu})^2$$
- $$E = \frac{1}{2}\sum_\mu(y^{\mu}-\hat{y}^{\mu}) = \frac{1}{2}\sum_\mu(y^{\mu}-f(\sum_iw_ix_{i}^{mu}))^2$$
- $\mu$ is the number of records, $i$ is the number of variables 
- let first take a look at the gradient for one record
- now, take the derative of out cost function in terms of $w_i$:
$$\frac{d}{dw_i}=-(y-\hat{y})f'(h)x_i$$
- $f'(h)$ is the derative of the activation function, $\eta$ is the learning rate
- $$\Delta w_i = \eta(y-\hat{y})f'(h)x_i$$
- we define error term: $$\delta = (y-\hat{y})f'(h)$$
- now our weight update is: $$w_i = w_i + \eta\delta x_i$$


#### Now let's implement it 

In [8]:
# Defining the sigmoid function for activations
def sigmoid(x):
    return 1/(1+np.exp(-x))     # is it basically what used in logistic regression 
# Derivative of the sigmoid function
def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))


##### calssification of school admition 

In [9]:
import numpy as np
from data_prep import features, targets, features_test, targets_test

print('features size:',features.shape)
print('target size:', targets.shape)

features size: (360, 6)
target size: (360,)


in real world, we will vectorized this, but for now, let's just iterate through them 

In [10]:
# Use to same seed to make debugging easier
np.random.seed(42)
n_records, n_features = features.shape                                ## get number of observation and number of variables 
last_loss = None                                                      ## set cost function to be none as initiation 
## it is common to initiat random small weights as 1 / n_features**.5
weights = np.random.normal(scale=1 / n_features**.5, size=n_features) # Initialize weights
# Neural Network hyperparameters
epochs = 1000           ## number of iteration                                              
learnrate = 0.5         ## learning rate 

############################################
## this is the process runs gradient decent#
############################################
for e in range(epochs):                        # run 1000 iterations
    del_w = np.zeros(weights.shape)
    for x, y in zip(features.values, targets): # Loop through all records, x is the input, y is the target
        output = sigmoid(np.dot(x,weights))          # Calculate the output
        error = y-output                             # calculate the error
        error_term = error * output * (1 - output)   # error term \eta
        del_w += error_term*x                        # and add it to the total weight change
        ## del_w is a vector of 6 items 
        ## end if inner loop
        
    weights += learnrate * del_w / n_records         # take the average change and update it 
    
    # Printing out the mean square error on the training set
    if e % (epochs / 5) == 0:   ## print out 5 results 
        out = sigmoid(np.dot(features, weights))
        loss = np.mean((out - targets) ** 2)
        if last_loss and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss

Train loss:  0.26276093849966364
Train loss:  0.20084292908073417
Train loss:  0.19779851396686018
Train loss:  0.19723507746241067
Train loss:  0.19706766341315074


In [11]:
# Calculate accuracy on test data
tes_out = sigmoid(np.dot(features_test, weights))
predictions = tes_out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))

Prediction accuracy: 0.725


### Backpropagation

- for multi-layer backpropagation, the error term, based on chain rule is : 
$$\delta^h_j = \sum W_{jk}\delta^0f'(h_j)$$
$$\Delta w_{pq} = \eta \delta_{output}V_{in}$$
- for detailed explaination please see: https://www.youtube.com/watch?v=59Hbtz7XgjM

##### Implementation 

<img src='data/backprop.png' width=80% height=80%>

#### In real world we may want to vectorize this as well 

In [14]:
from data_prep import features, targets, features_test, targets_test
np.random.seed(21)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
# Derivative of the sigmoid function
def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

# Hyperparameters
n_hidden = 2        # number of hidden units
epochs = 5500
learnrate = 0.005   

n_records, n_features = features.shape                               # get feature dimention
last_loss = None
# Initialize weights
weights_input_hidden = np.random.normal(scale=1 / n_features ** .5,
                                        size=(n_features,  ))
weights_hidden_output = np.random.normal(scale=1 / n_features ** .5,
                                         size=n_hidden)

## run the network 
for e in range(epochs):
    del_w_input_hidden = np.zeros(weights_input_hidden.shape)       # delta w starts with zero
    del_w_hidden_output = np.zeros(weights_hidden_output.shape)
    for x, y in zip(features.values, targets):
        ## Forward pass ##
        hidden_input = x.dot(weights_input_hidden)
        hidden_output = sigmoid(hidden_input)
        output = sigmoid(np.dot(hidden_output,weights_hidden_output))
        
        ## Backward pass##
        error = y - output                              ## predicted error
        ## calculate error term
        output_error_term = error*sigmoid_prime(np.dot(hidden_output,weights_hidden_output)) 
        # you can also do this output_error_term = error * output * (1 - output), will probably save some computing 
        # but using is sigmoid_prime is more clear to me, as we are using chain rule to get the gradient
        
        ## calculate hidden error term
        hidden_error_term = np.dot(output_error_term, weights_hidden_output)* sigmoid_prime(hidden_input)
        ## same thing here, can use the reulst of hidden_output 
        
        ## Update the change in weights
        del_w_hidden_output += output_error_term * hidden_output
        del_w_input_hidden += hidden_error_term * x[:, None] ## x need to be a column vector
       ##end of loop
    
    # TODO: Update weights
    weights_input_hidden += learnrate * del_w_input_hidden / n_records
    weights_hidden_output += learnrate * del_w_hidden_output / n_records
    
        # Printing out the mean square error on the training set
    if e % (epochs / 5) == 0:
        hidden_output = sigmoid(np.dot(x, weights_input_hidden))
        out = sigmoid(np.dot(hidden_output,
                             weights_hidden_output))
        loss = np.mean((out - targets) ** 2)

        if last_loss and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss

###########################################
# Calculate accuracy on test data
hidden = sigmoid(np.dot(features_test, weights_input_hidden))
out = sigmoid(np.dot(hidden, weights_hidden_output))
predictions = out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))

Train loss:  0.27630002065852294
Train loss:  0.2611609415133075
Train loss:  0.2502600961884271
Train loss:  0.24247242010582226
Train loss:  0.23690102376400965
Prediction accuracy: 0.750


In [16]:
from data_prep import features, targets, features_test, targets_test

In [17]:
features.shape

(360, 6)

In [18]:
targets.shape

(360,)