# Neural networks basics

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from IPython.display import Markdown, display, HTML

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

**Task 1.** Write a method for performing feed forward for a network with 2 input neurons, 3 hidden neurons with ReLU as the activation function, and 1 output neuron with no activation function. The interface of the method should be as follows:

    feed_forward_1(x, w1, w2)
    
where x is a numpy array of size 1x2, w1 is a numpy array of size 3x2 of weights from the input layer to the hidden layer, w2 is a numpy array of size 1x3. The result should be a single number.

As a helper function you can write the relu function:

    relu(x)
   
which takes a numpy array as input and applies the ReLU function element-wise.

Test it on 2-3 examples, calculate the result also with pen and paper by yourself (it's very important to do it at least once in your life!) and compare your results with the function results.

In [2]:
# EXAMPLE
# def relu(x):
#     return np.maximum(x, 0)

# def feed_forward_1(x, w1, w2):
#     h = relu(np.matmul(w1, x))
#     y = np.matmul(w2, h)
#     return y

def relu(x):
    return [element if element > 0 else 0 for element in x]

def feed_forward_1(x, w1, w2):
    return np.dot(w2, relu(np.dot(w1, x)))

# Test
display(feed_forward_1(np.array([1,-2]), np.array([[1,2],[3,4],[5,6]]),np.array([[2,2,2]])))


array([0])

**Task 2.** Write a method for performing feed forward for a network with 2 input neurons, 3 hidden neurons with sigmoid activation function, and 1 output neuron with sigmoid activation function. The interface of the method should be as follows:

    feed_forward_2(x, w1, w2)
    
where x is a numpy array of size 1x2, w1 is a numpy array of size 3x2 of weights from the input layer to the hidden layer, w2 is a numpy array of size 1x3. The result should be a single number.

As a helper function you can write the sigmoid function:

    sigmoid(x)
   
which takes a numpy array as input and applies the logistic function element-wise.

Test it on 2-3 examples. Try to estimate the result by hand and compare both results.

In [3]:
# EXAMPLE
# def sigmoid(x):
#     return 1 / (1 + np.exp(-x))

# def feed_forward_2(x, w1, w2):
#     h = sigmoid(np.matmul(w1, x))

#     y = sigmoid(np.matmul(w2, h))
#     return y

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def feed_forward_2(x, w1, w2):
    return sigmoid(np.dot(w2, sigmoid(np.dot(w1, x))))


# Test
display(feed_forward_2(np.array([1,2]), np.array([[1,2],[3,4],[5,6]]),np.array([[2,2,2]])))


array([0.99749406])

**Task 3.** Write a method for performing feed forward for a network with 2 input neurons, 3 hidden neurons with relu activation, and 3 output neurons with no activation function but softmax applied to them. The interface of the method should be as follows:

    feed_forward_3(x, w1, w2)
    
where x is a numpy array of size 1x2, w1 is a numpy array of size 3x2 of weights from the input layer to the hidden layer, w2 is a numpy array of size 3x3. The result should be a numpy array of size 1x3.

As a helper function you can write the softmax function:

    softmax(x)
   
which takes a numpy array as input, applies softmax to it and returns a numpy array of the same size.

Test it on 2-3 examples. Try to estimate the result by hand and compare both results. Verify that the sum of output neurons is equal to 1.

In [4]:
# EXMPLE
# def relu(x):
#     return np.maximum(x, 0)

# def softmax(x):
#     return np.exp(x) / np.sum(np.exp(x))

# def feed_forward_3(x, w1, w2):
#     h = relu(np.matmul(w1, x))
#     y = softmax(np.matmul(w2, h))
#     return y

# https://www.delftstack.com/howto/numpy/numpy-softmax/
def relu(x):
    return [element if element > 0 else 0 for element in x]


def softmax(x):
    return np.exp(x) / np.sum(np.exp(x),axis=0)


def feed_forward_3(x, w1, w2):
    h = relu(np.dot(w1, x))   # wejście * wagi_1 = pierwsza warstwa  + relu activation (spr czy >= 0) 
    y = softmax(np.dot(w2, h)) # wejscie warstwy_1 * wagi_2 = druga warstwa = 3 oczekiwane neurony na wyjsciu (bez relu ale z softmax - zgodnie z trescia))
    return y


# Test I
x = np.array([1, 2])
w1 = np.array([[-1, 1],
               [-0.5, 2.5],
               [0, 3]])
w2 = np.array([[-1, 2.0, 1.0],
               [1, 0, -1],
               [2, 3, -0.5]])
print(feed_forward_3(x, w1, w2))

# Test II
x = np.array([-1, 1])
w1 = np.array([[2, -1],
               [-5, -2],
               [1, 3]])
w2 = np.array([[-1, -2, 1],
               [1, 0, -1],
               [2, -3, 0.5]])
print(feed_forward_3(x, w1, w2))

# Test III
x = np.array([1, 2])
w1 = np.array([[-1, 1],
               [0.5, -2],
               [0, 3]])
w2 = np.array([[-1, 2, 1],
               [1, 0, -1],
               [2, -3, 0.5]])
print(feed_forward_3(x, w1, w2))


[8.17574472e-01 4.58070334e-09 1.82425523e-01]
[0.11894324 0.87887824 0.00217852]
[4.99988650e-01 2.26994496e-05 4.99988650e-01]


**Task 4.** Write a method for calculating the squared error:

    se(y_bar, y)
    
where y_bar is a numpy array of predicted results and y is a numpy array of real values to be predicted.

Test it on the 2-3 examples you created for feed_forward_1 and find the SE with respect to real values chosen by hand. Experiment with several values to get a grip of how SE works.

In [5]:
# EXAMPLE
# def se(y_bar, y):
#     return np.sum(np.power(y_bar - y, 2))

# mse error
def se(y_bar, y):
     return (np.square(y_bar - y)).mean()

# Test
print(se(-1, 2))
print(se(1, -3))
print(se(-4, -2))
print(se(3, 5))

9.0
16.0
4.0
4.0


**Task 5.** Write a method for calculating the cross-entropy loss:

    crossentropy(y_bar, y)
    
where y_bar is a numpy array of predicted results and y is a numpy array of real values to be predicted (y must contain one value of 1 and 0 on all other positions - just like in one-hot encoding).

Test it on the 2-3 examples you created for feed_forward_3 and find the cross-entropy with respect to vectors of real values chosen by hand. Experiment with several target vectors to get a grip of how cross-entropy works.

In [42]:
# https://androidkt.com/implement-softmax-and-cross-entropy-in-python-and-pytorch/
# https://stackoverflow.com/questions/47377222/what-is-the-problem-with-my-implementation-of-the-cross-entropy-function
def cross_entropy(predictions, targets, epsilon=1e-12):
    """
    Computes cross entropy between targets (encoded as one-hot vectors)
    and predictions. 
    Input: predictions (N, k) ndarray
           targets (N, k) ndarray        
    Returns: scalar
    """
    predictions = np.clip(predictions, epsilon, 1. - epsilon)
    N = predictions.shape[0]
    ce = -np.sum(targets*np.log(predictions+1e-9))/N
    return ce

def crossentropy(y_bar, y):
    y_1 = np.log(y_bar[y==1])
    y_0 = np.log(1 - y_bar[y == 0])

    loss = np.sum(y_1) + sum(y_0)

    return -loss
    
# Test
y=np.array([0,0,1])

y_pre_good=np.array([0.1,0.1,0.8])
y_pre_bed=np.array([0.8,0.1,0.1])

display(crossentropy(y_pre_good, y))
display(crossentropy(y_pre_bed, y))

# Test I
x = np.array([1, 2])
w1 = np.array([[-1, 1],
               [-0.5, 2.5],
               [0, 3]])
w2 = np.array([[-1, 2, 1],
               [1, 0, -1],
               [2, -3, 0.5]])
print(crossentropy(feed_forward_3(x, w1, w2), np.array([1.0, 0.0, 0.0])))

# Test II
x = np.array([1, 2])
w1 = np.array([[-1, 1],
               [-0.5, 2.5],
               [0, 3]])
w2 = np.array([[-1, 2, 1],
               [1, 0, -1],
               [2, -3, 0.5]])
print(crossentropy(feed_forward_3(x, w1, w2), np.array([0.0, 1.0, 0.0])))

# Test III
x = np.array([1, 2])
w1 = np.array([[-1, 1],
               [-0.5, 2.5],
               [0, 3]])
w2 = np.array([[-1, 2, 1],
               [1, 0, -1],
               [2, -3, 0.5]])
print(crossentropy(feed_forward_3(x, w1, w2), np.array([0.0, 0.0, 1.0])))

0.43386458262986227

4.017383521085972

1.154397247699546e-08
37.97024958896534
41.47024959439894


**Task 6.** Write a method which calculates the error term for network 2:

    error_term(x, w1, w2, y)
    
where x is a numpy array of size 1x2, w1 is a numpy array of size 3x2 of weights from the input layer to the hidden layer, w2 is a numpy array of size 1x3, y is a float.

The solution is extremely simple - it's just the difference between y and the result of feed_forward_2. This task is solely to bring this notion to your attention.

Test it on the 2-3 examples you created for feed_forward_2 and several values of y (remember that the values of y should logically be in the interval [0-1], because you used the sigmoid activation which sends the network's results into this interval, technically turning them into probabilities).

In [7]:
def error_term(x, w1, w2, y):
    # w1 - waights vector for first layout
    # w2 - weights vector for second layout
    # y - walue
    return y - feed_forward_2(x, w1, w2)

# Test I
x = np.array([1, 2])
w1 = np.array([[-1, 1],
               [0.5, -2],
               [0, 3]])
w2 = np.array([-1, 2, 1])
print(error_term(x, w1, w2, 0.5))

# Test II
x = np.array([-1, 1])
w1 = np.array([[2, -1],
               [-5, -2],
               [1, 3]])
w2 = np.array([1, 2, 1])
print(error_term(x, w1, w2, 0.5))

# Test III
x = np.array([1, 2])
w1 = np.array([[-1, 1],
               [0.5, -2],
               [0, 3]])
w2 = np.array([-1, -2, -1])
print(error_term(x, w1, w2, 0.5))

-0.08056501459743926
-0.4444527266801629
0.35658492484751114


**Task 7.** Write a method which calculates the derivative of the sigmoid function on every element of a numpy array:

    sigmoid_derivative(x)
    
where x is a numpy array. The result should have the same size as the input.

Use the formula:

<center>
$$
    \sigma'(x) = \sigma(x) (1 - \sigma(x))
$$
</center>

Verify this formula by differentiating the sigmoid function by hand.

Test the sigmoid_derivative method on a numpy array with several values (e.g. -3, -2, -1, 0, 1, 2, 3).

In [8]:
# EXAMPLE
# def sigmoid(x):
#     return 1 / (1 + np.exp(-x))

# def sigmoid_derivative(x):
#     return sigmoid(x) * (1 - sigmoid(x))

# policzyc na kartce pochodna
def sigmoid(x):
    return 1/(1 + np.exp(-x))

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

# Test
print(sigmoid_derivative(np.array([-3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0])))

[0.04517666 0.10499359 0.19661193 0.25       0.19661193 0.10499359
 0.04517666]


**Task 8.** Write a method for performing the backpropagation step for network 2. The method should have the following interface:

    backpropagate(x, w1, w2, y, alpha)
    
where x is a numpy array of size 1x2, w1 is a numpy array of size 3x2 of weights from the input layer to the hidden layer, w2 is a numpy array of size 1x3, y is a float, alpha is the learning rate. The method should return a tuple with updated matrices w1 and w2.

Test it on 2-3 examples. Test several values of the learning rate alpha on the same input to see how it affects the update step.

In [16]:
# EXAMPLE
# def backpropagate(x, w1, w2, y, alpha):
#     lin1 = np.matmul(w1, x)
#     h = sigmoid(lin1)
#     lin2 = np.matmul(w2, h)
#     y_net = sigmoid(lin2)
#     error_term = y - y_net
    
#     w1 += 2 * alpha * error_term * sigmoid_derivative(lin2) \
#         * np.matmul((w2 * sigmoid_derivative(lin1)).reshape(3, 1), (x.reshape(1, 2)))
#     w2 += 2 * alpha * error_term * sigmoid_derivative(lin2) * h
    
    
#     return w1, w2

def sigmoid(x):
    return 1/(1 + np.exp(-x))

# pochodna funckji sigmoid: e' = e(x) * (1 - e(x))
def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

# obliczenie X'tej warstwy sieci
def feed_forward(w_X, x):
    layer = np.dot(w_X, x)
    h = sigmoid(layer)

    return layer, h

def error_term(y, feed_foward):
    return y - feed_foward

def backpropagate(x, w1, w2, y, alpha):
    layer_1, h = feed_forward(w1, x)   # calculate neurons values on first layer : w^(1) * x
    layer_2, y_predicted = feed_forward(w2, h)   # calculate neuron values on second layer: w^(2) * h
    
    error_term_score = error_term(y, y_predicted)   # calculate error
    
    # 2 * alfa * [result generated in last layer] * [previous layer result] (continue from n, n-1, ... to 1 layer )
    # -2 * e(w*x) * e'(w^x * h) * h
    #
    w_x_derivative = np.dot(
                            (w2 * sigmoid_derivative(layer_1)).reshape(3, 1),
                            (x.reshape(1, 2))
                           )
    
    w1 += 2 * alpha * error_term_score * sigmoid_derivative(layer_2) * w_x_derivative
    
    w2 += 2 * alpha * error_term_score * sigmoid_derivative(layer_2) * h
    
    
    return w1, w2


# Test
x = np.array([1.0, 2.0])
w1 = np.array([[-1.0, 1.0],
               [0.5, -2],
               [0.0, 3.0]])
w2 = np.array([-1.0, 2.0, 1.0])

print(feed_forward_2(x, w1, w2))
print()

y = -0.78
alpha = 0.1
for _ in range(10):
    w1, w2 = backpropagate(x, w1, w2, y, alpha)
    print(w1)
    print(w2)
    print(feed_forward_2(x, w1, w2))
    print()

0.5805650145974393

[[-9.86972092e-01  1.02605582e+00]
 [ 4.96229289e-01 -2.00754142e+00]
 [-1.63435940e-04  2.99967313e+00]]
[-1.04844143  1.99805771  0.9339018 ]
0.5521595151335615

[[-9.73804803e-01  1.05239039e+00]
 [ 4.92549709e-01 -2.01490058e+00]
 [-3.15319668e-04  2.99936936e+00]]
[-1.09743683  1.99616156  0.86818172]
0.5231787430565403

[[-9.60643225e-01  1.07871355e+00]
 [ 4.88984236e-01 -2.02203153e+00]
 [-4.54768343e-04  2.99909046e+00]]
[-1.14659236  1.99432343  0.80332383]
0.49405460583983957

[[-9.47629028e-01  1.10474194e+00]
 [ 4.85552880e-01 -2.02889424e+00]
 [-5.81257317e-04  2.99883749e+00]]
[-1.19550609  1.9925537   0.73978796]
0.465228374281893

[[-9.34890815e-01  1.13021837e+00]
 [ 4.82271500e-01 -2.03545700e+00]
 [-6.94643334e-04  2.99861071e+00]]
[-1.24379456  1.99086061  0.6779813 ]
0.43711299579491325

[[-9.22536617e-01  1.15492677e+00]
 [ 4.79151098e-01 -2.04169780e+00]
 [-7.95145920e-04  2.99840971e+00]]
[-1.29111617  1.98924993  0.61823694]
0.4100612926570

**Task 9.** Write a method for performing the backpropagation step for network 2 but without the activation function on the output neuron.

In [20]:
# EXAMPLE
# def backpropagate_2(x, w1, w2, y, alpha):
#     lin1 = np.matmul(w1, x)
#     h = sigmoid(lin1)
#     lin2 = np.matmul(w2, h)
#     y_net = lin2
#     error_term = y - y_net
    
#     w1 += 2 * alpha * error_term \
#         * np.matmul((w2 * sigmoid_derivative(lin1)).reshape(3, 1), (x.reshape(1, 2)))
#     w2 += 2 * alpha * error_term * h
    
#     return w1, w2

def sigmoid(x):
    return 1/(1 + np.exp(-x))

# pochodna funckji sigmoid: e' = e(x) * (1 - e(x))
def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

# obliczenie X'tej warstwy sieci
def feed_forward(w_X, x):
    layer = np.dot(w_X, x)
    h = sigmoid(layer)

    return layer, h

def error_term(y, feed_foward):
    return y - feed_foward

def backpropagate_2(x, w1, w2, y, alpha):
    layer_1, h = feed_forward(w1, x)   # calculate neurons values on first layer : w^(1) * x
    
    layer_2 = np.dot(w2, h)   # calculate neuron values on second layer: w^(2) * h
    y_predicted = layer_2
    
    error_term_score = error_term(y, y_predicted)   # calculate error
    
    # 2 * alfa * [result generated in last layer] * [previous layer result] (continue from n, n-1, ... to 1 layer )
    # -2 * e(w*x) * e'(w^x * h) * h
    #
    w_x_derivative = np.dot(
                            (w2 * sigmoid_derivative(layer_1)).reshape(3, 1),
                            (x.reshape(1, 2))
                           )
    
    w1 += 2 * alpha * error_term_score * w_x_derivative
    
    w2 += 2 * alpha * error_term_score * h
    
    
    return w1, w2


# Test
x = np.array([1.0, 2.0])
w1 = np.array([[-1.0, 1.0],
               [0.5, -2],
               [0.0, 3.0]])
w2 = np.array([-1.0, 2.0, 1.0])

print(feed_forward_2(x, w1, w2))
print()

y = -1.78
alpha = 0.1
for _ in range(10):
    w1, w2 = backpropagate_2(x, w1, w2, y, alpha)
    print(w1)
    print(w2)
    print(feed_forward_2(x, w1, w2))
    print()

0.5805650145974393

[[-9.17222709e-01  1.16555458e+00]
 [ 4.76041492e-01 -2.04791702e+00]
 [-1.03844642e-03  2.99792311e+00]]
[-1.3077893   1.987659    0.58002237]
0.3961531435802377

[[-8.61311534e-01  1.27737693e+00]
 [ 4.62319805e-01 -2.07536039e+00]
 [-1.42915648e-03  2.99714169e+00]]
[-1.52633568  1.98057063  0.30900161]
0.282377597489662

[[-8.27376672e-01  1.34524666e+00]
 [ 4.54329777e-01 -2.09134045e+00]
 [-1.55923318e-03  2.99688153e+00]]
[-1.66947481  1.97643552  0.13996329]
0.2211118293044698

[[-8.07154165e-01  1.38569167e+00]
 [ 4.49611811e-01 -2.10077638e+00]
 [-1.59547148e-03  2.99680906e+00]]
[-1.75964213  1.97399105  0.03606263]
0.18817010420338567

[[-7.95079013e-01  1.40984197e+00]
 [ 4.46798104e-01 -2.10640379e+00]
 [-1.60117471e-03  2.99679765e+00]]
[-1.81542842  1.9725322  -0.02738994]
0.16989497129212758

[[-7.87835073e-01  1.42432985e+00]
 [ 4.45109251e-01 -2.10978150e+00]
 [-1.59853757e-03  2.99680292e+00]]
[-1.84963696  1.9716562  -0.06601905]
0.1594512143048