In [7]:
import numpy as np

# 2x3 example matrix and its 3x2 transpose
a = np.array([[4, 2, 3], [2, 0, 1]])
b = a.T

# matrix products in both orders: a @ b is 2x2, b @ a is 3x3
c = a @ b
d = b @ a

print(c, d, sep="\n")

# element-wise multiplication requires a and b to have the same shape (or shapes that NumPy can broadcast together)

[[29 11]
 [11  5]]
[[20  8 14]
 [ 8  4  6]
 [14  6 10]]


In [8]:
#deep learning is all about matrix multiplication and element-wise multiplication

In [9]:
# Moore-Penrose pseudo-inverse of `a` (still the 2x3 matrix from the earlier cell)
pinv_a = np.linalg.pinv(a)

# Show the pseudo-inverse, then both products:
# a @ pinv_a comes out as the 2x2 identity (up to rounding),
# while pinv_a @ a is a 3x3 matrix that is not the identity.
print(pinv_a, a @ pinv_a, pinv_a @ a, sep="\n")

[[-0.08333333  0.58333333]
 [ 0.41666667 -0.91666667]
 [ 0.16666667 -0.16666667]]
[[ 1.00000000e+00  7.77156117e-16]
 [-2.77555756e-17  1.00000000e+00]]
[[ 0.83333333 -0.16666667  0.33333333]
 [-0.16666667  0.83333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]]


In [10]:
import numpy as np
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])

# The element-wise product does not depend on operand order:
# np.multiply and the * operator print the same matrix in all four forms.
print(np.multiply(a, b), np.multiply(b, a), a * b, b * a, sep="\n")

[[ 5 12]
 [21 32]]
[[ 5 12]
 [21 32]]
[[ 5 12]
 [21 32]]
[[ 5 12]
 [21 32]]


In [11]:
# Unlike the element-wise product above, matrix multiplication is not commutative: in general AB is different from BA

In [12]:
import numpy as np

# check this out:
# https://www.analyticsvidhya.com/blog/2017/05/neural-network-from-scratch-in-python-and-r/
# Input array
# One row per training sample, one column per input feature (3 samples x 4 features)
X = np.array([
    [1, 0, 1, 0],
    [1, 0, 1, 1],
    [0, 1, 0, 1],
])

# Target output: one value per row of X, stored as a 3x1 column
y = np.array([[1], [1], [0]])


#Sigmoid Function
def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^(-x)) element-wise.

    Accepts a scalar or a NumPy array and returns a value of the same shape.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator


#Derivative of Sigmoid Function
def derivatives_sigmoid(x):
    """Return x * (1 - x), element-wise.

    This equals the sigmoid's derivative when `x` is already a sigmoid
    *output* (an activation), which is how the training loop below calls it;
    it is not meant to be applied to the pre-activation input.
    """
    complement = 1 - x
    return x * complement



#Variable initialization
SEED = 42 #Fixed seed so a fresh Restart & Run All reproduces the same weights and output
epoch=5000 #Setting training iterations
lr=0.1 #Setting learning rate
inputlayer_neurons = X.shape[1] #number of features in data set
hiddenlayer_neurons = 3 #number of neurons in the hidden layer
output_neurons = 1 #number of neurons at output layer

#weight and bias initialization, drawn uniformly from [0, 1) with a seeded generator
rng = np.random.default_rng(SEED)
# hidden layer: weights (features x hidden neurons) and bias (1 x hidden neurons)
wh=rng.uniform(size=(inputlayer_neurons,hiddenlayer_neurons))
bh=rng.uniform(size=(1,hiddenlayer_neurons))
# output layer: weights (hidden neurons x outputs) and bias (1 x outputs)
wout=rng.uniform(size=(hiddenlayer_neurons,output_neurons))
bout=rng.uniform(size=(1,output_neurons))


for i in range(epoch):
    #Forward Propagation: input -> hidden activations -> output
    hidden_layer_input1=np.dot(X,wh)
    hidden_layer_input=hidden_layer_input1 + bh
    hiddenlayer_activations = sigmoid(hidden_layer_input)
    output_layer_input1=np.dot(hiddenlayer_activations,wout)
    output_layer_input= output_layer_input1+ bout
    output = sigmoid(output_layer_input)

    #Backpropagation
    # D is (target - prediction), i.e. the negative of the error gradient with
    # respect to the output, which is why every update below uses += rather than -=.
    D = y-output
    # derivatives_sigmoid expects activations (sigmoid outputs), not pre-activations
    slope_output_layer = derivatives_sigmoid(output)
    slope_hidden_layer = derivatives_sigmoid(hiddenlayer_activations)
    d_output = D * slope_output_layer
    # push the output delta back through the output weights to the hidden layer
    Error_at_hidden_layer = d_output.dot(wout.T)
    d_hiddenlayer = Error_at_hidden_layer * slope_hidden_layer
    # weight updates accumulate over all samples via the matrix product;
    # bias updates sum the deltas over the sample axis (axis=0)
    wout += hiddenlayer_activations.T.dot(d_output) *lr
    bout += np.sum(d_output, axis=0,keepdims=True) *lr
    wh += X.T.dot(d_hiddenlayer) *lr
    bh += np.sum(d_hiddenlayer, axis=0,keepdims=True) *lr

# predictions from the final forward pass, one row per training sample
print(output)

[[0.98263475]
 [0.96727247]
 [0.04497071]]


In [13]:
# A nonconvex error function has many dips/valleys (local minima), so it can be hard to find the global minimum when adjusting the weights/biases to minimize the error


How do we update the weights to minimize the error?

First we should define the cost function. For our example here, the cost function is the (halved) sum of squared errors, which is proportional to the MSE:

$E= \frac{1}{2} ({\bf y}_t - {\bf y}_p)^T ({\bf y}_t - {\bf y}_p)$

We update the weights (${\bf W}_i$ and ${\bf W}_h$) so that the error, $E$, is minimized. The most popular algorithm is Gradient Descent:

${\bf W}_h = {\bf W}_h - \eta \, {\partial E}/{\partial {\bf W}_h}$

For our example above we can show that (note the minus sign, which turns the update into the `+=` used in the code):

${\partial E}/{\partial {\bf W}_h} = -({\bf y}_t - {\bf y}_p) {\bf y}_p (1 - {\bf y}_p) {\bf h}$

where ${\bf h} = \sigma({\bf W}_i {\bf x}_i + {\bf b}_i)$

In the code above:

$D = {\bf y}_t - {\bf y}_p$

${\bf y}_p (1 - {\bf y}_p)$ = slope_output_layer

$\bf {h}$ = hiddenlayer_activations

