# Linear regression

$n=[\text{number of features}]$  
$m=[\text{sample size}]$  
$x=[\text{input value}]$  
$w=[\text{input weight}]$

$\displaystyle h=\sum_{i=1}^{n}w_ix_i$  
$\displaystyle l=\frac{1}{m}\sum_{i=1}^{m}(h(x_i)-y_i)^2$  
$\displaystyle l'_w=\frac{2}{m}\sum_{i=1}^{m}(h(x_i)-y_i)x_{ij}, \quad j \in 1,...,n$  
$\displaystyle w_j=w_j - \frac{\alpha}{m}\sum_{i=1}^{m}(h(x_i)-y_i)x_{ij}, \quad j \in 1,...,n$

### vectorization

$H=X \cdot W$

$\nabla{l}=\frac{1}{m}(X^T\cdot(H(X)-Y))$

$W = W - \alpha\nabla{l}$

In [66]:
import numpy as np

W = np.array([0,0,0])[np.newaxis].T

X = np.array([
        [1,3,2],
        [1,4,5],
        [1,7,4],
        [1,2,4]
    ])

Y = np.array([4,5,2,1])[np.newaxis].T

m = Y.shape[0]
alpha = 0.01

def h():
    return X.dot(W)

def gradient():
    return (X.T).dot(h() - Y) / m

def gradient_descent():
    return W - alpha * gradient()

def loss():
    D = h() - Y
    return  float(D.T.dot(D) / m)

#X = np.insert(X, 0, 1, axis=1)

print(W)
print(loss())
W = gradient_descent()
print(W)
print(loss())
W = gradient_descent()
print(W)
print(loss())
W = gradient_descent()
print(W)
print(loss())

[[0]
 [0]
 [0]]
11.5
[[ 0.03  ]
 [ 0.12  ]
 [ 0.1125]]
6.8560703125
[[ 0.05068125]
 [ 0.1979625 ]
 [ 0.18811875]]
4.824798589638672
[[ 0.06520148]
 [ 0.24817416]
 [ 0.23934591]]
3.9340881937891554


# Logistic regression

$\sigma=\displaystyle\frac{1}{1+e^{-z}}$

$\sigma'=\sigma(z)(1-\sigma(z))$

$h = \displaystyle\sum_{i=1}^{n}\sigma(w_{i}x_i)$

$l = \displaystyle\frac{1}{m}\sum_{i=1}^{m}-y\log(h(x_i)) - (1 - y)\log(1-h(x_i))$

$l_j' = \displaystyle\frac{1}{m}\sum_{i=1}^{m}(h(x_i)-y_i)x_{ij}, \quad j \in 1,...,n$

$\displaystyle w_j = w_j - \frac{\alpha}{m}\sum_{i=1}^{m}(h(x_i)-y_i)x_{ij}, \quad j \in 1,...,n$

### Vectorization

$H(X)=\displaystyle\frac{1}{1+e^{-X\cdot{W}}}$

$\nabla{l}=\frac{1}{m}(X^T\cdot(H(X)-Y))$

$W = W - \alpha\nabla{l}$

In [22]:
# Assumes data have been downloaded extracted and saved to pickle

import os
import pickle

filename_train_images = 'data/mnist_train_images'
filename_train_labels = 'data/mnist_train_labels'

def getPickleFilename(filename):
    return filename + '.pickle'

def loadFromPickle(filename):
    if not os.path.isfile(getPickleFilename(filename)):
        return np.array([])
    file = open(getPickleFilename(filename), 'rb')
    data = pickle.load(file)
    file.close()
    return data

images = loadFromPickle(filename_train_images)
labels = loadFromPickle(filename_train_labels)

print(images.shape)
print(labels.shape)

(60000, 28, 28)
(60000,)
(60000, 784)
(60000, 1)


In [166]:
def h(W, X):
    return 1 / (1 + np.exp(-X.dot(W)))

def gradient(W, Y, X):
    #np.sum((h() - Y) * X, axis=0, keepdims=True).T
    return (X.T).dot(h(W, X) - Y) / m

def gradient_descent(W, Y, X):
    return W - alpha * gradient(W, Y, X)

def loss(W, Y, X):
    # Wrong cost function! Needs log!
    D = h(W, X) - Y
    return  float(D.T.dot(D) / m)

#m = images.shape[0]
m = 10
n = images.shape[1]**2
X = images[0:m].reshape(m, n) # m X n
X = np.insert(X, 0, 1, axis=1)
Y = labels[0:m][np.newaxis].T # c X m X 1
W = np.zeros(n + 1)[np.newaxis].T # c X n X 1
alpha = 0.000001

# Need Y for each classes
c = 10 # amount of classes
Ys = np.zeros((c,m,1))
for i in range(0, m):
    Ys[Y[i][0]][i][0] = 1
  
Ws = np.zeros((c,n+1,1))

iterations = 5

losses = np.zeros(c)
for i in range(0, c):
    for j in range(0, iterations):
        Ws[i] = gradient_descent(Ws[i], Ys[i], X)
    losses[i] = loss(Ws[i], Ys[i], X)
    
print(losses)
Hs = np.zeros(10)
for i in range(0, 1):
    print(h(Ws, X[0]))
    
print(Y[0:10])

[ 0.05266539  0.07547815  0.07102239  0.08753784  0.06196712  0.06691913
  0.01071672  0.01071672  0.01071672  0.07081272]
[[ 0.13484228]
 [ 0.16009833]
 [ 0.10420013]
 [ 0.1775378 ]
 [ 0.10109601]
 [ 0.32565459]
 [ 0.05538395]
 [ 0.05538395]
 [ 0.05538395]
 [ 0.08104953]]
[[5]
 [0]
 [4]
 [1]
 [9]
 [2]
 [1]
 [3]
 [1]
 [4]]


# Neural networks
Neural networks offers an alternate way to perform machine learning when we have complex hypotheses with many features.

$
X = 
\begin{bmatrix}
x_{11} & \cdots & x_{1n} \\
\vdots & \ddots & \\
x_{m1} & & x_{mn} \\
\end{bmatrix}
$
$
,\qquad W_i = [\text{number of rows in layer i}]\,\times\,n, \qquad i \in 1,...,l
$ 

$
\begin{bmatrix}
x_1 \\
\vdots \\
x_n
\end{bmatrix}
$
=>
$
\begin{bmatrix}
a_1 \\
\vdots \\
a_{n1}
\end{bmatrix}
$
=>
$
h=\displaystyle\sum_{i=1}^{n1}w2_{i}a_{i}
$

In [167]:
def h(W, X):
    return 1 / (1 + np.exp(-X.dot(W)))

def gradient(W, Y, X):
    #np.sum((h() - Y) * X, axis=0, keepdims=True).T
    return (X.T).dot(h(W, X) - Y) / m

def gradient_descent(W, Y, X):
    return W - alpha * gradient(W, Y, X)

def loss(W, Y, X):
    D = h(W, X) - Y
    return  float(D.T.dot(D) / m)

#m = images.shape[0]
m = 10
n = images.shape[1]**2
X = images[0:m].reshape(m, n) # m X n
X = np.insert(X, 0, 1, axis=1)
Y = labels[0:m][np.newaxis].T # c X m X 1
W = np.zeros(n + 1)[np.newaxis].T # c X n X 1
alpha = 0.000001

# Need Y for each classes
c = 10 # amount of classes
Ys = np.zeros((c,m,1))
for i in range(0, m):
    Ys[Y[i][0]][i][0] = 1
  
Ws = np.zeros((c,n+1,1))

iterations = 5

losses = np.zeros(c)
for i in range(0, c):
    for j in range(0, iterations):
        Ws[i] = gradient_descent(Ws[i], Ys[i], X)
    losses[i] = loss(Ws[i], Ys[i], X)
    
print(losses)
Hs = np.zeros(10)
for i in range(0, 1):
    print(h(Ws, X[0]))
    
print(Y[0:10])

[ 0.05266539  0.07547815  0.07102239  0.08753784  0.06196712  0.06691913
  0.01071672  0.01071672  0.01071672  0.07081272]
[[ 0.13484228]
 [ 0.16009833]
 [ 0.10420013]
 [ 0.1775378 ]
 [ 0.10109601]
 [ 0.32565459]
 [ 0.05538395]
 [ 0.05538395]
 [ 0.05538395]
 [ 0.08104953]]
[[5]
 [0]
 [4]
 [1]
 [9]
 [2]
 [1]
 [3]
 [1]
 [4]]
