In [2]:
import numpy as np
import os
import pickle

Load data from pickle to memory.

In [3]:
# Assumes the data have already been downloaded, extracted and saved to pickle.
# These are base paths: getPickleFilename() appends the '.pickle' extension.
filename_train_images, filename_train_labels = (
    'data/mnist_train_images',
    'data/mnist_train_labels',
)

def getPickleFilename(filename):
    """Return the pickle path for `filename`, i.e. `filename` with '.pickle' appended."""
    extension = '.pickle'
    return filename + extension

def loadFromPickle(filename):
    """Load an object that was previously saved with pickle.

    Args:
        filename: Path of the pickle file without the '.pickle' extension.

    Returns:
        The unpickled object, or an empty NumPy array when the file does
        not exist.
    """
    path = getPickleFilename(filename)
    if not os.path.isfile(path):
        return np.array([])
    # The context manager closes the file even when unpickling raises.
    # NOTE: pickle can execute arbitrary code; only load files you created.
    with open(path, 'rb') as file:
        return pickle.load(file)

# Load the training images first, then the matching labels.
images, labels = map(loadFromPickle, (filename_train_images, filename_train_labels))

# Sanity check: print the shape of each loaded array.
for loaded in (images, labels):
    print(loaded.shape)

(60000, 28, 28)
(60000,)


Neural networks offer an alternative way to perform machine learning when we have complex hypotheses with many features.

$
X = 
\begin{bmatrix}
x_{11} & \cdots & x_{1n} \\
\vdots & \ddots & \vdots \\
x_{m1} & \cdots & x_{mn} \\
\end{bmatrix}
$
$
,\qquad W_i = [\text{number of rows in layer } i]\,\times\,n, \qquad i \in \{1,\dots,l\}
$ 

$
\begin{bmatrix}
x_1 \\
\vdots \\
x_n
\end{bmatrix}
$
=>
$
\begin{bmatrix}
a_1 \\
\vdots \\
a_{n1}
\end{bmatrix}
$
=>
$
h=\displaystyle\sum_{i=1}^{n1}w2_{i}a_{i}
$

$
Y = 
\begin{bmatrix}
y_{11} & \cdots & y_{1c} \\
\vdots & \ddots & \vdots \\
y_{m1} & \cdots & y_{mc} \\
\end{bmatrix}
$

In [96]:
def h(W, X):
    """Apply the logistic sigmoid element-wise to the linear scores X . W^T.

    W holds the weights and X the inputs. In this notebook W is either a
    vector of n + 1 weights or a (classes, n + 1) matrix, and X is a single
    sample or a matrix with one sample per row.

    Returns the activations, each in the range [0, 1].
    """
    scores = X.dot(W.T)
    return 1 / (1 + np.exp(-scores))

def gradient(W, Y, X):
    """Return X^T . (h(W, X) - Y) divided by the module-level sample count `m`.

    NOTE(review): the normalisation uses the global `m`, not the number of
    rows in X, so `m` must match the data passed in - confirm at call sites.
    """
    residual = h(W, X) - Y
    return X.T.dot(residual) / m

def gradient_descent(W, Y, X):
    """Take one gradient-descent step and return the updated weights.

    The step size is the module-level learning rate `alpha`. A new array is
    returned; W is not modified in place.
    """
    step = alpha * gradient(W, Y, X)
    return W - step

def loss(W, Y, X):
    """Return the sum of squared prediction errors divided by the global `m`.

    NOTE(review): float() needs a single-element result, so this presumably
    expects one prediction per sample (vector W and Y) - confirm.
    """
    err = h(W, X) - Y
    total = err.T.dot(err)
    return float(total / m)

# --- Training-set setup ---
# Number of training samples to use; set to images.shape[0] for the full set.
m = 1
# Features per sample: every pixel of one image (also works for non-square images).
n = int(np.prod(images.shape[1:]))
X = images[0:m].reshape(m, n)   # m x n, one flattened image per row
X = np.insert(X, 0, 1, axis=1)  # prepend a constant-1 bias column -> m x (n + 1)
Y = labels[0:m]
W = np.zeros(n + 1)  # single-output weight vector; not used by the loop below

# One-hot encode the labels: row i of Ys gets a 1 in column Y[i].
c = 10  # number of classes
Ys = np.zeros((m, c))
Ys[np.arange(m), Y] = 1

# TODO random initialize weights
Ws = np.zeros((c, n + 1))  # one row of weights per class

num_layers = 1
alpha = 0.0001  # learning rate, read by gradient_descent() and the loop below

# --- One pass of per-sample gradient descent over the m samples ---
for i in range(0, m):
    O = h(Ws, X[i])             # forward pass: one sigmoid activation per class
    diff = O - Ys[i]            # per-class prediction error
    der = np.outer(diff, X[i])  # c x (n + 1) update direction, same shape as Ws
    Ws = Ws - alpha * der

print(h(Ws, X))
print(Ys)

[[  2.80105469e-093   1.45716829e-054   8.64392802e-184   2.18218506e-072
    6.44685898e-081   1.42968779e-023   1.03373349e-129   1.03373349e-129
    1.03373349e-129   3.62580867e-167]
 [  1.00000000e+000   8.68885068e-061   1.53168433e-086   1.19156404e-019
    1.13063052e-052   5.64347268e-218   1.21569638e-076   1.21569638e-076
    1.21569638e-076   7.71015263e-100]
 [  1.03907275e-165   3.28876600e-026   5.61134630e-061   5.13492805e-009
    1.00000000e+000   1.13999873e-025   1.44752780e-023   1.44752780e-023
    1.44752780e-023   9.35352574e-043]
 [  3.20429664e-160   1.00000000e+000   1.68198689e-062   3.01574405e-017
    1.41647651e-022   1.01702531e-028   1.38206431e-035   1.38206431e-035
    1.38206431e-035   3.25523877e-048]
 [  4.99821195e-056   9.38046641e-102   1.94789391e-094   5.66715514e-078
    7.16273785e-118   3.83515357e-063   3.21241794e-042   3.21241794e-042
    3.21241794e-042   1.00000000e+000]
 [  2.24663431e-083   1.57814237e-037   1.00000000e+000   3.54575