In [2]:
import numpy as np

import cPickle,gzip
import matplotlib.pyplot as plt

import theano
from theano import tensor as T
from theano import function,grad,pp,shared

In [3]:
# Load the pickled dataset, which unpacks into three splits: train, valid, test.
# NOTE(review): unpickling can execute arbitrary code -- only load a copy of
# mnist.pkl.gz obtained from a trusted source.
# The `with` block guarantees the file is closed even if unpickling raises.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = cPickle.load(f)
def share_dataset(data_xy):
    """Copy an (inputs, labels) pair into Theano shared variables.

    Parameters
    ----------
    data_xy : pair
        Two-element sequence ``(data_x, data_y)``; each element must be
        convertible by ``np.array``.

    Returns
    -------
    tuple
        ``(shared_x, y_int)`` where ``shared_x`` is a shared variable holding
        the inputs as ``floatX`` and ``y_int`` is a symbolic ``int32`` cast of
        the shared labels.
    """
    features, labels = data_xy
    float_type = theano.config.floatX
    shared_x = shared(np.array(features, dtype=float_type))
    # Labels are stored as floatX too and only cast to int32 symbolically
    # (presumably so they can sit in the same device storage -- TODO confirm).
    shared_y = shared(np.array(labels, dtype=float_type))
    labels_as_int = T.cast(shared_y, 'int32')
    return shared_x, labels_as_int

# Wrap every split in shared variables, in train / valid / test order.
(train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y) = map(
    share_dataset, (train_set, valid_set, test_set)
)

In [4]:
class LogisticRegression(object):
    """Softmax classification layer: ``softmax(dot(x, W) + b)``.

    Attributes
    ----------
    W, b : theano shared variables
        Weight matrix of shape ``(n_in, n_out)`` and bias vector of shape
        ``(n_out,)``, both initialised to zeros.
    p_y_given_x : symbolic expression
        Softmax output computed from ``x``.
    y_pred : symbolic expression
        ``argmax`` of ``p_y_given_x`` along axis 1.
    params : list
        ``[W, b]``.
    x : symbolic expression
        The input the layer was built on.
    """

    def __init__(self,x,n_in,n_out):
        """Build the layer on symbolic input ``x`` with ``n_in`` inputs and ``n_out`` outputs."""
        float_type = theano.config.floatX

        self.W = theano.shared(
            value=np.zeros((n_in, n_out), dtype=float_type),
            name='W',
            borrow=True,
        )
        self.b = theano.shared(
            value=np.zeros((n_out,), dtype=float_type),
            name='b',
            borrow=True,
        )

        self.x = x
        self.params = [self.W, self.b]

        scores = T.dot(x, self.W) + self.b
        self.p_y_given_x = T.nnet.softmax(scores)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    def negative_log_likelihood(self,y):
        """Return the mean negative log-probability assigned to the labels ``y``."""
        log_probs = T.log(self.p_y_given_x)
        # Pick, for each row, the log-probability at that row's label index.
        row_index = T.arange(y.shape[0])
        return -T.mean(log_probs[row_index, y])

    def errors(self,y):
        """Return the fraction of entries where ``y_pred`` differs from ``y``."""
        mismatches = T.neq(self.y_pred, y)
        return T.mean(mismatches)

In [5]:
class HiddenLayer(object):
    """Fully connected layer with tanh activation: ``tanh(dot(x, W) + b)``.

    Attributes
    ----------
    W, b : theano shared variables
        Weight matrix of shape ``(n_in, n_out)`` and bias vector of shape
        ``(n_out,)``.
    output : symbolic expression
        The layer activation computed from ``x``.
    params : list
        ``[W, b]``.
    x : symbolic expression
        The input the layer was built on.
    """

    def __init__(self,x,n_in,n_out,rng=None):
        """Build the layer.

        Parameters
        ----------
        x : symbolic expression
            Layer input.
        n_in, n_out : int
            Number of input and output units.
        rng : object with a ``uniform(low, high, size)`` method, optional
            Random source for the weights, e.g. ``np.random.RandomState(seed)``
            for reproducible runs. Defaults to the global ``np.random`` state,
            which is what the layer always used before this parameter existed.
        """
        if rng is None:
            rng = np.random

        self.x = x

        # Weights are drawn uniformly from [-bound, bound],
        # with bound = sqrt(6 / (n_in + n_out)).
        bound = np.sqrt(6. / (n_in + n_out))
        W_values = np.asarray(
            rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
            dtype=theano.config.floatX
        )
        b_values = np.zeros((n_out,), dtype=theano.config.floatX)

        self.W = shared(value=W_values, name='W', borrow=True)
        self.b = shared(value=b_values, name='b', borrow=True)

        self.output = T.tanh(T.dot(x, self.W) + self.b)
        self.params = [ self.W, self.b ]
    
class MLP(object):
    """One-hidden-layer perceptron: a HiddenLayer feeding a LogisticRegression layer.

    Attributes
    ----------
    hidden_layer : HiddenLayer
        Maps ``x`` (``n_in`` units) to ``n_hidden`` units.
    logistic_regression_layer : LogisticRegression
        Maps the hidden output to ``n_out`` classes.
    L1 : symbolic expression
        Sum of absolute values of both weight matrices (biases excluded).
    L2_sqr : symbolic expression
        Sum of squares of both weight matrices (biases excluded).
    negative_log_likelihood, errors : callables
        Forwarded from the logistic regression layer.
    params : list
        Hidden-layer parameters followed by output-layer parameters.
    x : symbolic expression
        The input the model was built on.
    """

    def __init__(self,x,n_in,n_hidden,n_out):
        """Build the two-layer model on symbolic input ``x``."""
        self.hidden_layer = HiddenLayer(
            x=x,n_in=n_in,n_out=n_hidden)
        self.logistic_regression_layer = LogisticRegression(
            x=self.hidden_layer.output,
            n_in=n_hidden,n_out=n_out)

        # L1 penalty term over the weight matrices.
        self.L1 = (
            abs(self.hidden_layer.W).sum() +
            abs(self.logistic_regression_layer.W).sum()
        )

        # Squared L2 penalty term. Squares are already non-negative, so no
        # abs() is needed around them.
        self.L2_sqr = (
            (self.hidden_layer.W ** 2).sum() +
            (self.logistic_regression_layer.W ** 2).sum()
        )

        # Loss and error rate are those of the output layer.
        self.negative_log_likelihood = (
            self.logistic_regression_layer.negative_log_likelihood
        )
        self.errors = self.logistic_regression_layer.errors

        self.params = self.hidden_layer.params + self.logistic_regression_layer.params
        self.x = x

In [16]:

# --- Hyperparameters -------------------------------------------------------
alpha = 0.1         # step size used in the gradient-descent updates below
batch_size = 600    # number of examples per minibatch
n_epochs = 1000     # NOTE(review): defined but not referenced in this cell
n_hidden = 500      # number of hidden units
L1_reg = 0.0        # weight of the L1 penalty in the cost
L2_reg = 0.0001     # weight of the squared-L2 penalty in the cost

# Whole minibatches per split; floor division drops any trailing partial batch.
n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

# --- Symbolic inputs -------------------------------------------------------
i = T.lscalar('i')   # minibatch index
x = T.matrix('x')    # input rows
y = T.ivector('y')   # integer labels

# Use the n_hidden hyperparameter rather than repeating the literal, so that
# changing it above actually changes the model.
classifier = MLP(x=x, n_in=28*28, n_hidden=n_hidden, n_out=10)

# Training objective: negative log-likelihood plus weighted penalties.
cost = (
    classifier.negative_log_likelihood(y)
    + L1_reg * classifier.L1
    + L2_reg * classifier.L2_sqr
)

# Error rate on test minibatch i; x and y are substituted with slices of the
# shared test data.
test_model = function(
    inputs=[i],
    outputs=classifier.errors(y),
    givens={
        x: test_set_x[i * batch_size: (i + 1) * batch_size],
        y: test_set_y[i * batch_size: (i + 1) * batch_size]
    }
)

# One gradient-descent step per parameter: param <- param - alpha * d(cost)/d(param).
gparams = [T.grad(cost, param) for param in classifier.params]
updates = [
    (param, param - alpha * gparam)
    for param, gparam in zip(classifier.params, gparams)
]

# Returns the cost on training minibatch i and applies the updates.
train_model = theano.function(
    inputs=[i],
    outputs=cost,
    updates=updates,
    givens={
        x: train_set_x[i * batch_size: (i + 1) * batch_size],
        y: train_set_y[i * batch_size: (i + 1) * batch_size]
    }
)

In [17]:
def test_error_rates():
    """Run test_model on every test minibatch and return the per-batch error rates."""
    rates = []
    for batch in range(n_test_batches):
        rates.append(test_model(batch))
    return rates


# Error rate before any training.
error_rates = test_error_rates()
print(np.mean(error_rates))

# Ten passes over the training minibatches.
for epoch in range(10):
    costs = []  # reset every epoch, so only the last epoch's costs remain afterwards
    for i in range(n_train_batches):
        costs.append(train_model(i))

# Error rate after training.
error_rates = test_error_rates()
print(np.mean(error_rates))


0.902395833333
0.0783333333333
