In [None]:
import os, gc
## Make the local ownCloud folder the working directory of the kernel, so that
## relative paths used by later cells are resolved against it.
## NOTE(review): presumably this is also where the local `flex` module imported
## below lives -- confirm; the absolute path is specific to one machine.
os.chdir( '/Users/user/ownCloud/' )

In [None]:
import warnings ; warnings.simplefilter( "ignore" )
import numpy as np, pandas as pd, flex as fl
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
## Load the MNIST digits from a local NumPy archive holding the arrays
## 'labels' and 'data'.
mnist_npz = '/Users/user/study_notes/year_14_15/spring_2015/machine_learning/data/mldata/mnist_scikit.npz'
assert os.path.exists( mnist_npz ), "MNIST archive not found: %s" % ( mnist_npz, )
with np.load( mnist_npz, 'r' ) as npz :
    ## Labels are cast to integers with the builtin `int`: the `np.int` alias
    ## was just another name for it and has been removed from NumPy 1.24.
    mnist_labels = np.asarray( npz[ 'labels' ], dtype = int )
    ## NOTE(review): the division presumably maps 8-bit intensities to [0, 1] -- confirm.
    mnist_data = npz[ 'data' ] / 255.0
## Sorted array of the distinct class labels.
mnist_classes = np.unique( mnist_labels )

In [None]:
from sklearn import *

In [None]:
## The random generator shared by the split and the random forest below.
## With `seed = None` it is initialised unpredictably, so every run of the
## notebook differs; put an integer here to make the results repeatable.
random_state = np.random.RandomState( seed = None )

In [None]:
## Randomly split the data: `test_size = 0.75` leaves a quarter of the samples
## for training and puts the rest into the test part.
split_ = cross_validation.train_test_split(
    mnist_data, mnist_labels, test_size = 0.75, random_state = random_state )
## The result is ordered as: train data, test data, train labels, test labels.
X_train, X_test = split_[ : 2 ]
y_train, y_test = split_[ 2 : ]

In [None]:
## Show the shapes of the train and the test parts of the split.
## The parenthesised form of `print` gives the same output on Python 2 and
## also runs on Python 3, where the bare print statement is a syntax error.
print( X_train.shape )
print( X_test.shape )

Draw and plot some random subset of the train dataset.

In [None]:
## Pick 400 distinct train samples at random. The permutation is drawn from the
## notebook-wide `random_state` (not from the global `np.random`), so that seeding
## that single generator makes this subset reproducible as well.
index_perm_ = random_state.permutation( X_train.shape[ 0 ] )
X = X_train[ index_perm_[ : 400 ] ]

In [None]:
## One 16x9 inch figure with a single set of axes for the image grid.
figure_ = plt.figure( figsize = ( 16, 9 ) )
axis = figure_.add_subplot( 111 )
axis.set_title( u"200 random digits MNIST train dataset." )
## Each row of `X` is rendered as an image with 28 rows by the `flex` helper.
fl.plot( axis, X, n = 200, shape = ( 28, -1 ),
         cmap = plt.cm.hot, interpolation = "nearest" )
plt.show( )

## Simple average pattern classifier

Generate averaged images

In [None]:
## Group the train samples by their class label.
train_digits_ = { label_ : X_train[ y_train == label_ ] for label_ in mnist_classes }

## Construct an aligned pair of arrays: row k of `average_digits` is the mean
## image of the class `average_labels[ k ]`. The labels come from `np.unique`,
## hence they are already sorted and no re-ordering is required afterwards.
## Neither the Python-2-only `iteritems` family nor the `np.int` alias (removed
## from NumPy 1.24) is needed for this.
average_labels = np.array( mnist_classes, dtype = int )
average_digits = np.vstack( [ train_digits_[ label_ ].mean( axis = 0 )
                              for label_ in average_labels ] )

In [None]:
## Show the per-class mean images side by side.
figure_ = plt.figure( figsize = ( 16, 9 ) )
axis = figure_.add_subplot( 111 )
axis.set_title( u"Averaged digits on train" )
fl.plot( axis, average_digits, shape = ( 28, -1 ),
         cmap = plt.cm.hot, interpolation = "nearest" )
plt.show( )

A simple minimal error classifier based on mean patterns.
$\newcommand{\argmin}{\mathop{\mathtt{argmin}}} \newcommand{\Dcal}{\mathcal{D}}$
Basically this is a simple gaussian model:
$$ \log \mathcal{L} = - \frac{|\Dcal|}{2} \log 2\pi - \frac{|\Dcal|}{2} \log \sigma^2
                      - \frac{1}{2 \sigma^2} \sum_{x\in \Dcal} ( x - \mu_{k_x} )^2 \,. $$

Classification is done with this rule:
\begin{align*}
    \hat{y}(x)
        & = \argmin_{k=1,\ldots,K} \, \sum_{n=1}^N \sum_{m=1}^M ( x_{nm} - \mu_{knm} )^2 \\
        & = \argmin_{k=1,\ldots,K} \, \sum_{n=1}^N \sum_{m=1}^M - 2 x_{nm} \mu_{knm} + \mu_{knm}^2 \\
        & = \argmin_{k=1,\ldots,K} \, \|\mu_k\|^2 - 2 \langle x, \mu_k \rangle \,,
\end{align*}
where
$$ \langle f, g \rangle  = \sum_{n=1}^N \sum_{m=1}^M f_{nm} g_{nm}\,. $$

In [None]:
## Squared norm of every mean pattern, ||mu_k||^2 -- one number per class.
## These are the DIAGONAL entries of the Gram matrix of `average_digits`;
## slicing its first row instead would yield <mu_0, mu_k> and bias the rule.
sq_norms_ = np.sum( average_digits ** 2, axis = 1 )

## Compute the squared error up to the ||x||^2 term, which is the same for all
## classes: yp_[ i, k ] = ||mu_k||^2 - 2 <x_i, mu_k> ...
yp_ = sq_norms_[ np.newaxis ] - 2 * np.dot( X_test, average_digits.T )
## ... and based on it find the closest label.
pred_labels_ = average_labels[ yp_.argmin( axis = 1 ) ]

Construct the confusion matrix

In [None]:
## `scipy.sparse` is a subpackage and has to be imported explicitly; the `sp`
## alias is kept, since it is used elsewhere in the notebook.
import scipy as sp, scipy.sparse

## Count the (predicted, actual) pairs: duplicate coordinates of a COO matrix
## are summed up when it is converted to a dense one. The shape is fixed
## explicitly so that the table stays square even if the largest label is never
## predicted. The labels themselves serve as the row and column indices.
n_classes_ = len( average_labels )
tbl_ = sp.sparse.coo_matrix( ( np.ones_like( pred_labels_ ), ( pred_labels_, y_test ) ),
                             shape = ( n_classes_, n_classes_ ) ).todense( )
tbl = pd.DataFrame( tbl_, index = average_labels, columns = average_labels )
tbl.index.name = "Predicted" ; tbl.columns.name = "Actual" ;

tbl

Compute the accuracy

In [None]:
## Share of the test samples with a correctly predicted label, in percent.
## The parenthesised form of `print` works on both Python 2 and Python 3.
accuracy_ = np.mean( y_test == pred_labels_ ) * 100.0
print( "Achieved accuracy is %.3f%%" % ( accuracy_, ) )

## Random Forest WTF!

Initialize and fit a random forest to the train dataset.

In [None]:
## A forest of 50 trees, grown on all available cores (`n_jobs = -1`) and
## driven by the shared random generator.
rfc_ = ensemble.RandomForestClassifier( n_estimators = 50, n_jobs = -1,
                                        random_state = random_state )
## `fit` returns the estimator itself, so the name keeps pointing at it.
rfc_ = rfc_.fit( X_train, y_train )

Predict on the test dataset.

In [None]:
## Class probabilities of every test sample, one column per class ...
proba_ = rfc_.predict_proba( X_test )
## ... and the label of the most probable class.
pred_labels_ = rfc_.classes_[ np.argmax( proba_, axis = 1 ) ]

Display the accuracy ...

In [None]:
## Share of the test samples with a correctly predicted label, in percent.
## The parenthesised form of `print` works on both Python 2 and Python 3.
accuracy_ = np.mean( y_test == pred_labels_ ) * 100.0
print( "Achieved accuracy is %.3f%%" % ( accuracy_, ) )

... and the confusion matrix.

In [None]:
## `scipy.sparse` is a subpackage and has to be imported explicitly; the `sp`
## alias is kept, since it is used elsewhere in the notebook.
import scipy as sp, scipy.sparse

## Count the (predicted, actual) pairs: duplicate coordinates of a COO matrix
## are summed up when it is converted to a dense one. The shape is fixed
## explicitly so that the table stays square even if the largest label is never
## predicted. The labels themselves serve as the row and column indices.
n_classes_ = len( average_labels )
tbl_ = sp.sparse.coo_matrix( ( np.ones_like( pred_labels_ ), ( pred_labels_, y_test ) ),
                             shape = ( n_classes_, n_classes_ ) ).todense( )
tbl = pd.DataFrame( tbl_, index = average_labels, columns = average_labels )
tbl.index.name = "Predicted" ; tbl.columns.name = "Actual" ;

tbl

In [None]:
## Show the importance of every pixel as a single image.
figure_ = plt.figure( figsize = ( 16, 9 ) )
axis = figure_.add_subplot( 111 )
axis.set_title( u"Random Forest feature importances" )
## The importances form a vector; a leading axis turns it into a one-row batch.
importances_ = rfc_.feature_importances_[ np.newaxis ]
fl.plot( axis, importances_, n_row = 1, n_col = 1, shape = ( 28, -1 ),
         cmap = plt.cm.hot, interpolation = "nearest" )
plt.show( )

## PCA

Let's use PCA to learn a linear manifold from the data. Scipy's SVD returns $U$, $\Sigma$ and $V'$ (not $V$).

In [None]:
## Standardise the features on the train data and take the thin SVD of the
## result. Note that the third factor is V' (principal directions in its rows).
scl_ = preprocessing.StandardScaler( ).fit( X_train )
U, S, V = sp.linalg.svd( scl_.transform( X_train ), full_matrices = False )

## Arrange all three factors by decreasing singular value.
order_ = np.argsort( S )[ ::-1 ]
U = U[ :, order_ ]
S = S[ order_ ]
V = V[ order_ ]

Compute variance within each principal direction.

In [None]:
## The variance along each principal direction is proportional to the squared
## singular value; `leverage_[ k ]` is the share of the total variance captured
## by the first k + 1 directions.
var_ = S ** 2
leverage_ = np.cumsum( var_ ) / np.sum( var_ )

## The first position where the share reaches 95% is a zero-based index, hence
## the number of components is that index plus one.
n_components = int( np.flatnonzero( leverage_ >= 0.95 )[ 0 ] ) + 1

## Adjacent literals are joined before `%` is applied, so the whole message is
## formatted at once (with a space between its two halves).
print( "The least number of principal components required to "
       "guarantee at least 95%% reconstruction is %d." % ( n_components, ) )

plt.plot( leverage_ )
plt.axhline( y = 0.95, color = 'k', lw = 2 )



Choose the number of components.

In [None]:
## Fix the number of retained components by hand; this replaces the value
## estimated from the 95% variance threshold in the previous cell.
n_components = 256

Extract the principal components

In [None]:
## Scores of the train samples: the leading left singular vectors, each column
## scaled by its singular value.
pc_ = np.multiply( U[ :, : n_components ], S[ np.newaxis, : n_components ] )

Embed the components in the original space.

In [None]:
## Map the scores back onto the leading directions (rows of `V`) and then undo
## the standardisation to return to the original pixel scale.
X_ = scl_.inverse_transform( pc_.dot( V[ : n_components ] ) )

The train digits reconstructed from the retained principal components are

In [None]:
## Show the images stored in `X_` (train data passed through the PCA).
figure_ = plt.figure( figsize = ( 16, 9 ) )
axis = figure_.add_subplot( 111 )
axis.set_title( u"Principal components of train dataset" )
fl.plot( axis, X_, n = 512, shape = ( 28, -1 ),
         cmap = plt.cm.hot, interpolation = "nearest" )
plt.show( )


See how well it fares on the test dataset.

In [None]:
## The leading principal directions learnt on the train data (a view of `V`).
V_top_ = V[ : n_components ]
## Project the standardised test samples onto them ...
pc_ = scl_.transform( X_test ).dot( V_top_.T )
## ... and map the scores back to the original pixel scale.
X_ = scl_.inverse_transform( pc_.dot( V_top_ ) )

In [None]:
## Show the images stored in `X_` (test data passed through the PCA).
figure_ = plt.figure( figsize = ( 16, 9 ) )
axis = figure_.add_subplot( 111 )
axis.set_title( u"Principal components of the test dataset" )
fl.plot( axis, X_, n = 512, shape = ( 28, -1 ),
         cmap = plt.cm.hot, interpolation = "nearest" )
plt.show( )

<hr/>

In [None]:
import lasagne
import theano
import theano.tensor as T

Implement a simple neural network with lasagne

In [None]:
## Symbolic placeholders of the computation graph: a 4D tensor named 'inputs'
## and an integer vector named 'targets'.
## NOTE(review): the 4 axes are presumably (batch, channel, row, column), to
## match the input layer defined in the next cell -- confirm.
input_var = T.tensor4( 'inputs' )
target_var = T.ivector( 'targets' ) 

In [None]:
## Shorthands for the lasagne namespaces used throughout this cell.
layers_, relu_ = lasagne.layers, lasagne.nonlinearities.rectify

## The input layer: batches of any size of single-channel 28x28 images.
l_in_ = layers_.InputLayer( shape = ( None, 1, 28, 28 ), input_var = input_var )

## The 2D 5x5 conv layer (32 filters) with ReLU and 2x2 max-pooling.
## Assuming the default unit stride, a "valid" 5x5 convolution turns 28x28 into
## (28-5+1) = 24x24 maps, and the pooling halves them to 12x12.
l_conv1_ = layers_.Conv2DLayer( l_in_, num_filters = 32, filter_size = ( 5, 5 ),
                                nonlinearity = relu_, border_mode = "valid",
                                W = lasagne.init.GlorotUniform( ) )
l_pool1_ = layers_.MaxPool2DLayer( l_conv1_, pool_size = ( 2, 2 ) )

## The 2D 3x3 conv layer (64 filters) with ReLU and 2x2 max-pooling, fed through
## a dropout with p = 0.2.
l_conv2_ = layers_.Conv2DLayer( layers_.dropout( l_pool1_, p = .2 ),
                                num_filters = 64, filter_size = ( 3, 3 ),
                                nonlinearity = relu_, border_mode = "valid",
                                W = lasagne.init.GlorotUniform( ) )
l_pool2_ = layers_.MaxPool2DLayer( l_conv2_, pool_size = ( 2, 2 ) )

## FC layer of 256 ReLU units with dropout (p = 0.5) on its input.
l_dense_ = layers_.DenseLayer( layers_.dropout( l_pool2_, p = .5 ),
                               num_units = 256, nonlinearity = relu_ )

## The output layer: a softmax over 10 units, again behind a dropout with p = 0.5.
network = layers_.DenseLayer( layers_.dropout( l_dense_, p = .5 ),
                              num_units = 10,
                              nonlinearity = lasagne.nonlinearities.softmax )


In [None]:
## Symbolic output of the network in the training mode (the default, i.e. not
## deterministic, call) ...
prediction = lasagne.layers.get_output( network )
## ... and the cross-entropy against the targets, averaged over the batch.
loss = T.mean( lasagne.objectives.categorical_crossentropy( prediction, target_var ) )

In [None]:
## All trainable parameters of the network ...
params = lasagne.layers.get_all_params( network, trainable = True )
## ... are updated by SGD with Nesterov momentum.
sgd_options_ = dict( learning_rate = 0.01, momentum = 0.9 )
updates = lasagne.updates.nesterov_momentum( loss, params, **sgd_options_ )

In [None]:
## Output of the network requested with `deterministic = True`, for evaluation.
test_prediction = lasagne.layers.get_output( network, deterministic = True )
test_loss = T.mean( lasagne.objectives.categorical_crossentropy( test_prediction, target_var ) )

## Accuracy: the share of samples whose most probable class equals the target,
## computed in theano's configured float type.
is_correct_ = T.eq( T.argmax( test_prediction, axis = 1 ), target_var )
test_acc = T.mean( is_correct_, dtype = theano.config.floatX )

In [None]:
## Both compiled functions take a batch of inputs and the matching targets.
batch_inputs_ = [ input_var, target_var ]
## A training step: returns the loss and applies the parameter updates.
train_fn = theano.function( batch_inputs_, loss, updates = updates )
## An evaluation step: returns the test loss and the accuracy, updates nothing.
val_fn = theano.function( batch_inputs_, [ test_loss, test_acc ] )