In [2]:
#%matplotlib inline
# IMPORTANT: need this line to show matplotlib plots!

# import necessary dependencies and MNIST dataset
#import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence scikit-learn's ConvergenceWarning for the rest of the notebook.
# The filter has to be installed at module level: a filter registered inside
# `with warnings.catch_warnings():` is thrown away as soon as the block exits,
# so it would never be active when the models are fitted in the later cells.
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")


# load MNIST dataset
# - the newest dataset version is picked implicitly
# - return_X_y=True returns the labels as a separate vector
# - as_frame=False forces NumPy arrays; recent scikit-learn releases return a
#   pandas DataFrame by default, on which the positional lookup X_train[0]
#   below would raise a KeyError
X_all, y_all = fetch_openml(name="mnist_784", return_X_y=True, as_frame=False)


testSize = 0.2 # could write trainSize = 0.8 as well. This value refers to the fraction of data which is partitioned off into the test set and NOT used to train the model.
randomState = 0  # ensure reproducibility of randomness

# Q4a. Here we split the data into 80% training, 20% testing, using our random state "randomState" to ensure consistency of results.
# The full dataset keeps its own names (X_all, y_all) so that re-running this split never feeds an already-split subset back in.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=testSize, random_state=randomState)


print(X_train[0]) # Q4b(i) display first sample
print("Training set size: {}" .format(len(y_train))) # Q4b(ii) display number of samples in training set
print("Testing set size: {}" .format(len(y_test))) # Q4b(iii) display number of samples in testing set




[  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.  86. 254. 202.   6.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.  42. 181. 253. 132.   5.   0.   0.   0.
   0. 

In some instances (e.g., when importing the same dataset from Keras) the data is 3-dimensional (number of samples × image width × image height) and must be reshaped into a 2-dimensional representation, in which each image becomes a vector of size image_width × image_height. In our dataset, both image_width and image_height are equal to 28, so the vectors have size 784, with each entry holding a pixel value from 0 to 255.

This is because the MLPClassifier.fit() function only accepts an array of shape (number of samples, number of features) and cannot handle 3-dimensional data. Here the number of features equals the number of pixels: 28 × 28 = 784.

If we use scikit-learn for this dataset, however, this step is not needed.

In [3]:
# Report the shape of every split so the 80/20 partition can be checked at a glance.
shape_reports = (
    ("X_train {}", X_train),
    ("y_train {}", y_train),
    ("X_test {}", X_test),
    ("y_test {}", y_test),
)
for template, array in shape_reports:
    print(template.format(array.shape))

# Kept for reference only: flattening step required when the images arrive as a
# 3-dimensional array (samples, width, height). The arrays printed above are
# already 2-dimensional, so the step stays disabled.
#nsamples, nx, ny = X_train.shape
#X_train = X_train.reshape((nsamples,nx*ny))

#nsamples_t, nx_t, ny_t = X_test.shape
#X_test = X_test.reshape((nsamples_t,nx_t*ny_t))

#print("New X_train shape {}" .format(X_train.shape))
#print("New X_test shape {}" .format(X_test.shape))

X_train (56000, 784)
y_train (56000,)
X_test (14000, 784)
y_test (14000,)


Here we create our model, a multi-layer perceptron (MLP).

In [4]:
# Baseline network.
# - activation: the hidden layers may use ReLU, sigmoid, hyperbolic tangent, etc.; ReLU is used here.
# - hidden_layer_sizes: an arbitrary choice of 2 hidden layers, 200 perceptrons in the 1st and 60 in the 2nd.
# - learning_rate: "constant" is the default; "invscaling" would shrink the rate slightly at every step.
#   NOTE: per the scikit-learn docs this option is only used when solver="sgd".
# - max_iter: number of epochs (maximum number of passes through the entire data set).
#   Too small and the model underfits; too big and it overfits (and takes too long to train).
maxIters = 20
baseline_params = dict(
    hidden_layer_sizes=(200, 60),
    max_iter=maxIters,
    activation="relu",
    learning_rate="constant",
    random_state=1,
)
model = MLPClassifier(**baseline_params)

# train the network on the training split
model.fit(X_train, y_train)

# Q4c(i) and Q4c(ii): report accuracy on the training split, then on the testing split
for template, features, labels in (
    ("Training accuracy: {}", X_train, y_train),
    ("Testing accuracy: {}", X_test, y_test),
):
    print(template.format(model.score(features, labels)))



Training accuracy: 0.9859285714285714
Testing accuracy: 0.96


Here we double the number of perceptrons in each hidden layer.

In [5]:
# Wider network: (400, 120) hidden units; every other setting is the same as
# for the (200, 60) configuration.
wide_params = dict(
    hidden_layer_sizes=(400, 120),
    max_iter=maxIters,
    activation="relu",
    learning_rate="constant",
    random_state=1,
)
model2 = MLPClassifier(**wide_params)

# train the network on the training split
model2.fit(X_train, y_train)

# report accuracy on the training split, then on the testing split
for template, features, labels in (
    ("Training accuracy: {}", X_train, y_train),
    ("Testing accuracy: {}", X_test, y_test),
):
    print(template.format(model2.score(features, labels)))



Training accuracy: 0.9924464285714286
Testing accuracy: 0.9690714285714286


Here we halve the number of perceptrons in each hidden layer.

In [6]:
# Narrower network: (100, 30) hidden units; every other setting is the same as
# for the (200, 60) configuration.
narrow_params = dict(
    hidden_layer_sizes=(100, 30),
    max_iter=maxIters,
    activation="relu",
    learning_rate="constant",
    random_state=1,
)
model3 = MLPClassifier(**narrow_params)

# train the network on the training split
model3.fit(X_train, y_train)

# report accuracy on the training split, then on the testing split
for template, features, labels in (
    ("Training accuracy: {}", X_train, y_train),
    ("Testing accuracy: {}", X_test, y_test),
):
    print(template.format(model3.score(features, labels)))



Training accuracy: 0.9802142857142857
Testing accuracy: 0.9565714285714285


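Keeping all other parameters the same and changing only the number of nodes in each hidden layer, we can see that in this experiment **increasing the number of nodes in each hidden layer raised the training and testing accuracy** (training 0.986 → 0.992, testing 0.960 → 0.969), while **decreasing the number of nodes in each hidden layer lowered the training and testing accuracy** (training 0.986 → 0.980, testing 0.960 → 0.957). Note that each configuration was trained only once with a single random seed, so these differences are indicative rather than conclusive.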