In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
import tensorflow as tf
import keras
# Print the installed Keras version so the recorded results can be tied to a specific release.
print(keras.__version__)


Using TensorFlow backend.


2.2.4


# Load data

In [2]:
import mnist
# Load the four dataset arrays returned by the helper, in the order they are
# unpacked here: training inputs, training labels, test inputs, test labels.
# NOTE(review): `mnist` is an external helper module not shown in this notebook;
# the shape printout in the next cell suggests each image is already flattened
# to 784 values — confirm against the helper.
x_train, t_train, x_test, t_test = mnist.load()

# Normalize the data and make one-hot array for each label

In [3]:
from keras.utils import to_categorical

# Scale the pixel values by 255.
# NOTE(review): presumably the raw values are 0-255 so this maps them to [0, 1] — confirm against mnist.load().
xtrain = x_train / 255
xtest = x_test / 255

# One-hot encode the labels. num_classes is pinned to the 10 digit classes:
# without it, to_categorical infers the width from the largest label present,
# so a split lacking the highest digit would produce a narrower array than
# the 10-unit output layer of the model expects.
ttrain = to_categorical(t_train, num_classes=10)
ttest = to_categorical(t_test, num_classes=10)

# Sanity check: feature-matrix shape and the decoded label of the first sample in each split.
print(xtrain.shape, ttrain[0].argmax())
print(xtest.shape, ttest[0].argmax())

(60000, 784) 5
(10000, 784) 7


# Import Keras and set the model
First hidden layer:
* We create the first hidden layer, with 80 neurons, by calling the `Dense` function. Note that we must set **input_dim** to specify the input dimension. In this case, each digit has 784 pixels, so the input dimension is 784.
* We use 'relu' (rectifier) as the activation function in the first hidden layer.

Second hidden layer:
* We call Dropout(0.5) to drop 50% of the inputs to the second hidden layer. Dropout is a regularization technique which helps to reduce overfitting.
* We set the second hidden layer with 30 neurons, and apply the 'relu' activation.

Output layer:
* We set the output layer with 10 neurons (digit 0-9), and apply softmax on the output.


In [4]:

# Feed-forward classifier: 784 inputs -> 80 ReLU units -> dropout -> 30 ReLU units -> 10-way softmax.
model = Sequential()

# First hidden layer; input_dim declares the length of each input vector.
model.add(Dense(80, input_dim=784))
model.add(Activation(tf.nn.relu))

# Dropout with rate 0.5 between the two hidden layers.
model.add(Dropout(0.5))

# Second hidden layer.
model.add(Dense(30))
model.add(Activation(tf.nn.relu))

# Output layer: 10 units followed by softmax.
model.add(Dense(10))
model.add(Activation(tf.nn.softmax))


# Model training 
* __SGD__:
Before compiling the model, we create an SGD (stochastic gradient descent) optimizer.
    * *lr* is the learning rate
    * *decay* is the learning rate decay (applied after each update, i.e. each batch)
    * we set *momentum* to 0.9 and *nesterov* to true.
    
    Note: We set _momentum_ and _nesterov_ because SGD with momentum can help escape local optima and push toward the global optimum, and Nesterov momentum results in good acceleration.


* __Model.compile__ is to compile the model:
    * We set the loss to *categorical_crossentropy*, which is suited to multi-class, single-label classification.
    * We set the optimizer to the sgd optimizer we created above.
    * And we assign 'accuracy' to *metrics* to show the accuracy for each epoch.


* __Model.fit__: After compiling our model, we can now fit it to the training data (xtrain) and labels (ttrain); we set the batch size to 128 and run 8 epochs in total.

# result
We can see the time and accuracy of each epoch. As the result shows, the accuracy reaches 0.93 after the *8th* epoch.


In [5]:
# Optimizer: SGD with Nesterov momentum; every hyperparameter is passed by keyword.
sgd = SGD(
    lr=0.01,
    momentum=0.9,
    decay=1e-6,
    nesterov=True,
)

# Configure training: categorical cross-entropy loss over the one-hot labels,
# with accuracy reported as a metric.
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 8 epochs in mini-batches of 128.
# The call is kept as the cell's last expression so the returned History object is displayed.
model.fit(x=xtrain, y=ttrain, batch_size=128, epochs=8)


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fe135cd5f28>

# Evaluation

We use **model.evaluate** to evaluate the model on the testing data (xtest) and labels (ttest), and then print the score: the first item is the loss, which is about 0.13, and the second item is the accuracy, which is about 0.96. We spent 6 seconds training the model to at least 90% accuracy, which is good performance. Note that by tuning parameters such as the learning rate, batch size, or number of epochs, it may be possible to accelerate the training process and achieve higher accuracy.

In [6]:
# Evaluate on the held-out test split; `score` holds the loss followed by the
# compiled metrics, in the same order as model.metrics_names.
score = model.evaluate(x=xtest, y=ttest, batch_size=128)
print(score)

# Report the second entry (the first compiled metric) as a percentage with two decimals.
print("{}: {:.2f}".format(model.metrics_names[1], 100 * score[1]))

[0.12691179723739623, 0.96220000000000006]
acc: 96.22


# Other way to add layer in models

In [7]:
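# Alternative: build a model incrementally with model.add() instead of passing a list of layers.
# Dense(64) is a fully-connected layer with 64 hidden units.
# In the first layer, you must specify the expected input data shape;
# this illustrative example uses 20-dimensional vectors (for the MNIST data above it would be 784).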

#model = Sequential()
#model.add(Dense(64, activation='relu', input_dim=20))
#model.add(Dropout(0.5))
#model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.5))
#model.add(Dense(10, activation=tf.nn.softmax))