## 1. Import Related Modules and Packages

In [1]:
import tensorflow as tf
# Alias for TensorFlow's contrib eager-execution helpers (not referenced by
# any of the cells visible in this notebook).
tfe = tf.contrib.eager
# Switch TensorFlow to eager execution so operations run immediately and
# return concrete values instead of building a graph.
# NOTE(review): TF 1.x expects this to be called once, at program startup,
# before any other TensorFlow op is created -- confirm for the installed version.
tf.enable_eager_execution()

import numpy as np
import matplotlib.pyplot as plt
import time

## 2. Hyperparameters

In [2]:
# Each image is fed to the RNN row by row: one time step consumes one row,
# so the number of time steps equals the number of rows in an image.
TIME_STEPS = 28
# Input size per time step = number of columns in an image
INPUT_SIZE = 28
# Size of the hidden state, i.e. the number of RNN units
HIDDEN_SIZE = 50
# Output size = number of classes
OUTPUT_SIZE = 10

# Number of samples per batch
BATCH_SIZE = 50
# Number of epochs (full passes over the training set)
NUM_EPOCH = 1

# Learning rate handed to the optimizer
LEARNING_RATE = 0.001

## 3. Prepare the Data

### 3.1. Load Data

In [3]:
def _print_split_summary(title, data, labels):
    """Print the shape and dtype of one dataset split (samples and labels)."""
    print(title)
    print('       Data :\t shape:', data.shape, '\t type:', data.dtype)
    print('       Label:\t shape:', labels.shape, '\t\t type:', labels.dtype)


# Load the MNIST train/test splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(path='mnist.npz')

# Report what was loaded so the shapes/dtypes used below are visible.
_print_split_summary('Training set:', x_train, y_train)
_print_split_summary('Testing set :', x_test, y_test)

Training set:
       Data :	 shape: (60000, 28, 28) 	 type: uint8
       Label:	 shape: (60000,) 		 type: uint8
Testing set :
       Data :	 shape: (10000, 28, 28) 	 type: uint8
       Label:	 shape: (10000,) 		 type: uint8


### 3.2. Data Preprocessing

In [4]:
# NOTE: this cell overwrites x_train/x_test/y_train/y_test in place, so it is
# not idempotent. Re-run the data-loading cell before re-running this one,
# otherwise the pixels are rescaled twice and one-hot encoding is applied to
# labels that are already one-hot.

# Cast pixels to float32 and rescale them from [0, 255] to [0, 1]
x_train = x_train.astype(np.float32)/255
x_test = x_test.astype(np.float32)/255
print(np.shape(x_train), np.shape(x_test))

# Transform integer labels to one-hot vectors. The depth comes from the
# OUTPUT_SIZE hyperparameter so the label width always matches the width of
# the model's output layer.
y_train = tf.one_hot(y_train, OUTPUT_SIZE, dtype=tf.float32)
y_test = tf.one_hot(y_test, OUTPUT_SIZE, dtype=tf.float32)
print(y_train, y_test)

(60000, 28, 28) (10000, 28, 28)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]], shape=(60000, 10), dtype=float32) tf.Tensor(
[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(10000, 10), dtype=float32)


### 3.3. Generate Dataset

In [5]:
# Training pipeline: slice the arrays into (image, label) pairs, shuffle them
# through a 5000-element buffer, then group them into fixed-size batches.
# drop_remainder=True keeps every batch at exactly BATCH_SIZE samples.
TrainDataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=5000)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Test pipeline: same slicing and batching, but without shuffling.
TestDataset = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(BATCH_SIZE, drop_remainder=True)
)

## 4. Build RNN

### 4.1. Build RNN

In [6]:
class Model(tf.keras.Model):
    """SimpleRNN classifier: one recurrent layer followed by one dense layer.

    The model returns the raw output of the dense layer (logits); no softmax
    is applied.

    Args:
        hidden_size: number of units of the SimpleRNN layer.
        output_size: number of units of the final Dense layer.
    """

    def __init__(self, hidden_size, output_size):
        super(Model, self).__init__()
        self.units = hidden_size

        # Recurrent layer settings:
        #   units            - size of the hidden state (HIDDEN_SIZE in this notebook).
        #   unroll           - True unrolls the recurrence over the time axis.
        #                      Unrolling can speed up an RNN but tends to be more
        #                      memory-intensive, so it only suits short sequences.
        #                      This notebook builds the model with a fully
        #                      specified input shape
        #                      [BATCH_SIZE, TIME_STEPS, INPUT_SIZE] for this setting.
        #   stateful         - False: the last state of one batch is not used as
        #                      the initial state of the next batch. Stateful mode
        #                      is only meant for samples cut successively from
        #                      one long sequence.
        #   return_sequences - False: only the state of the last time step is
        #                      returned.
        self.rnn = tf.keras.layers.SimpleRNN(
            units=hidden_size,
            unroll=True,
            stateful=False,
            return_sequences=False,
        )

        # Maps the last hidden state to one value per class. The model outputs
        # logits; to output probabilities instead, add:
        #    self.softmax = tf.keras.layers.Softmax()
        self.fc = tf.keras.layers.Dense(output_size)

    def call(self, inputs):
        """Run the RNN over `inputs` and project its last state to logits."""
        # Hidden state at the last time step,
        # shape == (BATCH_SIZE, HIDDEN_SIZE)
        last_state = self.rnn(inputs)

        # Dense projection of that state,
        # shape == (BATCH_SIZE, OUTPUT_SIZE)
        logits = self.fc(last_state)

        # To return probabilities, add:
        #    logits = self.softmax(logits)
        return logits

### 4.2. Loss Function

In [7]:
def Loss(y_pred, y_true):
    '''
    Softmax cross-entropy between predicted logits and one-hot labels.

    Input:
        y_pred - [BATCH_SIZE, NUM_CLASS], logits produced by the model
        y_true - [BATCH_SIZE, NUM_CLASS], one-hot encoded labels
    Output:
        the value returned by tf.losses.softmax_cross_entropy
    '''
    # 'tf.losses.softmax_cross_entropy' is used because the labels are one-hot
    # vectors. If the labels were integer class indices instead, the matching
    # call would be 'tf.losses.sparse_softmax_cross_entropy'.
    cross_entropy = tf.losses.softmax_cross_entropy(logits=y_pred, onehot_labels=y_true)
    return cross_entropy

### 4.3. Accuracy Evaluator

In [8]:
def Accuracy(y_pred, y_true):
    '''
    Fraction of samples whose highest-scoring class matches the label.

    Input:
        y_pred - [BATCH_SIZE, NUM_CLASS], model outputs (one score per class)
        y_true - [BATCH_SIZE, NUM_CLASS], one-hot encoded labels
    Output:
        float32 tensor holding the mean of the per-sample hit indicators
    '''
    # Class index per sample: position of the largest entry along axis 1.
    true_class = tf.argmax(y_true, 1)
    pred_class = tf.argmax(y_pred, 1)

    # 1.0 where the prediction equals the label, 0.0 otherwise.
    hits = tf.cast(tf.equal(true_class, pred_class), tf.float32)
    return tf.reduce_mean(hits)

### 4.4. Optimizer

In [9]:
# Adam optimizer: only the learning rate is passed explicitly, every other
# argument keeps its default value.
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)

### 4.5. Training

In [10]:
# Instantiate the RNN classifier with the configured layer sizes.
model = Model(HIDDEN_SIZE, OUTPUT_SIZE)

# The model is built explicitly before training so that
#     1. it can be printed by model.summary()
#     2. reset_states() can be called
# In tf 1.11 model.build(INPUT_SHAPE) can be called directly; in earlier
# versions a dummy input has to be fed to build the model:
#    dummy_x = tf.zeros((BATCH_SIZE, TIME_STEPS, INPUT_SIZE))
#    model._set_inputs(dummy_x)

# For unroll=True (the setting used by Model): build with a fully specified
# input shape, including the batch dimension.
model.build((BATCH_SIZE, TIME_STEPS, INPUT_SIZE))
# For unroll=False the batch dimension can be left unspecified:
#    model.build((None, TIME_STEPS, INPUT_SIZE))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       multiple                  3950      
_________________________________________________________________
dense (Dense)                multiple                  510       
Total params: 4,460
Trainable params: 4,460
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Training loop: one pass over TrainDataset per epoch, followed by an
# evaluation pass over TestDataset.
for epoch in range(NUM_EPOCH):
    start = time.time()

    # The SimpleRNN layer is created with stateful=False, so no hidden state
    # is carried from one batch to the next and nothing has to be fed back
    # into the model. The call is kept so the loop stays correct if the layer
    # is switched to stateful=True; its return value is not used.
    model.reset_states()

    for (batch, (x, y)) in enumerate(TrainDataset):
        with tf.GradientTape() as tape:
            # The forward pass and the loss must run inside the tape so that
            # they are recorded for differentiation.
            y_pred = model(x)
            loss = Loss(y_pred, y)

        # Differentiate with respect to the trainable weights only and apply
        # one optimizer step.
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if batch % 500 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,
                                                          batch,
                                                          loss))

    # NOTE: `loss` is the loss of the last training batch of the epoch, not
    # an average over the epoch.
    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
    print ('Time taken for 1 epoch {} sec'.format(time.time() - start))

    # Testing at the end of every epoch.
    # With unroll=True the model was built for batches of BATCH_SIZE samples,
    # so the test set is evaluated batch by batch and the per-batch results
    # are averaged.
    cost = []
    accuracy = []
    for x, y in TestDataset:
        y_pred = model(x)
        cost.append(Loss(y_pred, y))
        accuracy.append(Accuracy(y_pred, y))
    # With unroll=False the whole test set could be evaluated in one call:
    #    y_pred = model(x_test)
    #    cost = Loss(y_pred, y_test)
    #    accuracy = Accuracy(y_pred, y_test)
    print('Test Cost: ', np.mean(cost))
    print('Test Accuracy: ', np.mean(accuracy))

Epoch 1 Batch 0 Loss 2.4299
Epoch 1 Batch 500 Loss 0.7754
Epoch 1 Batch 1000 Loss 0.4583
Epoch 1 Loss 0.3785
Time taken for 1 epoch 33.06204104423523 sec
Test Cost:  0.4297131
Test Accuracy:  0.8733
