In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Start over with an empty default TensorFlow graph and fixed seeds.

    Seeds NumPy's global random generator, clears the default graph and
    sets the TensorFlow random seed, all with the same ``seed`` value.
    """
    np.random.seed(seed)
    tf.reset_default_graph()
    # Seed only after the reset so the seed is set on the fresh default graph.
    tf.set_random_seed(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes of the axis labels and of the tick labels
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Figures are saved under <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/
CHAPTER_ID = "deep"
PROJECT_ROOT_DIR = "."

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG file.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``. The target
    directory is created when it does not exist yet.

    Parameters
    ----------
    fig_id : str
        File name of the figure, without the ``.png`` extension.
    tight_layout : bool, default True
        Whether to call ``plt.tight_layout()`` before saving.
    """
    fig_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # savefig does not create missing parent directories; the isdir check
    # (rather than exist_ok=True) keeps this working on Python 2 as well.
    if not os.path.isdir(fig_dir):
        os.makedirs(fig_dir)
    path = os.path.join(fig_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

## Assignment 8.1 

Build a DNN with five hidden layers of 100 neurons each, He initialization, and the ELU activation function

In [None]:
import tensorflow as tf

# Network dimensions
n_inputs = 28 * 28  # size of one flattened input vector
n_hidden1 = n_hidden2 = n_hidden3 = n_hidden4 = n_hidden5 = 100
n_outputs = 5  # one output per class

# Kernel initializer shared by all dense layers
he_init = tf.contrib.layers.variance_scaling_initializer()

In [None]:
from functools import partial

# Dense-layer factory: ELU activation and `he_init` kernel initialization
# unless a call overrides these keyword arguments.
dense_layer = partial(
    tf.layers.dense,
    kernel_initializer=he_init,
    activation=tf.nn.elu,
)

In [None]:
reset_graph()

# Inputs: one flattened image per row
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
# Integer class labels. `(None)` is just `None` (unknown rank); the
# one-element tuple `(None,)` declares the intended 1-D label vector.
y = tf.placeholder(tf.int64, shape=(None,), name='y')

with tf.name_scope('dnn'):
    # Five hidden layers created by `dense_layer`, then a linear output layer
    hidden1 = dense_layer(X, n_hidden1, name='hidden1')
    hidden2 = dense_layer(hidden1, n_hidden2, name='hidden2')
    hidden3 = dense_layer(hidden2, n_hidden3, name='hidden3')
    hidden4 = dense_layer(hidden3, n_hidden4, name='hidden4')
    hidden5 = dense_layer(hidden4, n_hidden5, name='hidden5')
    # activation=None: the softmax is applied inside the loss op below
    logits = dense_layer(hidden5, n_outputs, activation=None, name='outputs')

with tf.name_scope('loss'):
    # Per-instance cross-entropy between the labels and the softmax of the logits
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')

## Assignment 8.2

Using Adam optimization and early stopping, try training it on MNIST but only on digits 0 to 4, as we will use transfer learning for digits 5 to 9 in the next exercise. You will need a softmax output layer with five neurons, and as always make sure to save checkpoints at regular intervals and save the final model so you can reuse it later.

`Adam optimization`: adaptive moment estimation; it combines the ideas of Momentum optimization and RMSProp.

`Momentum optimization`: keep track of an exponentially decaying average of past gradients.

`RMSProp`: keep track of an exponentially decaying average of past squared gradients.

In [None]:
learning_rate = 0.001

with tf.name_scope('train'):
    # Adam optimizer minimizing the mean cross-entropy `loss`
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [None]:
with tf.name_scope('eval'):
    # True for every instance whose target class is among the top-1 logits
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    # Fraction of instances classified correctly
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
# Read the MNIST data sets, using /tmp/data/ as the data directory
mnist = input_data.read_data_sets(train_dir='/tmp/data/')

In [None]:
# Keep only the instances whose label is below 5 (digits 0 to 4)
train_mask = mnist.train.labels < 5
test_mask = mnist.test.labels < 5

X_train, y_train = mnist.train.images[train_mask], mnist.train.labels[train_mask]
X_test, y_test = mnist.test.images[test_mask], mnist.test.labels[test_mask]

In [None]:
def shuffle_split(X, y, n_batches, seed=42):
    """Yield shuffled mini-batches that together cover ``X`` and ``y`` once.

    The row indices are permuted with a private random generator and split
    into ``n_batches`` nearly equal parts. Using a private generator instead
    of re-seeding ``np.random`` leaves NumPy's global random state untouched;
    with the default seed the batches are the same as before.

    Parameters
    ----------
    X, y : ndarray
        Inputs and labels, indexed along their first axis with the same
        index array.
    n_batches : int
        Number of batches to produce (must be at least 1).
    seed : int or None, default 42
        Seed of the permutation. Pass a different value per epoch to get a
        different batch order, or ``None`` for a non-deterministic order.

    Yields
    ------
    (X_batch, y_batch) : tuple of ndarray
    """
    rng = np.random.RandomState(seed)
    rnd_idx = rng.permutation(len(X))
    for i_idx in np.array_split(rnd_idx, n_batches):
        yield X[i_idx], y[i_idx]

In [None]:
n_epochs = 50
batch_size = 50
n_batches = len(X_train) // batch_size
best_loss = float('inf')
patience = 2  # tolerated number of consecutive epochs without improvement
cnt_patience = 0

init = tf.global_variables_initializer()

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_split(X_train, y_train, n_batches):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Training accuracy is measured on the last mini-batch of the epoch only
        accuracy_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        # NOTE(review): early stopping monitors the test set here; a separate
        # validation split would keep the reported test score unbiased.
        accuracy_test, loss_test = sess.run([accuracy, loss],
                                            feed_dict={X: X_test, y: y_test})
        print(epoch, 'loss', loss_test, 'accuracy_train:', accuracy_train, 'accuracy_test:', accuracy_test)
        if loss_test < best_loss:
            best_loss = loss_test
            # An improvement restarts the count, so that only consecutive
            # epochs without progress trigger early stopping.
            cnt_patience = 0
        else:
            cnt_patience += 1
            if cnt_patience > patience:
                print('Early stopping!')
                break

Alternatively we can use `tf.keras.callbacks.EarlyStopping`.

## Assignment 8.3

Tune the hyperparameters using cross-validation and see what precision you can achieve.

### Order to tune hyperparameters in Neural Networks

Answers from [stackoverflow](https://stackoverflow.com/questions/37467647/in-what-order-should-we-tune-hyperparameters-in-neural-networks):

**My general order is:**

- Batch size, as it will largely affect the training time of future experiments.

- Architecture of the network:

    - Number of neurons in the network
    - Number of layers

- Rest (dropout, L2 reg, etc.)

**Dependencies:**

I'd assume that the optimal values of

- learning rate and batch size
- learning rate and number of neurons
- number of neurons and number of layers

strongly depend on each other. I am not an expert on that field though.

**As for your hyperparameters:**

- For the Adam optimizer: "Recommended values in the paper are eps = 1e-8, beta1 = 0.9, beta2 = 0.999." (source)
- For the learning rate with Adam and RMSProp, I found values around 0.001 to be optimal for most problems.
- As an alternative to Adam, you can also use RMSProp, which reduces the memory footprint by up to 33%. See this answer for more details.
- You could also tune the initial weight values (see All you need is a good init). Although, the Xavier initializer seems to be a good way to prevent having to tune the weight inits.
- I don't tune the number of iterations / epochs as a hyperparameter. I train the net until its validation error converges. However, I give each run a time budget.

### Create custom estimator with scikit learn

The following refers to [scikit learn documentation](http://scikit-learn.org/dev/developers/contributing.html#rolling-your-own-estimator) and a blog article from [Daniel Hnyk](http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/).

**Estimator types**

Some common functionality depends on the kind of estimator passed. For different tasks, you can choose:

- ClassifierMixin
- RegressorMixin
- ClusterMixin

**get_params and set_params**

All estimators must have `get_params` and `set_params` functions. They are inherited from `BaseEstimator`.

**Pipeline compatibility**

For an estimator to be usable together with `pipeline.Pipeline` in any but the last step, it needs to provide a `fit` or `fit_transform` function. 

To be able to evaluate the pipeline on any data but the training set, it also needs to provide a `transform` function.


In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import tensorflow as tf
from functools import partial
import numpy as np


class DnnClassifier(BaseEstimator, ClassifierMixin):
    """Dense neural network exposed as a scikit-learn classifier.

    The network has five hidden layers of ``n_neuron`` units with ELU
    activation and a linear output layer. It is trained with Adam
    (learning rate 0.001) on the sparse softmax cross-entropy.

    Parameters
    ----------
    batch_size : int, default 50
        Target size of the mini-batches used by ``fit``.
    n_neuron : int, default 100
        Number of units in each hidden layer.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen by ``fit``.
    n_outputs : int
        Number of classes, i.e. the width of the output layer.
    n_batches : int
        Number of mini-batches per epoch used by the last ``fit``.
    """

    def __init__(self,
                 batch_size=50,
                 n_neuron=100):

        self.batch_size = batch_size
        self.n_neuron = n_neuron

    def reset_graph(self, seed=42):
        """Reset the default TensorFlow graph and seed TensorFlow and NumPy."""
        tf.reset_default_graph()
        tf.set_random_seed(seed)
        np.random.seed(seed)

    def _close_session(self):
        """Close the session of a previous ``fit``, if there is one."""
        session = getattr(self, '_session', None)
        if session is not None:
            session.close()
            self._session = None

    def _check_is_fitted(self):
        """Raise ``NotFittedError`` when ``fit`` has not been called yet."""
        if getattr(self, '_session', None) is None:
            # NotFittedError derives from both ValueError and AttributeError,
            # so callers catching the former AttributeError keep working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This %s instance is not fitted yet; call 'fit' first."
                % type(self).__name__)

    def _build_graph(self, n_inputs, n_outputs):
        """Build the network and store handles to the nodes used later.

        Note that this resets the global default TensorFlow graph
        (see ``reset_graph``).
        """
        self.reset_graph()

        X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
        # `(None)` is just `None` (unknown rank); `(None,)` declares the
        # intended 1-D vector of class indices.
        y = tf.placeholder(tf.int64, shape=(None,), name='y')

        he_init = tf.contrib.layers.variance_scaling_initializer()

        dense_layer = partial(tf.layers.dense, activation=tf.nn.elu, kernel_initializer=he_init)

        with tf.name_scope('dnn'):
            layer = X
            # hidden1 ... hidden5
            for i_layer in range(1, 6):
                layer = dense_layer(layer, self.n_neuron, name='hidden%d' % i_layer)
            logits = dense_layer(layer, n_outputs, activation=None, name='outputs')

        with tf.name_scope('softmax'):
            y_proba = tf.nn.softmax(logits, axis=1, name='y_proba')

        with tf.name_scope('loss'):
            xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
            loss = tf.reduce_mean(xentropy, name='loss')

        learning_rate = 0.001

        with tf.name_scope('train'):
            optimizer = tf.train.AdamOptimizer(learning_rate)
            training_op = optimizer.minimize(loss)

        with tf.name_scope('eval'):
            correct = tf.nn.in_top_k(logits, y, 1)
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        self._training_op = training_op
        self._accuracy = accuracy
        self._X = X
        self._y = y
        self._logits = logits
        self._y_proba = y_proba

    def shuffle_split(self, X, y, n_batches, seed=42):
        """Yield shuffled mini-batches that together cover ``X`` and ``y`` once.

        The permutation comes from a private generator seeded with ``seed``,
        so NumPy's global random state is left untouched.
        """
        rng = np.random.RandomState(seed)
        rnd_idx = rng.permutation(len(X))
        for i_idx in np.array_split(rnd_idx, n_batches):
            yield X[i_idx], y[i_idx]

    def fit(self, X, y, n_epochs=5):
        """Train the network on ``X`` and ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of n_samples labels
            Arbitrary labels are supported: they are mapped to the indices
            0..n_classes-1 for training and mapped back by ``predict``.
        n_epochs : int, default 5
            Number of passes over the training set.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if len(X) == 0:
            raise ValueError("Cannot fit on an empty training set.")

        # At least one batch, even when there are fewer rows than batch_size
        self.n_batches = max(1, len(X) // self.batch_size)

        n_inputs = X.shape[1]
        # The loss expects class indices in [0, n_classes), so train on the
        # position of each label in classes_ rather than on the raw label.
        self.classes_, y_idx = np.unique(y, return_inverse=True)
        self.n_outputs = len(self.classes_)

        # Release the session of a previous fit before the graph is reset
        self._close_session()
        self._build_graph(n_inputs, self.n_outputs)

        init = tf.global_variables_initializer()

        self._session = tf.Session()
        with self._session.as_default() as sess:
            init.run()
            for epoch in range(n_epochs):
                # A different seed per epoch gives a different, but still
                # reproducible, batch order in every epoch.
                for X_batch, y_batch in self.shuffle_split(X, y_idx, self.n_batches,
                                                           seed=42 + epoch):
                    feed_dict = {self._X: X_batch, self._y: y_batch}
                    sess.run(self._training_op, feed_dict=feed_dict)
                # Accuracy on the last mini-batch of the epoch only
                accuracy_train = self._accuracy.eval(feed_dict=feed_dict)
                print(epoch, 'accuracy_train:', accuracy_train)

        return self

    def predict_proba(self, X):
        """Return the class probabilities, shape (n_samples, n_classes).

        Column ``j`` corresponds to the label ``classes_[j]``.
        """
        self._check_is_fitted()
        return self._session.run(self._y_proba, feed_dict={self._X: X})

    def predict(self, X):
        """Return the most probable label per instance.

        The result is an int32 column vector of shape (n_samples, 1).
        """
        class_indices = np.argmax(self.predict_proba(X), axis=1)
        return self.classes_[class_indices].astype(np.int32).reshape(-1, 1)

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
# Reload MNIST and take the whole training set; this rebinds X_train and
# y_train, which held only the instances with a label below 5 before.
mnist = input_data.read_data_sets(train_dir='/tmp/data/')

X_train, y_train = mnist.train.images, mnist.train.labels