# Hands On ML Chapter 11 - Training Deep Neural Nets

In [1]:
from functools import partial

import numpy as np
import tensorflow as tf

## Vanishing/Exploding Gradients Problems

By default, tf.layers.dense() uses Xavier initialization. You can switch to He initialization like this:

In [2]:
#he_init = tf.contrib.layers.variance_scaling_initializer()
#hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, kernel_initializer=he_init, name='hidden1')

ELU:

In [3]:
#hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name="hidden1")

Leaky ReLU:

In [4]:
def leaky_relu(z, name=None, alpha=0.01):
    """Leaky ReLU activation: ``max(alpha * z, z)``.

    Args:
        z: Input tensor (typically the pre-activation output of a layer).
        name: Optional name for the resulting TensorFlow operation.
        alpha: Slope applied to negative inputs. The default of 0.01 keeps
            the original behavior. It should lie in [0, 1) so that
            ``alpha * z`` is the larger value only where ``z`` is negative.

    Returns:
        The result of ``tf.maximum(alpha * z, z)``.
    """
    return tf.maximum(alpha * z, z, name=name)

#hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name="hidden1")

### Batch Normalization

Batch Normalization greatly reduces the risk that the vanishing/exploding gradients problems come back during training.

In [5]:
# Network dimensions: flattened 28x28 inputs, two hidden layers, 10 outputs.
n_inputs, n_hidden1, n_hidden2, n_outputs = 28 * 28, 300, 100, 10

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")

# Boolean switch for batch normalization; it defaults to False (testing).
# Feed True during training so the layers normalize with the statistics of
# the current mini-batch; with False they use the statistics accumulated
# during training instead.
training = tf.placeholder_with_default(input=False, shape=(), name="training")

# Every fully connected layer is followed by batch normalization, and the
# ELU activation is applied only after the normalization step.
hidden1 = tf.layers.dense(inputs=X, units=n_hidden1, name="hidden1")
bn1 = tf.layers.batch_normalization(inputs=hidden1, training=training, momentum=0.9)
bn1_act = tf.nn.elu(features=bn1)

hidden2 = tf.layers.dense(inputs=bn1_act, units=n_hidden2, name="hidden2")
bn2 = tf.layers.batch_normalization(inputs=hidden2, training=training, momentum=0.9)
bn2_act = tf.nn.elu(features=bn2)

# The output layer is batch-normalized as well, but gets no activation here.
logits_before_bn = tf.layers.dense(inputs=bn2_act, units=n_outputs, name="outputs")
logits = tf.layers.batch_normalization(inputs=logits_before_bn, training=training, momentum=0.9)

To avoid repeating the same arguments over and over, you can use partial() from Python's standard functools module:

In [2]:
# Bind the arguments shared by every batch-normalization layer once.
# NOTE: this cell relies on `X`, `training` and the n_* sizes being defined
# already (see the previous cell); in a fresh kernel it raises a NameError.
my_batch_norm_layer = partial(tf.layers.batch_normalization, training=training, momentum=0.9)

hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1')
bn1 = my_batch_norm_layer(hidden1)
bn1_act = tf.nn.elu(bn1)
# Each layer must consume the batch-normalized, activated output of the
# previous one; feeding `hidden1`/`hidden2` directly would bypass both the
# normalization and the ELU activation.
hidden2 = tf.layers.dense(bn1_act, n_hidden2, name='hidden2')
bn2 = my_batch_norm_layer(hidden2)
bn2_act = tf.nn.elu(bn2)
logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
logits = my_batch_norm_layer(logits_before_bn)

NameError: name 'training' is not defined

Execution phase (for more details, see the previous chapter about training an ANN with Gradient Descent):

In [None]:
# Batch normalization registers the operations that update its moving
# averages in the UPDATE_OPS collection; they are not run automatically, so
# they are fetched here and executed together with the training op.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

# `init`, `n_epochs`, `batch_size`, `mnist`, `y`, `training_op`, `accuracy`
# and `saver` are expected to be defined as in the previous chapter.
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for _ in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            # training=True: normalize with the current mini-batch statistics.
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})

        # Evaluate once per epoch rather than after every mini-batch: running
        # the whole test set on each iteration is very slow and only repeats
        # the same epoch number in the output. `training` is not fed here, so
        # it keeps its default value (False).
        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
                                                y: mnist.test.labels})
        print(epoch, "Test accuracy:", accuracy_val)

        # The checkpoint is overwritten at the end of every epoch.
        save_path = saver.save(sess, "./my_model_final.ckpt")

### Gradient Clipping

Gradient Clipping is not as popular as Batch Normalization, but it is worth knowing. It computes the gradients, then clips them between -threshold and threshold before applying them.

In [None]:
# Clip every gradient to the range [-threshold, threshold] before applying it.
# `learning_rate` and `loss` are expected to be defined earlier.
threshold = 1.0
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
# compute_gradients() returns None as the gradient of any variable the loss
# does not depend on; tf.clip_by_value() cannot handle None, so those pairs
# are skipped (there is nothing to apply for them anyway).
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
              for grad, var in grads_and_vars
              if grad is not None]
training_op = optimizer.apply_gradients(capped_gvs)

## Reusing Pretrained Layers

It is generally not a good idea to train a very large DNN from scratch. A better idea is to find an existing network that solves a similar task and reuse part of it (typically its lower layers) for your own task.

### Reusing a TensorFlow Model

You can restore a previously saved TensorFlow model by following these steps:

In [3]:
# Import the saved operations into the default graph.
saver = tf.train.import_meta_graph("./my_model_final.ckpt.meta")

# Get handles on the tensors and operations needed for training.
# In a tensor name such as "X:0", "X" is the name of the operation and 0 is
# the index of its output (0 = first output, 1 = second output, and so on).
default_graph = tf.get_default_graph()
X = default_graph.get_tensor_by_name("X:0")
y = default_graph.get_tensor_by_name("y:0")
accuracy = default_graph.get_tensor_by_name("eval/accuracy:0")
training_op = default_graph.get_operation_by_name("GradientDescent")

# If you do not know the names of the operations, you can look them up in
# TensorBoard or simply list them:
for operation in default_graph.get_operations():
    print(operation.name)

# If you are the author of the original model, give the operations clear
# names or document them well. You can also create a collection containing
# all the important operations:
for important_op in (X, y, accuracy, training_op):
    tf.add_to_collection("my_important_ops", important_op)
# People who reuse the model can then simply write:
X, y, accuracy, training_op = tf.get_collection("my_important_ops")

# In general you will reuse only part of the original model, typically the
# lower layers. The code below restores only hidden layers 1, 2 and 3.

# [....] build the new model with the same hidden layers 1-3 as before

# The scope argument is a regular expression matching hidden1, hidden2, hidden3.
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden[123]")
# Map each variable's name in the checkpoint to the variable to restore.
reuse_vars_dict = {variable.op.name: variable for variable in reuse_vars}
restore_saver = tf.train.Saver(reuse_vars_dict)  # restores layers 1-3 only

init = tf.global_variables_initializer()  # initializes all variables, old and new
saver = tf.train.Saver()  # saves the new model

with tf.Session() as sess:
    # Initialize everything first, then overwrite the reused layers with the
    # values stored in the checkpoint.
    sess.run(init)
    restore_saver.restore(sess, "./my_model_final.ckpt")
    # [....] train the model
    save_path = saver.save(sess, "./my_new_model_final.ckpt")

The more similar the tasks are, the more layers you can reuse. For very similar tasks, you may only need to replace the output layer.

### Reusing Models from Other Frameworks

If the model was trained with another Python framework (for example Theano), you will need to load the model parameters manually. The code below is an example of how to copy the weights and biases from the first hidden layer of a model trained with another framework.

In [None]:
# Placeholders: replace `[...]` with the code that loads the parameters.
original_w = [...]  # load the weights from the other framework
original_b = [...]  # load the biases from the other framework

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name='hidden1')
# [...] build the rest of the model

# Get a handle on the assignment operations that initialize the variables of
# hidden1. The second input of an Assign operation is the value that gets
# assigned to the variable, i.e. its initialization value.
graph = tf.get_default_graph()
assign_kernel = graph.get_operation_by_name("hidden1/kernel/Assign")
assign_bias = graph.get_operation_by_name("hidden1/bias/Assign")
init_kernel = assign_kernel.inputs[1]
init_bias = assign_bias.inputs[1]

init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Feeding the initialization values replaces the default initializers of
    # hidden1 with the parameters loaded from the other framework.
    sess.run(init, feed_dict={init_kernel: original_w, init_bias: original_b})
    # [...] train the model on your new task
    #[...] train the model on your new task

### Freezing the Lower Layers

Imagine you reuse an old DNN picture classifier to classify new pictures. The lower layers of the DNN have learned to detect low-level features in pictures, which will also be useful for your new task. It is generally a good idea to freeze the weights of the pretrained lower layers when training the new DNN.

In [None]:
# Collect only the trainable variables of hidden layers 3 and 4 and of the
# output layer (the scope argument is a regular expression).
train_vars = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                               scope="hidden[34]|outputs")
# The optimizer only updates the variables listed in var_list, so hidden
# layers 1 and 2 stay frozen.
training_op = optimizer.minimize(loss=loss, var_list=train_vars)

Another option is to add a stop_gradient() layer to the graph. Any layer below it will be frozen.

In [None]:
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")  # reused, frozen
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")  # reused, frozen
    # Gradients do not flow back through this point, so the layers below it
    # (hidden1 and hidden2) are not updated during training.
    hidden2_stop = tf.stop_gradient(hidden2)
    hidden3 = tf.layers.dense(hidden2_stop, n_hidden3, activation=tf.nn.relu, name="hidden3")  # reused, not frozen
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")  # new
    # The output layer takes the hidden4 tensor as input (not its size n_hidden4).
    logits = tf.layers.dense(hidden4, n_outputs, name="outputs")  # new

### Caching the Frozen Layers

Since the frozen layers don't change, you can cache the output of the topmost frozen layer for each training instance. This gives a huge speed boost, because each training instance goes through the frozen layers only once (instead of once per epoch). Then feed the training operation with these cached outputs.

In [None]:
# `mnist`, `batch_size`, `n_epochs`, `init`, `restore_saver`, `saver`, `X`,
# `y`, `hidden2` and `training_op` are expected to be defined earlier.
n_batches = mnist.train.num_examples // batch_size

with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_model_final.ckpt")

    # Run the whole training set through the frozen layers once and keep the
    # output of hidden2.
    h2_cache = sess.run(hidden2, feed_dict={X: mnist.train.images})

    for epoch in range(n_epochs):
        # Shuffle the cached outputs and the labels with the same permutation
        # so that they stay aligned, then split both into mini-batches.
        shuffled_idx = np.random.permutation(mnist.train.num_examples)
        hidden2_batches = np.array_split(h2_cache[shuffled_idx], n_batches)
        y_batches = np.array_split(mnist.train.labels[shuffled_idx], n_batches)
        for hidden2_batch, y_batch in zip(hidden2_batches, y_batches):
            # Feeding hidden2 directly means the layers below it are not
            # evaluated again.
            sess.run(training_op, feed_dict={hidden2: hidden2_batch, y: y_batch})

        # The checkpoint is overwritten at the end of every epoch.
        save_path = saver.save(sess, "./my_new_model_final.ckpt")