# Chapter 11 Training Deep Neural Nets

## Batch Normalization with Tensorflow

In [1]:
import tensorflow as tf

# Layer sizes: flattened 28x28 inputs, two hidden layers, ten output classes.
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

# Input data placeholder; the leading None leaves the batch size unspecified.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")

# Tells batch normalization whether to normalize with the current mini-batch's
# statistics (training=True) or with the moving statistics accumulated during
# training (training=False, i.e. when the network is actually being used).
# Defaults to False so that it only has to be fed explicitly while training.
training = tf.placeholder_with_default(False, shape=(), name="training")

# First hidden layer, densely connected to the input. No activation is passed
# here because batch normalization is applied before the activation function.
hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1")
# Batch-normalize the hidden1 output, with a momentum term for the moving averages.
bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
# Apply the activation function (ELU here) to the normalized output.
bn1_act = tf.nn.elu(bn1)

# Same again for the second hidden layer, fed by the normalized and activated
# output of the first one. It must be sized with n_hidden2, not n_hidden1.
hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2")
bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)
bn2_act = tf.nn.elu(bn2)

# Finally the output layer: raw logits first, then batch-normalized logits.
logits_pre_bn = tf.layers.dense(bn2_act, n_outputs, name="outputs")
logits = tf.layers.batch_normalization(logits_pre_bn, training=training, momentum=0.9)


  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Repeating the same keyword arguments on every batch_normalization call is
# tedious; functools.partial pre-binds them once.

from functools import partial

my_batch_norm_layer = partial(
    tf.layers.batch_normalization,
    momentum=0.9,
    training=training,
)

# Each batch-norm line then only needs the layer it normalizes, e.g.
bn1 = my_batch_norm_layer(hidden1)

In [None]:
# Assumes the usual init, optimizer, training_op, accuracy and saver nodes
# already exist, along with y, n_epochs, batch_size and the mnist dataset.

# Batch normalization needs two extra things while training:
#   1. the `training` placeholder must be fed True, and
#   2. some extra operations must be run on every training step so that the
#      moving averages get updated; they are gathered automatically in the
#      UPDATE_OPS collection.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            # The deviation from a plain training loop: also run extra_update_ops
            # AND override the training flag to True.
            sess.run([training_op, extra_update_ops], feed_dict={training: True, X: X_batch, y: y_batch})
        # When evaluating (using the network rather than training it) the training
        # flag is left at its default of False. Note this is the validation set.
        accuracy_val = accuracy.eval(feed_dict={X: mnist.validation.images, y: mnist.validation.labels})
        print(epoch, "Test accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")

## Gradient clipping

In [4]:
learning_rate = 0.01

# Gradients are clipped to the range [-threshold, +threshold].
threshold = 1.0

# Create an optimizer as normal...
optimizer = tf.train.GradientDescentOptimizer(learning_rate)

# ...but instead of calling minimize(), ask it for the (gradient, variable) pairs.
# Assumes `loss` has been defined by the model-building code.
grads_and_vars = optimizer.compute_gradients(loss)

# Cap each gradient on both the negative and positive side. compute_gradients
# returns None as the gradient of any variable the loss does not depend on;
# tf.clip_by_value cannot take None, and apply_gradients ignores such pairs
# anyway, so they are skipped here.
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
              for grad, var in grads_and_vars
              if grad is not None]

# Use the capped gradients to adjust the parameters.
training_op = optimizer.apply_gradients(capped_gvs)

NameError: name 'loss' is not defined

## Transfer learning in Tensorflow

In [None]:
# import_meta_graph loads the operations of the previously saved model into the
# default graph and returns a Saver that can later restore the model's state.
saver = tf.train.import_meta_graph("./my_model_final.ckpt.meta")
# Alternatively, with access to the original source code, the graph could be
# rebuilt by running that code instead.

# get_tensor_by_name fetches tensors of the pretrained model.
# NB a tensor's name is the name of the op that produces it (including any name
# scope) followed by ":" and the index of the op's output, e.g. "X:0".
# If the names are not documented, write the graph out and explore it in
# TensorBoard, or list the ops with the graph's get_operations() method.
X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
accuracy = tf.get_default_graph().get_tensor_by_name("eval/accuracy:0")

# Operations are fetched with get_operation_by_name (no ":index" suffix).
training_op = tf.get_default_graph().get_operation_by_name("GradientDescent")

In [None]:
# Tip for model authors: make reuse easier by registering the useful handles
# (tensors and operations) in a named collection.
for node in (X, y, accuracy, training_op):
    tf.add_to_collection("my_important_ops", node)

# Anyone reusing the model can then fetch them all in one call.
X, y, accuracy, training_op = tf.get_collection("my_important_ops")

# The saver restores the model state inside a session, as usual.
with tf.Session() as sess:
    saver.restore(sess, "./my_model_final.ckpt")
    # ... then train whichever parts are needed on the new data
    # train the various bits you want with your own data

It is also possible to import model params from other frameworks (e.g. Theano) but it gets more painful.
Once you have the model recreated as a TF graph and the weights loaded from the other model (this will require code from the original framework), you can then use each variable's Assign operation to initialize it:

In [None]:
# Suppose original_w and original_b were loaded from the other framework and the
# graph has been built, so both models have a "hidden1" layer.
graph = tf.get_default_graph()

# Every variable has an Assign op that sets its initial value.
assign_kernel = graph.get_operation_by_name("hidden1/kernel/Assign")
assign_bias = graph.get_operation_by_name("hidden1/bias/Assign")

# The second input of an Assign op is the value that gets assigned, i.e. the
# variable's initialization value.
init_kernel = assign_kernel.inputs[1]
init_bias = assign_bias.inputs[1]

init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Feeding the initialization tensors replaces the default initial values
    # with the weights taken from the other framework.
    sess.run(init, feed_dict={init_kernel: original_w, init_bias: original_b})
    # ... train the model on new task
    # ... train the model on new task

The above will work to start from the pre-trained model then continue to learn all the parameters.

Training can be sped up further if we can:

### Freeze lower layers
Assume that the lower layers of the DNN are already sufficiently trained (e.g. they already work well for edge detection in an image classifier), so that only the parameters of the higher layers need further training.

In Tensorflow there are two ways to achieve this:

In [None]:
# Explicitly give the optimizer the list of variables it is allowed to alter.
# `scope` is a regular expression: it selects the trainable variables of
# hidden3, hidden4 and the output layer, leaving the lower layers frozen.
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="hidden[34]|outputs")
training_op = optimizer.minimize(loss, var_list=train_vars)

In [None]:
# Alternative: add a stop_gradient op to the graph so that nothing below it is trained.
# This is a schematic sketch: each `...` stands in for the real layer arguments.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(...)  # reused layer, frozen
    hidden2 = tf.layers.dense(...)  # reused layer, frozen
    # Stop gradients at this point so the layers up to here (hidden1, hidden2) stay frozen.
    hidden2_stop = tf.stop_gradient(hidden2)
    hidden3 = tf.layers.dense(hidden2_stop, ...) # reused layer that is NOT frozen, or an entirely new layer on top
    ...

### Caching the frozen layers
For the frozen lower layers and a given dataset, the output of these layers is the same on every run for each example. We can therefore get a speed-up by computing that output once, caching it, and feeding the cached version in during training rather than recomputing it from scratch each time. Obviously, with new examples we must make sure not to use this cache!

In [None]:
import numpy as np

# Assumes init, restore_saver, saver, training_op, hidden2, X, y, n_epochs,
# batch_size and the mnist dataset were defined by earlier code.
n_batches = mnist.train.num_examples // batch_size

with tf.Session() as sess:
    init.run()
    # Restore the previously trained model (TensorFlow checkpoint, ".ckpt").
    restore_saver.restore(sess, "./my_model_final.ckpt")

    # Compute the output of the frozen layers ONCE for the whole training set.
    h2_cache = sess.run(hidden2, feed_dict={X: mnist.train.images})

    for epoch in range(n_epochs):
        # Shuffle the training examples.
        shuffled_idx = np.random.permutation(mnist.train.num_examples)
        # Build the batches from the cached frozen-layer output (not the raw
        # training images); the same permutation keeps labels aligned with it.
        hidden2_batches = np.array_split(h2_cache[shuffled_idx], n_batches)
        y_batches = np.array_split(mnist.train.labels[shuffled_idx], n_batches)
        # Run the training op on each batch, feeding the cached values directly
        # into hidden2 instead of feeding X, so the lower layers are not recomputed.
        for hidden2_batch, y_batch in zip(hidden2_batches, y_batches):
            sess.run(training_op, feed_dict={hidden2: hidden2_batch, y: y_batch})

    # Save the new model: the frozen old layers plus the further-trained upper
    # layers (and possibly a new topology on top).
    save_path = saver.save(sess, "./my_new_model_final.ckpt")