# Saving and restoring models

TensorFlow gives you the ability to save your progress using tf.train.Saver. This class provides the functionality to save any tf.Variable to your file system.



In [1]:
import tensorflow as tf

# Checkpoint location on disk
save_file = './model.ckpt'

# Randomly initialised parameters. `weights` is created before `bias`;
# neither is given an explicit name.
weights = tf.Variable(tf.truncated_normal([2, 3]))
bias = tf.Variable(tf.truncated_normal([3]))

# Saver object, created once both Variables exist
saver = tf.train.Saver()

with tf.Session() as sess:
    # Give every Variable its initial value
    sess.run(tf.global_variables_initializer())

    # Print each parameter under its heading
    for heading, variable in (('Weights:', weights), ('Bias:', bias)):
        print(heading)
        print(sess.run(variable))

    # Write the session's Variable values to the checkpoint file
    saver.save(sess, save_file)

Weights:
[[-1.43552387 -0.70340478 -0.78335518]
 [ 1.48438513 -1.12358761  1.57190645]]
Bias:
[ 0.03582934  1.22028112  0.41342628]


# Load the variables back

In [2]:
# Start again from an empty default graph
tf.reset_default_graph()

# Re-declare the two Variables, in the same order as in the saving cell
weights = tf.Variable(tf.truncated_normal([2, 3]))
bias = tf.Variable(tf.truncated_normal([3]))

# Saver used here to restore the Variables
saver = tf.train.Saver()

with tf.Session() as sess:
    # Fill the Variables from the checkpoint; no initializer is run
    saver.restore(sess, save_file)

    # Fetch both values, then print them under their headings
    restored_weights, restored_bias = sess.run([weights, bias])
    print('Weight:')
    print(restored_weights)
    print('Bias:')
    print(restored_bias)

Weight:
[[-1.43552387 -0.70340478 -0.78335518]
 [ 1.48438513 -1.12358761  1.57190645]]
Bias:
[ 0.03582934  1.22028112  0.41342628]


# Save a _trained_ model

In [3]:
# Clear the Tensors and Operations left in the default graph
tf.reset_default_graph()

from tensorflow.examples.tutorials.mnist import input_data
import numpy as np

# Hyperparameter and problem dimensions
learning_rate = 0.001
n_input = 784  # one flattened 28*28 MNIST image
n_classes = 10  # the digits 0-9

# Read the MNIST data sets, using '.' as the data directory and one-hot labels
mnist = input_data.read_data_sets('.', one_hot=True)

# Placeholders fed with a batch of images and the matching one-hot labels
features = tf.placeholder(tf.float32, [None, n_input])
labels = tf.placeholder(tf.float32, [None, n_classes])

# Parameters of the single linear layer
weights = tf.Variable(tf.random_normal([n_input, n_classes]))
bias = tf.Variable(tf.random_normal([n_classes]))

# Linear model: logits = xW + b
logits = tf.add(tf.matmul(features, weights), bias)

# Loss: mean softmax cross-entropy, minimised with plain gradient descent
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

# Accuracy: fraction of rows whose arg-max logit equals the arg-max label
predicted_class = tf.argmax(logits, 1)
true_class = tf.argmax(labels, 1)
correct_prediction = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

Extracting ./train-images-idx3-ubyte.gz
Extracting ./train-labels-idx1-ubyte.gz
Extracting ./t10k-images-idx3-ubyte.gz
Extracting ./t10k-labels-idx1-ubyte.gz


In [4]:
import math

save_file = './train_model.ckpt'
batch_size = 128
n_epochs = 100

# Number of mini-batches needed to cover the training set once. It depends
# only on the data set size and the batch size, so it is computed a single
# time here instead of being recomputed at the start of every epoch.
total_batch = math.ceil(mnist.train.num_examples / batch_size)

saver = tf.train.Saver()

# Launch the graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Training cycle
    for epoch in range(n_epochs):
        # Run one optimizer step per mini-batch
        for i in range(total_batch):
            batch_features, batch_labels = mnist.train.next_batch(batch_size)
            sess.run(
                optimizer,
                feed_dict={features: batch_features, labels: batch_labels})

        # Report validation accuracy on epochs 0, 10, 20, ...
        if epoch % 10 == 0:
            valid_accuracy = sess.run(
                accuracy,
                feed_dict={
                    features: mnist.validation.images,
                    labels: mnist.validation.labels})
            print('Epoch {:<3} - Validation Accuracy: {}'.format(
                epoch,
                valid_accuracy))

    # Save the model once all epochs have finished
    saver.save(sess, save_file)
    print('Trained Model Saved.')

Epoch 0   - Validation Accuracy: 0.09120000153779984
Epoch 10  - Validation Accuracy: 0.26179999113082886
Epoch 20  - Validation Accuracy: 0.42100000381469727
Epoch 30  - Validation Accuracy: 0.5070000290870667
Epoch 40  - Validation Accuracy: 0.5685999989509583
Epoch 50  - Validation Accuracy: 0.6132000088691711
Epoch 60  - Validation Accuracy: 0.6456000208854675
Epoch 70  - Validation Accuracy: 0.6710000038146973
Epoch 80  - Validation Accuracy: 0.6940000057220459
Epoch 90  - Validation Accuracy: 0.7089999914169312
Trained Model Saved.


# Load a trained model
We don't need to spend all that time retraining the model since we can just reload the parameters.

In [5]:
saver = tf.train.Saver()

# Feed the test images and labels into the graph's placeholders
test_feed = {features: mnist.test.images, labels: mnist.test.labels}

with tf.Session() as sess:
    # Load the Variables from the checkpoint instead of training again
    saver.restore(sess, save_file)
    test_accuracy = sess.run(accuracy, feed_dict=test_feed)

print('Test Accuracy: {}'.format(test_accuracy))

Test Accuracy: 0.718500018119812


# Loading weights and biases into new model

- Sometimes we want to adjust/finetune the parameters of a model that we've already trained and saved
- Loading saved variables directly into a modified model can lead to errors

## Naming error

- TensorFlow uses a string identifier called `name` for Tensors and Operations.
- If a name is not given, TF will create one automatically
- TF will give the first node the name `<Type>` and give the name `<Type>_<number>` for subsequent nodes

See how this can affect loading a model with a different order of `weights` and `bias`

**N.B.** the below code should yield an **error**

In [9]:
import tensorflow as tf

# Remove the previous weights and bias
tf.reset_default_graph()

save_file = './model.ckpt'

# Two Tensor Variables, created in the order weights -> bias and left
# unnamed so that TensorFlow assigns their names automatically
weights = tf.Variable(tf.truncated_normal([2, 3]))
bias = tf.Variable(tf.truncated_normal([3]))

saver = tf.train.Saver()

# Print the automatically assigned names of weights and bias
print('Save Weights: {}'.format(weights.name))
print('Save Bias:{}'.format(bias.name))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, save_file)

# Remove the previous weights and bias
tf.reset_default_graph()

# Same two Variables, but created in the opposite order (bias -> weights),
# so the automatically assigned names are swapped relative to the checkpoint
bias = tf.Variable(tf.truncated_normal([3]))
weights = tf.Variable(tf.truncated_normal([2, 3]))

saver = tf.train.Saver()

# Print the name of Weights and Bias
print('Load Weights: {}'.format(weights.name))
print('Load Bias: {}'.format(bias.name))

with tf.Session() as sess:
    # Load the weights and bias - ERROR expected.
    # The failure is the point of this cell, so catch it and display the
    # message instead of letting the exception abort a "Run All".
    try:
        saver.restore(sess, save_file)
    except tf.errors.InvalidArgumentError as err:
        print('InvalidArgumentError: {}'.format(err.message))

Above code results in
>Save Weights: Variable:0

>Save Bias:Variable_1:0

>Load Weights: Variable_1:0

>Load Bias: Variable:0

>InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [2,3] rhs shape= [3]

We need to set the name property manually instead of letting TF do it automatically.

In [7]:
import tensorflow as tf

tf.reset_default_graph()

save_file = './model.ckpt'

# Build the two Variables with explicit names, weights first
weights_init = tf.truncated_normal([2, 3])
weights = tf.Variable(weights_init, name='weights_0')
bias_init = tf.truncated_normal([3])
bias = tf.Variable(bias_init, name='bias_0')

saver = tf.train.Saver()

# Show the names the Variables carry when they are saved
print('Save Weights: {}'.format(weights.name))
print('Save Bias: {}'.format(bias.name))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, save_file)

# Discard the graph that was just saved
tf.reset_default_graph()

# Rebuild the Variables in the opposite order (bias first), reusing the
# same explicit names
bias_init = tf.truncated_normal([3])
bias = tf.Variable(bias_init, name='bias_0')
weights_init = tf.truncated_normal([2, 3])
weights = tf.Variable(weights_init, name='weights_0')

saver = tf.train.Saver()

# Show the names the Variables carry when they are loaded
print('Load Weights: {}'.format(weights.name))
print('Load Bias: {}'.format(bias.name))

with tf.Session() as sess:
    # Load the weights and bias - No Error
    saver.restore(sess, save_file)

print('Loaded Weights and Bias successfully.')

Save Weights: weights_0:0
Save Bias: bias_0:0
Load Weights: weights_0:0
Load Bias: bias_0:0
Loaded Weights and Bias successfully.


# How to prevent overfitting

- For deep networks, a network that is _just_ the right size for a dataset is very hard to optimize
- In practice, we always use networks that are _way_ too big for our data
- Then try our best to prevent them from overfitting

## Early Termination
- Look at performance on validation set and stop training as soon as we stop improving
<img src="./images/week6/earlyterm.png" alt="early-termination" style="width:400px">

## Regularization
- Add artificial constraints on our network 
- Implicitly reduce the number of free parameters
- While not making the network too difficult to train

### L2 Regularization
- Add another term to the loss that penalizes large weights
- Add the L2-norm of weights to the loss multiplied by small constant
- Yet another hyperparameter to tune
- The structure of the network does not change

<img src="./images/week6/L2.png" alt="L2-regularization" style="width:400px">

Derivative of the L2 penalty with respect to the weight vector

$$L_2 = \beta \frac{1}{2} \|w\|^2_2$$
$$L'_2 = \beta w $$

## Dropout
- For all activations going from one layer to the next
- Randomly set half of them to _zero_!

<img src="./images/week6/dropout1.png" alt="dropout" style="width:400px">

- Basically take half the data flowing through the network and destroy it
- Pioneered by G. Hinton

<img src="./images/week6/dropout2.png" alt="redundancy" style="width:400px">

- The network _can't_ rely on any given input 
- That input might be squashed via dropout
- It is forced to learn a **redundant representation** for everything
- Ensures that information will persist
- It makes the network act as if it is taking a consensus over an ensemble of inputs

<img src="./images/week6/dropout3.png" alt="consensus" style="width:400px">

### How do we deal with this redundant representation learned by dropout?
- We want the network to take the consensus of the activations
- This can be done via *average pooling* the activations

<img src="./images/week6/dropout4.png" alt="pooling" style="width:400px">

- We want $Y_e$ to be the **average** of all the $Y_t$ that we got during training
- Not only do we set half our activations randomly to zero
- Also scale the remaining activations by a **factor of 2**

$$Y_e \sim E(Y_t)$$

- We remove the factor scaling when averaging the activations


# Tensorflow Dropout

The `tf.nn.dropout()` function takes in two parameters:

1. `hidden_layer`: the tensor to which you would like to apply dropout
2. `keep_prob`: the probability of keeping (i.e. not dropping) any given unit


- `keep_prob` allows us to adjust the number of units to drop
- `tf.nn.dropout()` multiplies all units _not_ dropped by `1/keep_prob`
- A good `keep_prob` to use is `0.5`
- During testing, use a `keep_prob` of `1.0` to keep all units and maximize the power of the model

In [8]:
import tensorflow as tf

# Fixed parameter values: 4 inputs -> 3 hidden units -> 2 outputs
hidden_layer_weights = [
    [0.1, 0.2, 0.4],
    [0.4, 0.6, 0.6],
    [0.5, 0.9, 0.1],
    [0.8, 0.2, 0.8]]
out_weights = [
    [0.1, 0.6],
    [0.2, 0.1],
    [0.7, 0.9]]

# Index 0 holds the hidden layer's parameters, index 1 the output layer's
weights = [
    tf.Variable(hidden_layer_weights),
    tf.Variable(out_weights)]
biases = [
    tf.Variable(tf.zeros(3)),
    tf.Variable(tf.zeros(2))]

# Three input rows with four features each
features = tf.Variable([
    [0.0, 2.0, 3.0, 4.0],
    [0.1, 0.2, 0.3, 0.4],
    [11.0, 12.0, 13.0, 14.0]])

# Keep probability is supplied at run time through a placeholder
keep_prob = tf.placeholder(tf.float32)

# Hidden layer: linear -> ReLU -> dropout
hidden_linear = tf.add(tf.matmul(features, weights[0]), biases[0])
hidden_activated = tf.nn.relu(hidden_linear)
hidden_layer = tf.nn.dropout(hidden_activated, keep_prob)

# Output layer: linear only
logits = tf.add(tf.matmul(hidden_layer, weights[1]), biases[1])

# Evaluate the logits with a keep probability of 0.5 and print them
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    output = sess.run(logits, feed_dict={keep_prob: 0.5})
    print(output)

[[  7.67999935  15.05999947]
 [  0.71400005   0.91800004]
 [  9.56000042   4.78000021]]
