# TensorBoard

Uses a simple model (not our best model).

* Graph
    * network scoping is important
* Summaries
    * scalar
    * image
    * text
    * histogram

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import collections

In [2]:
# Display the installed TensorFlow version (the recorded output shows 1.2.1).
tf.__version__

'1.2.1'

In [3]:
# Graph-level seed for the current default graph; does not ensure perfect reproducibility.
# NOTE(review): the seed is stored on the graph object, so a later
# tf.reset_default_graph() starts a fresh graph without it -- re-seed after
# resetting if deterministic initialization matters.
tf.set_random_seed(0)

In [4]:
# Load the MNIST dataset from the MNIST_data/ directory with one-hot encoded labels.
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [5]:
# Short alias for the TF-Slim API used by build_net.
slim = tf.contrib.slim

In [6]:
def as_text_matrix(dic):
    """Turn a dict into a key-sorted list of ``[key, str(value)]`` rows.

    The result is a rectangular list of rows, suitable for conversion into a
    2-D string tensor (e.g. for a text summary).
    """
    rows = []
    for key, value in sorted(dic.items()):
        # Values are stringified so that every cell has the same type.
        rows.append([key, str(value)])
    return rows

In [7]:
# same as the slim-BN model
def build_net(name, config):
    """Build a small conv + batch-norm MNIST classifier together with its summaries.

    Everything is created inside ``tf.variable_scope(name)`` in the current
    default graph.

    Args:
        name: Variable-scope name that wraps the whole network.
        config: Dict from which 'bn_scale', 'bn_decay' and 'learning_rate' are read.

    Returns:
        Tuple ``(X, y, training, train_op, summary_op, accuracy, loss)``:
        the three placeholders, the Adam training op, the merged summary op,
        and the scalar accuracy and mean cross-entropy loss tensors.
    """
    
    with tf.variable_scope(name):
#         config_summary = tf.summary.text('config',  tf.convert_to_tensor(as_text_matrix(config)), collections=[])
        # Placeholders: flattened 784-value images and 10-way label vectors.
        with tf.variable_scope("input"):
            X = tf.placeholder(tf.float32, [None, 784], name='X')
            y = tf.placeholder(tf.float32, [None, 10], name='y')
        # Feed-time switch, forwarded to batch norm as `is_training`.
        training = tf.placeholder(tf.bool, name='training')

        # Reshape the flat vectors to (batch, 28, 28, 1) and log them as an image summary.
        net = tf.reshape(X, [-1, 28, 28, 1])
        tf.summary.image('X', net)

        n_filters = 32
        bn_param = {'is_training': training, 'scale': config['bn_scale'], 'decay': config['bn_decay']}
        # arg_scope gives every slim.conv2d below a 3x3 kernel and batch normalization.
        with slim.arg_scope([slim.conv2d], kernel_size=[3,3],
                            normalizer_fn=slim.batch_norm, normalizer_params=bn_param):
            # Two conv + max-pool blocks; the filter count doubles after each (32, then 64).
            for i in range(2):
                with tf.variable_scope('conv-block{}'.format(i)):
                    net = slim.conv2d(net, n_filters)
                    net = slim.max_pool2d(net, kernel_size=[2,2], padding='same')

                    n_filters *= 2

        # Linear classifier head: 10 raw logits (no activation).
        with tf.variable_scope('fc'):
            flat = slim.flatten(net)
            logits = slim.fully_connected(flat, 10, activation_fn=None)

        # Softmax probabilities; not returned, but present in the graph under the name 'prob'.
        prob = tf.nn.softmax(logits, name='prob')

        with tf.variable_scope('accuracy'):
            correct = tf.equal(tf.argmax(logits, axis=1), tf.argmax(y, axis=1))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        with tf.variable_scope('loss'):
            loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y)
            loss = tf.reduce_mean(loss)

        # Make the train op depend on everything in the UPDATE_OPS collection so those
        # ops run with each training step (presumably the batch-norm moving-average
        # updates registered by slim.batch_norm -- confirm against the slim docs).
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = tf.train.AdamOptimizer(learning_rate=config['learning_rate']).minimize(loss)

        tf.summary.scalar('acc', accuracy)
        tf.summary.scalar('loss', loss)
        # heavy operation: one histogram summary per trainable variable
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        # Merge the summaries collected so far (default summaries collection) into one op.
        summary_op = tf.summary.merge_all()
        
        return X, y, training, train_op, summary_op, accuracy, loss

## Summaries

* Generally, the official code evaluates results from checkpoints, so it does not need two `summary_writer`s or two `summary_op`s.

Other usages:

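(TODO: check — `tf.summary.merge_all(key)` takes a collection key rather than a tag prefix, so the summaries below would also need a matching `collections=[...]` argument.)
Two `summary_op`s: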
```python
tf.summary.scalar('test/acc', accuracy)
tf.summary.scalar('test/loss', loss)
test_summary_op = tf.summary.merge_all('test/')
tf.summary.scalar('train/acc', accuracy)
tf.summary.scalar('train/loss', loss)
train_summary_op = tf.summary.merge_all('train/')
```

`summary.value`:
```python
precision_at_1 = count_top_1 / total_eval_count
recall_at_5 = count_top_5 / total_eval_count
summary = tf.Summary()
summary.value.add(tag='eval/Accuracy@1', simple_value=precision_at_1)
summary.value.add(tag='eval/Recall@5', simple_value=recall_at_5)
summary_writer.add_summary(summary, global_step)
```

In [8]:
def run_with_config(epoch_n, batch_size, learning_rate, bn_decay, bn_scale, reset_summary=False, seed=0):
    """Train and evaluate one MNIST model, writing TensorBoard summaries.

    Builds a fresh graph with ``build_net``, trains it on the module-level
    ``mnist`` dataset and writes train/test summaries to
    ``./summary/mnist-tutorial/<run name>/{train,test}``. Per-epoch metrics and
    the average of the last (up to) five test accuracies are printed.

    Args:
        epoch_n: Number of passes over the training set.
        batch_size: Mini-batch size; must be between 1 and the number of
            training examples.
        learning_rate: Learning rate handed to ``build_net`` through the config.
        bn_decay: Batch-norm decay handed to ``build_net`` through the config.
        bn_scale: Batch-norm ``scale`` flag handed to ``build_net`` through the config.
        reset_summary: If True, delete ``./summary/mnist-tutorial`` before running.
        seed: Graph-level random seed applied after the graph reset. A seed set
            before ``tf.reset_default_graph()`` belongs to the discarded graph,
            so it has to be re-applied here. Pass None to leave the graph unseeded.

    Raises:
        ValueError: If ``batch_size`` would yield zero iterations per epoch.
    """
    N = mnist.train.num_examples
    # Fail early (before touching the summary directory) instead of hitting a
    # ZeroDivisionError when the per-epoch averages are computed.
    if batch_size < 1 or batch_size > N:
        raise ValueError('batch_size must be in [1, {}], got {}'.format(N, batch_size))
    n_iter = N // batch_size

    ### CONFIG
    # for text summary
    name = 'tb-epoch{}-batch{}-lr{}-decay{}-scale{}'.format(epoch_n, batch_size, learning_rate, bn_decay, bn_scale)
    config = {
        'name': name,
        'epoch_n': epoch_n,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'bn_decay': bn_decay,
        'bn_scale': bn_scale
    }
    print('run {} ...'.format(name))

    ### BUILD NET
    # With the same network scope, the graphs overlap in TensorBoard.
    # With different network scopes, the graph of each network is shown separately.
    tf.reset_default_graph()
    if seed is not None:
        # The graph-level seed lives on the graph object, so it must be set on the new graph.
        tf.set_random_seed(seed)
    X, y, training, train_op, summary_op, accuracy, loss = build_net('tb-tutorial', config=config)

    # This is also an op, so strictly it belongs inside the scope created by build_net.
    # `collections=[]` keeps it out of the default GraphKeys.SUMMARIES collection, so it
    # is not part of the merged summary_op and is written only once below.
    config_summary = tf.summary.text('config',
                                     tf.convert_to_tensor(as_text_matrix(config)),
                                     collections=[])

    ### RUN
    if reset_summary and tf.gfile.Exists('./summary/mnist-tutorial'):
        tf.gfile.DeleteRecursively('./summary/mnist-tutorial')

    dir_name = './summary/mnist-tutorial/' + name
    dq = collections.deque(maxlen=5)  # keeps only the most recent five test accuracies
    global_step = 0

    # The context manager closes the session even if training raises, so repeated
    # runs do not accumulate open sessions.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        train_writer = tf.summary.FileWriter(dir_name + '/train', graph=sess.graph, flush_secs=10)
        test_writer = tf.summary.FileWriter(dir_name + '/test', flush_secs=10)
        try:
            # warning issue: https://github.com/tensorflow/tensorboard/issues/124
            # MetaGraph + tf.summary.text => warning (fixed in tf 1.3)
            train_writer.add_summary(config_summary.eval(session=sess))
            # test_writer.add_summary(config_summary.eval(session=sess))
            # Writing the text summary through test_writer did not work (reason unknown);
            # the text summary seemed to be unstable in this TF version.

            for epoch in range(epoch_n):
                avg_loss = 0.
                avg_acc = 0.
                for _ in range(n_iter):
                    batch_x, batch_y = mnist.train.next_batch(batch_size)
                    _, cur_acc, cur_loss, cur_summary = sess.run([train_op, accuracy, loss, summary_op],
                                                                 {X: batch_x, y: batch_y, training: True})
                    avg_acc += cur_acc
                    avg_loss += cur_loss

                    train_writer.add_summary(cur_summary, global_step=global_step)
                    global_step += 1

                avg_acc /= n_iter
                avg_loss /= n_iter

                # Evaluate on the full test set once per epoch, with training=False.
                feed_dict = {X: mnist.test.images, y: mnist.test.labels, training: False}
                test_acc, test_loss, cur_summary = sess.run([accuracy, loss, summary_op], feed_dict=feed_dict)
                test_writer.add_summary(cur_summary, global_step=global_step)

                print("[{:2}/{}] (train) acc: {:.2%}, loss: {:.3f} | (test) acc: {:.2%}, loss: {:.3f}".
                      format(epoch+1, epoch_n, avg_acc, avg_loss, test_acc, test_loss))
                dq.append(test_acc)
        finally:
            # Always flush and release the event files, even on failure.
            train_writer.close()
            test_writer.close()

    # With no completed epoch there is nothing to average; report NaN without a numpy warning.
    score = np.average(dq) if dq else float('nan')
    print("average of last 5 test acc: {:.2%}\n".format(score))

In [9]:
# Hyperparameter sweep; each row is (epoch_n, batch_size, learning_rate, bn_decay, bn_scale).
sweep = [
    (10, 200, 0.002, 0.99, True),
    (10, 300, 0.003, 0.99, True),
    (10, 100, 0.001, 0.99, True),
    (10, 100, 0.001, 0.999, True),
    (10, 100, 0.001, 0.99, False),
]
for run_idx, run_args in enumerate(sweep):
    # Only the first run clears previously written summaries.
    run_with_config(*run_args, reset_summary=(run_idx == 0))

run tb-epoch10-batch200-lr0.002-decay0.99-scaleTrue ...
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'
[ 1/10] (train) acc: 94.32%, loss: 0.206 | (test) acc: 57.75%, loss: 1.166
[ 2/10] (train) acc: 98.25%, loss: 0.058 | (test) acc: 96.42%, loss: 0.112
[ 3/10] (train) acc: 98.63%, loss: 0.043 | (test) acc: 96.78%, loss: 0.101
[ 4/10] (train) acc: 98.92%, loss: 0.034 | (test) acc: 98.24%, loss: 0.056
[ 5/10] (train) acc: 99.04%, loss: 0.030 | (test) acc: 98.28%, loss: 0.056
[ 6/10] (train) acc: 99.12%, loss: 0.027 | (test) acc: 98.18%, loss: 0.057
[ 7/10] (train) acc: 99.41%, loss: 0.018 | (test) acc: 98.62%, loss: 0.048
[ 8/10] (train) acc: 99.28%, loss: 0.021 | (test) acc: 98.67%, loss: 0.053
[ 9/10] (train) acc: 99.51%, loss: 0.015 | (test) acc: 98.97%, loss: 0.033
[10/10] (train) acc: 99.63%, loss: 0.011 | (test) acc: 98.66%, loss: 0.045
average of last 5 test acc: 98.62%

run tb-epoch10