In [1]:
%cd /home/kangway/cifar10

/home/kangway/cifar10


Hardware: 2 × NVIDIA GeForce GTX 1080 Ti GPUs. The `num_gpus` flag in the script below defaults to 2 to match.

In [2]:
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""A binary to train CIFAR-10 using multiple GPU's with synchronous updates.

Accuracy:
cifar10_multi_gpu_train.py achieves ~86% accuracy after 100K steps (256
epochs of data) as judged by cifar10_eval.py.

Speed: With batch_size 128.

System        | Step Time (sec/batch)  |     Accuracy
--------------------------------------------------------------------
1 Tesla K20m  | 0.35-0.60              | ~86% at 60K steps  (5 hours)
1 Tesla K40m  | 0.25-0.35              | ~86% at 100K steps (4 hours)
2 Tesla K20m  | 0.13-0.20              | ~84% at 30K steps  (2.5 hours)
3 Tesla K20m  | 0.13-0.18              | ~84% at 30K steps
4 Tesla K20m  | ~0.10                  | ~84% at 30K steps

Usage:
Please see the tutorial and website for how to download the CIFAR-10
data set, compile the program and train the model.

http://tensorflow.org/tutorials/deep_cnn/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import os.path
import re
import time

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import cifar10

# Shared flag-values object; every flag defined below is read back as an
# attribute of it (e.g. FLAGS.train_dir). FLAGS.batch_size is also read by
# train() but is not defined in this cell -- presumably the imported cifar10
# module registers it (TODO confirm).
FLAGS = tf.app.flags.FLAGS

# Where train() writes event files and checkpoints. NOTE: main() deletes and
# recreates this directory on every run.
tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
# Number of iterations of the training loop in train().
tf.app.flags.DEFINE_integer('max_steps', 1000000,
                            """Number of batches to run.""")
# Number of towers train() builds, one per '/gpu:<i>' device.
tf.app.flags.DEFINE_integer('num_gpus', 2,
                            """How many GPUs to use.""")
# Forwarded unchanged to tf.ConfigProto(log_device_placement=...) in train().
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")


def tower_loss(scope):
  """Build one CIFAR tower and return the sum of the losses it registered.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'.
      Only entries of the 'losses' collection under this scope are summed.

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Input pipeline followed by the forward pass for this tower.
  images, labels = cifar10.distorted_inputs()
  logits = cifar10.inference(images)

  # The return value is deliberately discarded: the tower's individual losses
  # are read back from the 'losses' collection right below.
  cifar10.loss(logits, labels)

  # Restrict to the losses that belong to the current tower and add them up.
  tower_losses = tf.get_collection('losses', scope)
  total_loss = tf.add_n(tower_losses, name='total_loss')

  # Emit one scalar summary per individual loss plus one for the total. The
  # 'tower_[0-9]/' prefix is stripped from the op name so that the same loss
  # carries the same tag regardless of which tower produced it.
  tower_prefix = '%s_[0-9]*/' % cifar10.TOWER_NAME
  for loss_tensor in tower_losses + [total_loss]:
    summary_name = re.sub(tower_prefix, '', loss_tensor.op.name)
    tf.summary.scalar(summary_name, loss_tensor)

  return total_loss


def average_gradients(tower_grads):
  """Calculate the average gradient for each shared variable across all towers.

  Note that this function provides a synchronization point across all towers.

  Args:
    tower_grads: List of lists of (gradient, variable) tuples. The outer list
      is over towers; each inner list holds that tower's (gradient, variable)
      pairs. Every tower must list its variables in the same order, because
      entries are matched by position.
  Returns:
     List of pairs of (gradient, variable), one per variable, where the
     gradient is the mean over the towers that produced one. If no tower
     produced a gradient for a variable (all entries are None), the pair is
     (None, variable) so the caller can skip it. An empty `tower_grads`
     yields an empty list.
  """
  average_grads = []
  # zip(*...) transposes "per tower" into "per variable". Each grad_and_vars
  # then looks like:
  #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
  for grad_and_vars in zip(*tower_grads):
    # The Variables are redundant because they are shared across towers, so
    # the first tower's reference stands in for all of them.
    v = grad_and_vars[0][1]

    # A gradient is None when the loss does not depend on the variable;
    # stacking None would raise, so such entries are left out.
    grads = [g for g, _ in grad_and_vars if g is not None]
    if not grads:
      average_grads.append((None, v))
      continue

    # Stack along a new leading 'tower' dimension and average over it.
    grad = tf.reduce_mean(tf.stack(grads, axis=0), axis=0)
    average_grads.append((grad, v))
  return average_grads


def train():
  """Train CIFAR-10 for a number of steps.

  Builds one model tower per GPU (FLAGS.num_gpus of them) with shared
  variables, averages the towers' gradients, applies them with plain gradient
  descent, and runs the resulting train op FLAGS.max_steps times. Progress is
  printed every 10 steps, summaries are written every 100 steps and a
  checkpoint is saved every 1000 steps and on the last step, all under
  FLAGS.train_dir.

  The session, the queue-runner threads and the summary writer are released
  when the loop ends for any reason, including an interrupt.

  Raises:
    ValueError: if FLAGS.num_gpus is smaller than 1.
    AssertionError: if the fetched loss becomes NaN.
  """
  if FLAGS.num_gpus < 1:
    raise ValueError('num_gpus must be at least 1, got %d' % FLAGS.num_gpus)

  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Counts applied updates: it is incremented once per apply_gradients()
    # call below, i.e. once per synchronous step, and each such step consumes
    # one batch per tower.
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    # Calculate the learning rate schedule. FLAGS.batch_size is not defined in
    # this script -- presumably registered by the cifar10 module (TODO
    # confirm).
    num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                             FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    cifar10.LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.GradientDescentOptimizer(lr)

    # Calculate the gradients for each model tower.
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
      for i in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
            # Calculate the loss for one tower of the CIFAR model. This function
            # constructs the entire CIFAR model but shares the variables across
            # all towers. `loss` is overwritten on every iteration, so the
            # value fetched in the training loop is the last tower's loss.
            loss = tower_loss(scope)

            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()

            # Retain the summaries from the final tower (overwritten on every
            # iteration, like `loss`).
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            # Calculate the gradients for the batch of data on this CIFAR tower.
            grads = opt.compute_gradients(loss)

            # Keep track of the gradients across all towers.
            tower_grads.append(grads)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = average_gradients(tower_grads)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    # Group all updates to into a single train op.
    train_op = tf.group(apply_gradient_op, variables_averages_op)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be set to
    # True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    # The `with` block closes the session even when training is interrupted;
    # in a notebook the kernel outlives the cell, so an unclosed session would
    # keep holding its resources.
    with tf.Session(config=config) as sess:
      sess.run(init)

      # Start the queue runners under a coordinator so they can be stopped.
      coord = tf.train.Coordinator()
      threads = tf.train.start_queue_runners(sess=sess, coord=coord)

      summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

      try:
        for step in xrange(FLAGS.max_steps):
          start_time = time.time()
          _, loss_value = sess.run([train_op, loss])
          duration = time.time() - start_time

          assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

          if step % 10 == 0:
            # One step processes one batch on every tower.
            num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = duration / FLAGS.num_gpus

            format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                          'sec/batch)')
            print(format_str % (datetime.now(), step, loss_value,
                                examples_per_sec, sec_per_batch))

          if step % 100 == 0:
            summary_str = sess.run(summary_op)
            summary_writer.add_summary(summary_str, step)

          # Save the model checkpoint periodically.
          if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=step)
      finally:
        # Flush pending summaries, then stop the input threads while the
        # session they use is still open.
        summary_writer.close()
        coord.request_stop()
        coord.join(threads, stop_grace_period_secs=10)


def main(argv=["--num_gpus=2"]):  # pylint: disable=unused-argument
  """Prepare the data, reset the training directory and run train().

  Args:
    argv: never read by this function. NOTE(review): the default list is
      therefore not what sets the GPU count; train() reads FLAGS.num_gpus.
  """
  cifar10.maybe_download_and_extract()

  # Always start from an empty directory: whatever an earlier run left behind
  # is deleted before the directory is created again.
  train_dir = FLAGS.train_dir
  if tf.gfile.Exists(train_dir):
    tf.gfile.DeleteRecursively(train_dir)
  tf.gfile.MakeDirs(train_dir)

  train()


# Entry point when executed as a script or notebook cell. tf.app.run() is
# expected to parse the flags defined above and then call main() -- see the
# TensorFlow documentation for its exact argument handling.
if __name__ == '__main__':
  tf.app.run()


Filling queue with 20000 CIFAR images before starting to train. This will take a few minutes.
Filling queue with 20000 CIFAR images before starting to train. This will take a few minutes.
2017-05-11 22:15:06.846400: step 0, loss = 4.68 (31.7 examples/sec; 4.040 sec/batch)
2017-05-11 22:15:07.521177: step 10, loss = 4.63 (4433.1 examples/sec; 0.029 sec/batch)
2017-05-11 22:15:08.081749: step 20, loss = 4.48 (4594.3 examples/sec; 0.028 sec/batch)
2017-05-11 22:15:08.649922: step 30, loss = 4.35 (4397.6 examples/sec; 0.029 sec/batch)
2017-05-11 22:15:09.222496: step 40, loss = 4.39 (4073.3 examples/sec; 0.031 sec/batch)
2017-05-11 22:15:09.796777: step 50, loss = 4.28 (4651.2 examples/sec; 0.028 sec/batch)
2017-05-11 22:15:10.366784: step 60, loss = 4.14 (4360.7 examples/sec; 0.029 sec/batch)
2017-05-11 22:15:10.934858: step 70, loss = 4.11 (4366.5 examples/sec; 0.029 sec/batch)
2017-05-11 22:15:11.507624: step 80, loss = 4.15 (4529.8 examples/sec; 0.028 sec/batch)
2017-05-11 22:15:12.069

2017-05-11 22:15:59.232644: step 910, loss = 2.44 (4453.8 examples/sec; 0.029 sec/batch)
2017-05-11 22:15:59.805602: step 920, loss = 2.43 (4771.7 examples/sec; 0.027 sec/batch)
2017-05-11 22:16:00.374070: step 930, loss = 2.73 (4351.5 examples/sec; 0.029 sec/batch)
2017-05-11 22:16:00.939163: step 940, loss = 2.54 (4761.6 examples/sec; 0.027 sec/batch)
2017-05-11 22:16:01.529063: step 950, loss = 2.27 (3918.4 examples/sec; 0.033 sec/batch)
2017-05-11 22:16:02.100565: step 960, loss = 2.38 (4397.8 examples/sec; 0.029 sec/batch)
2017-05-11 22:16:02.673483: step 970, loss = 2.48 (4790.0 examples/sec; 0.027 sec/batch)
2017-05-11 22:16:03.252098: step 980, loss = 2.19 (4081.3 examples/sec; 0.031 sec/batch)
2017-05-11 22:16:03.828570: step 990, loss = 2.72 (4168.1 examples/sec; 0.031 sec/batch)
2017-05-11 22:16:04.404508: step 1000, loss = 2.53 (4629.5 examples/sec; 0.028 sec/batch)
2017-05-11 22:16:05.079745: step 1010, loss = 2.37 (4377.2 examples/sec; 0.029 sec/batch)
2017-05-11 22:16:05

2017-05-11 22:16:52.492129: step 1830, loss = 1.69 (5174.5 examples/sec; 0.025 sec/batch)
2017-05-11 22:16:53.078213: step 1840, loss = 1.70 (3897.9 examples/sec; 0.033 sec/batch)
2017-05-11 22:16:53.648418: step 1850, loss = 1.44 (4157.1 examples/sec; 0.031 sec/batch)
2017-05-11 22:16:54.229532: step 1860, loss = 1.99 (4104.6 examples/sec; 0.031 sec/batch)
2017-05-11 22:16:54.804885: step 1870, loss = 1.83 (4817.3 examples/sec; 0.027 sec/batch)
2017-05-11 22:16:55.373272: step 1880, loss = 1.65 (4280.8 examples/sec; 0.030 sec/batch)
2017-05-11 22:16:55.955595: step 1890, loss = 1.50 (4910.1 examples/sec; 0.026 sec/batch)
2017-05-11 22:16:56.529564: step 1900, loss = 1.58 (4291.0 examples/sec; 0.030 sec/batch)
2017-05-11 22:16:57.152313: step 1910, loss = 1.51 (4087.5 examples/sec; 0.031 sec/batch)
2017-05-11 22:16:57.735921: step 1920, loss = 1.60 (4673.0 examples/sec; 0.027 sec/batch)
2017-05-11 22:16:58.308834: step 1930, loss = 1.43 (4265.7 examples/sec; 0.030 sec/batch)
2017-05-11

2017-05-11 22:17:45.849003: step 2750, loss = 1.57 (4432.4 examples/sec; 0.029 sec/batch)
2017-05-11 22:17:46.419106: step 2760, loss = 1.31 (4668.3 examples/sec; 0.027 sec/batch)
2017-05-11 22:17:47.004313: step 2770, loss = 1.08 (3768.1 examples/sec; 0.034 sec/batch)
2017-05-11 22:17:47.583398: step 2780, loss = 1.26 (3810.1 examples/sec; 0.034 sec/batch)
2017-05-11 22:17:48.144452: step 2790, loss = 1.11 (4449.0 examples/sec; 0.029 sec/batch)
2017-05-11 22:17:48.709741: step 2800, loss = 1.27 (4834.0 examples/sec; 0.026 sec/batch)
2017-05-11 22:17:49.364214: step 2810, loss = 1.21 (4635.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:17:49.949714: step 2820, loss = 1.21 (4371.1 examples/sec; 0.029 sec/batch)
2017-05-11 22:17:50.520806: step 2830, loss = 1.31 (4493.0 examples/sec; 0.028 sec/batch)
2017-05-11 22:17:51.088865: step 2840, loss = 1.15 (4327.4 examples/sec; 0.030 sec/batch)
2017-05-11 22:17:51.648131: step 2850, loss = 1.29 (4829.1 examples/sec; 0.027 sec/batch)
2017-05-11

2017-05-11 22:18:39.167481: step 3670, loss = 1.24 (4539.9 examples/sec; 0.028 sec/batch)
2017-05-11 22:18:39.744247: step 3680, loss = 1.08 (4553.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:18:40.320899: step 3690, loss = 0.96 (4317.4 examples/sec; 0.030 sec/batch)
2017-05-11 22:18:40.902439: step 3700, loss = 1.07 (4888.0 examples/sec; 0.026 sec/batch)
2017-05-11 22:18:41.548450: step 3710, loss = 1.00 (4272.1 examples/sec; 0.030 sec/batch)
2017-05-11 22:18:42.129007: step 3720, loss = 1.05 (4372.2 examples/sec; 0.029 sec/batch)
2017-05-11 22:18:42.696340: step 3730, loss = 0.86 (4391.9 examples/sec; 0.029 sec/batch)
2017-05-11 22:18:43.277970: step 3740, loss = 1.04 (4545.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:18:43.850838: step 3750, loss = 0.95 (4965.5 examples/sec; 0.026 sec/batch)
2017-05-11 22:18:44.417849: step 3760, loss = 0.95 (5002.7 examples/sec; 0.026 sec/batch)
2017-05-11 22:18:44.983383: step 3770, loss = 1.05 (4470.8 examples/sec; 0.029 sec/batch)
2017-05-11

2017-05-11 22:19:32.536339: step 4590, loss = 1.04 (4463.0 examples/sec; 0.029 sec/batch)
2017-05-11 22:19:33.118420: step 4600, loss = 0.89 (4190.9 examples/sec; 0.031 sec/batch)
2017-05-11 22:19:33.725212: step 4610, loss = 0.86 (4102.5 examples/sec; 0.031 sec/batch)
2017-05-11 22:19:34.296137: step 4620, loss = 0.81 (4213.7 examples/sec; 0.030 sec/batch)
2017-05-11 22:19:34.883450: step 4630, loss = 0.98 (4638.1 examples/sec; 0.028 sec/batch)
2017-05-11 22:19:35.454891: step 4640, loss = 0.91 (4038.2 examples/sec; 0.032 sec/batch)
2017-05-11 22:19:36.025417: step 4650, loss = 1.06 (4450.8 examples/sec; 0.029 sec/batch)
2017-05-11 22:19:36.596804: step 4660, loss = 0.96 (4551.2 examples/sec; 0.028 sec/batch)
2017-05-11 22:19:37.170385: step 4670, loss = 0.93 (4485.5 examples/sec; 0.029 sec/batch)
2017-05-11 22:19:37.745374: step 4680, loss = 0.91 (4403.3 examples/sec; 0.029 sec/batch)
2017-05-11 22:19:38.312269: step 4690, loss = 0.76 (4634.5 examples/sec; 0.028 sec/batch)
2017-05-11

2017-05-11 22:20:25.791242: step 5510, loss = 0.98 (4330.1 examples/sec; 0.030 sec/batch)
2017-05-11 22:20:26.353487: step 5520, loss = 1.00 (4336.1 examples/sec; 0.030 sec/batch)
2017-05-11 22:20:26.922947: step 5530, loss = 0.74 (4316.3 examples/sec; 0.030 sec/batch)
2017-05-11 22:20:27.488913: step 5540, loss = 0.81 (4534.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:20:28.060935: step 5550, loss = 0.92 (4740.1 examples/sec; 0.027 sec/batch)
2017-05-11 22:20:28.622064: step 5560, loss = 0.82 (4704.2 examples/sec; 0.027 sec/batch)
2017-05-11 22:20:29.204671: step 5570, loss = 0.84 (4195.4 examples/sec; 0.031 sec/batch)
2017-05-11 22:20:29.778097: step 5580, loss = 0.81 (4295.3 examples/sec; 0.030 sec/batch)
2017-05-11 22:20:30.359651: step 5590, loss = 0.83 (4649.3 examples/sec; 0.028 sec/batch)
2017-05-11 22:20:30.930755: step 5600, loss = 0.91 (4390.1 examples/sec; 0.029 sec/batch)
2017-05-11 22:20:31.571691: step 5610, loss = 0.81 (4421.5 examples/sec; 0.029 sec/batch)
2017-05-11

2017-05-11 22:21:18.855787: step 6430, loss = 0.86 (4327.5 examples/sec; 0.030 sec/batch)
2017-05-11 22:21:19.440230: step 6440, loss = 0.73 (4347.0 examples/sec; 0.029 sec/batch)
2017-05-11 22:21:20.004287: step 6450, loss = 0.99 (4463.9 examples/sec; 0.029 sec/batch)
2017-05-11 22:21:20.582309: step 6460, loss = 0.76 (4036.9 examples/sec; 0.032 sec/batch)
2017-05-11 22:21:21.161133: step 6470, loss = 0.85 (4379.5 examples/sec; 0.029 sec/batch)
2017-05-11 22:21:21.735613: step 6480, loss = 0.95 (4435.8 examples/sec; 0.029 sec/batch)
2017-05-11 22:21:22.314111: step 6490, loss = 1.07 (4238.3 examples/sec; 0.030 sec/batch)
2017-05-11 22:21:22.889324: step 6500, loss = 0.94 (4435.6 examples/sec; 0.029 sec/batch)
2017-05-11 22:21:23.537371: step 6510, loss = 0.80 (4397.6 examples/sec; 0.029 sec/batch)
2017-05-11 22:21:24.112167: step 6520, loss = 0.80 (4556.0 examples/sec; 0.028 sec/batch)
2017-05-11 22:21:24.680272: step 6530, loss = 0.74 (5019.8 examples/sec; 0.025 sec/batch)
2017-05-11

2017-05-11 22:22:11.998073: step 7350, loss = 0.69 (4417.2 examples/sec; 0.029 sec/batch)
2017-05-11 22:22:12.572310: step 7360, loss = 0.84 (4634.2 examples/sec; 0.028 sec/batch)
2017-05-11 22:22:13.143786: step 7370, loss = 0.71 (4446.9 examples/sec; 0.029 sec/batch)
2017-05-11 22:22:13.704015: step 7380, loss = 0.94 (4751.4 examples/sec; 0.027 sec/batch)
2017-05-11 22:22:14.278106: step 7390, loss = 0.86 (4303.8 examples/sec; 0.030 sec/batch)
2017-05-11 22:22:14.847684: step 7400, loss = 0.84 (4543.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:22:15.483778: step 7410, loss = 0.84 (4192.5 examples/sec; 0.031 sec/batch)
2017-05-11 22:22:16.051636: step 7420, loss = 0.76 (4564.6 examples/sec; 0.028 sec/batch)
2017-05-11 22:22:16.624542: step 7430, loss = 0.94 (4937.6 examples/sec; 0.026 sec/batch)
2017-05-11 22:22:17.194871: step 7440, loss = 0.63 (4369.7 examples/sec; 0.029 sec/batch)
2017-05-11 22:22:17.771639: step 7450, loss = 0.90 (4391.9 examples/sec; 0.029 sec/batch)
2017-05-11

2017-05-11 22:23:05.271594: step 8270, loss = 0.77 (4653.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:23:05.852951: step 8280, loss = 0.83 (4606.2 examples/sec; 0.028 sec/batch)
2017-05-11 22:23:06.423387: step 8290, loss = 0.73 (4152.6 examples/sec; 0.031 sec/batch)
2017-05-11 22:23:06.999096: step 8300, loss = 0.87 (4042.5 examples/sec; 0.032 sec/batch)
2017-05-11 22:23:07.629426: step 8310, loss = 0.83 (3905.6 examples/sec; 0.033 sec/batch)
2017-05-11 22:23:08.197606: step 8320, loss = 0.74 (4211.3 examples/sec; 0.030 sec/batch)
2017-05-11 22:23:08.759340: step 8330, loss = 0.66 (4651.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:23:09.339502: step 8340, loss = 0.77 (4215.0 examples/sec; 0.030 sec/batch)
2017-05-11 22:23:09.904625: step 8350, loss = 0.94 (4953.3 examples/sec; 0.026 sec/batch)
2017-05-11 22:23:10.497652: step 8360, loss = 0.94 (4465.7 examples/sec; 0.029 sec/batch)
2017-05-11 22:23:11.063760: step 8370, loss = 0.81 (4181.0 examples/sec; 0.031 sec/batch)
2017-05-11

2017-05-11 22:23:58.339654: step 9190, loss = 0.90 (4583.0 examples/sec; 0.028 sec/batch)
2017-05-11 22:23:58.915926: step 9200, loss = 0.84 (4153.9 examples/sec; 0.031 sec/batch)
2017-05-11 22:23:59.557460: step 9210, loss = 0.82 (4331.4 examples/sec; 0.030 sec/batch)
2017-05-11 22:24:00.121909: step 9220, loss = 0.83 (4876.3 examples/sec; 0.026 sec/batch)
2017-05-11 22:24:00.691143: step 9230, loss = 0.81 (4826.9 examples/sec; 0.027 sec/batch)
2017-05-11 22:24:01.275716: step 9240, loss = 0.72 (4415.5 examples/sec; 0.029 sec/batch)
2017-05-11 22:24:01.859803: step 9250, loss = 0.77 (4512.8 examples/sec; 0.028 sec/batch)
2017-05-11 22:24:02.423825: step 9260, loss = 0.79 (4834.7 examples/sec; 0.026 sec/batch)
2017-05-11 22:24:02.997932: step 9270, loss = 0.68 (4653.2 examples/sec; 0.028 sec/batch)
2017-05-11 22:24:03.574404: step 9280, loss = 0.85 (4629.9 examples/sec; 0.028 sec/batch)
2017-05-11 22:24:04.148371: step 9290, loss = 0.68 (4738.3 examples/sec; 0.027 sec/batch)
2017-05-11

2017-05-11 22:24:50.825903: step 10100, loss = 0.83 (4284.2 examples/sec; 0.030 sec/batch)
2017-05-11 22:24:51.450390: step 10110, loss = 0.80 (4549.8 examples/sec; 0.028 sec/batch)
2017-05-11 22:24:52.021302: step 10120, loss = 0.91 (4454.4 examples/sec; 0.029 sec/batch)
2017-05-11 22:24:52.596363: step 10130, loss = 0.73 (4473.4 examples/sec; 0.029 sec/batch)
2017-05-11 22:24:53.165487: step 10140, loss = 0.68 (4820.2 examples/sec; 0.027 sec/batch)
2017-05-11 22:24:53.745282: step 10150, loss = 0.67 (3997.5 examples/sec; 0.032 sec/batch)
2017-05-11 22:24:54.329625: step 10160, loss = 0.97 (4375.8 examples/sec; 0.029 sec/batch)
2017-05-11 22:24:54.900574: step 10170, loss = 0.76 (4477.5 examples/sec; 0.029 sec/batch)
2017-05-11 22:24:55.456532: step 10180, loss = 0.66 (4171.6 examples/sec; 0.031 sec/batch)
2017-05-11 22:24:56.010188: step 10190, loss = 0.78 (4570.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:24:56.582527: step 10200, loss = 0.79 (4820.8 examples/sec; 0.027 sec/batch)

2017-05-11 22:25:43.520688: step 11010, loss = 0.76 (4294.4 examples/sec; 0.030 sec/batch)
2017-05-11 22:25:44.088003: step 11020, loss = 0.70 (4533.4 examples/sec; 0.028 sec/batch)
2017-05-11 22:25:44.662740: step 11030, loss = 0.74 (4103.0 examples/sec; 0.031 sec/batch)
2017-05-11 22:25:45.223462: step 11040, loss = 0.71 (4524.7 examples/sec; 0.028 sec/batch)
2017-05-11 22:25:45.787898: step 11050, loss = 0.77 (4595.8 examples/sec; 0.028 sec/batch)
2017-05-11 22:25:46.355994: step 11060, loss = 0.85 (4771.7 examples/sec; 0.027 sec/batch)
2017-05-11 22:25:46.937248: step 11070, loss = 0.83 (4500.8 examples/sec; 0.028 sec/batch)
2017-05-11 22:25:47.511242: step 11080, loss = 0.74 (4529.4 examples/sec; 0.028 sec/batch)
2017-05-11 22:25:48.076815: step 11090, loss = 0.84 (4738.1 examples/sec; 0.027 sec/batch)
2017-05-11 22:25:48.643847: step 11100, loss = 0.67 (4725.8 examples/sec; 0.027 sec/batch)
2017-05-11 22:25:49.270960: step 11110, loss = 0.81 (4188.4 examples/sec; 0.031 sec/batch)

2017-05-11 22:26:35.999336: step 11920, loss = 0.72 (4692.1 examples/sec; 0.027 sec/batch)
2017-05-11 22:26:36.582902: step 11930, loss = 0.72 (4051.8 examples/sec; 0.032 sec/batch)
2017-05-11 22:26:37.155279: step 11940, loss = 0.72 (4229.9 examples/sec; 0.030 sec/batch)
2017-05-11 22:26:37.708444: step 11950, loss = 0.62 (4884.8 examples/sec; 0.026 sec/batch)
2017-05-11 22:26:38.283583: step 11960, loss = 0.77 (4537.0 examples/sec; 0.028 sec/batch)
2017-05-11 22:26:38.852526: step 11970, loss = 0.67 (4696.0 examples/sec; 0.027 sec/batch)
2017-05-11 22:26:39.421427: step 11980, loss = 0.95 (4547.9 examples/sec; 0.028 sec/batch)
2017-05-11 22:26:39.989054: step 11990, loss = 0.77 (4438.3 examples/sec; 0.029 sec/batch)
2017-05-11 22:26:40.543586: step 12000, loss = 0.79 (4727.6 examples/sec; 0.027 sec/batch)
2017-05-11 22:26:41.202956: step 12010, loss = 0.80 (4725.5 examples/sec; 0.027 sec/batch)
2017-05-11 22:26:41.774231: step 12020, loss = 0.80 (4280.7 examples/sec; 0.030 sec/batch)

KeyboardInterrupt: 

In [None]:
# %load cifar10_eval
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Evaluation for CIFAR-10.

Accuracy:
cifar10_train.py achieves 83.0% accuracy after 100K steps (256 epochs
of data) as judged by cifar10_eval.py.

Speed:
On a single Tesla K40, cifar10_train.py processes a single batch of 128 images
in 0.25-0.35 sec (i.e. 350 - 600 images /sec). The model reaches ~86%
accuracy after 100K steps in 8 hours of training time.

Usage:
Please see the tutorial and website for how to download the CIFAR-10
data set, compile the program and train the model.

http://tensorflow.org/tutorials/deep_cnn/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import math
import time

import numpy as np
import tensorflow as tf

import cifar10

# Shared flag-values object; every flag defined below is read back as an
# attribute of it. FLAGS.batch_size is also read by eval_once() but is not
# defined in this cell -- presumably the imported cifar10 module registers it
# (TODO confirm).
FLAGS = tf.app.flags.FLAGS

# Where evaluate() writes its event files. NOTE: main() deletes and recreates
# this directory on every run.
tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval',
                           """Directory where to write event logs.""")
# evaluate() passes eval_data=True to cifar10.inputs() only when this is
# exactly 'test'.
tf.app.flags.DEFINE_string('eval_data', 'test',
                           """Either 'test' or 'train_eval'.""")
# Default matches the default 'train_dir' of the training script.
tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
                           """Directory where to read model checkpoints.""")
# Seconds evaluate() sleeps between two evaluations (default 5 minutes).
tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                            """How often to run the eval.""")
# eval_once() runs ceil(num_examples / batch_size) batches.
tf.app.flags.DEFINE_integer('num_examples', 10000,
                            """Number of examples to run.""")
# When True, evaluate() stops after the first evaluation instead of looping.
tf.app.flags.DEFINE_boolean('run_once', False,
                         """Whether to run eval only once.""")


def eval_once(saver, summary_writer, top_k_op, summary_op):
  """Restore the latest checkpoint and report precision @ 1 for it.

  Prints 'No checkpoint file found' and returns without evaluating when
  FLAGS.checkpoint_dir holds no checkpoint.

  Args:
    saver: Saver used to restore the variables from the checkpoint.
    summary_writer: Summary writer that receives the evaluation summary.
    top_k_op: Top K op; it is run once per batch and its results are summed
      to count correct predictions.
    summary_op: Summary op, run once after all batches.
  """
  with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
      print('No checkpoint file found')
      return

    # Restores from checkpoint
    checkpoint_path = ckpt.model_checkpoint_path
    saver.restore(sess, checkpoint_path)
    # The path is assumed to look something like
    #   /my-favorite-path/cifar10_train/model.ckpt-0,
    # so the text after the last '-' of the file name is the global step.
    global_step = checkpoint_path.split('/')[-1].split('-')[-1]

    # Start the queue runners.
    coord = tf.train.Coordinator()
    threads = []
    try:
      for runner in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
        threads += runner.create_threads(sess, coord=coord, daemon=True,
                                         start=True)

      batch_size = FLAGS.batch_size
      num_iter = int(math.ceil(FLAGS.num_examples / batch_size))

      # Counts the number of correct predictions.
      true_count = 0
      for _ in range(num_iter):
        if coord.should_stop():
          break
        true_count += np.sum(sess.run([top_k_op]))

      # Compute precision @ 1. The denominator is the number of examples in
      # num_iter full batches.
      precision = true_count / (num_iter * batch_size)
      print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

      summary = tf.Summary()
      summary.ParseFromString(sess.run(summary_op))
      summary.value.add(tag='Precision @ 1', simple_value=precision)
      summary_writer.add_summary(summary, global_step)
    except Exception as err:  # pylint: disable=broad-except
      # Hand the error to the coordinator instead of raising it here.
      coord.request_stop(err)

    coord.request_stop()
    coord.join(threads, stop_grace_period_secs=10)


def evaluate():
  """Eval CIFAR-10 for a number of steps.

  Builds the inference graph once, then calls eval_once() repeatedly,
  sleeping FLAGS.eval_interval_secs between calls. The loop ends after the
  first call when FLAGS.run_once is set; otherwise it runs until interrupted.
  The summary writer is closed when the loop ends for any reason, so buffered
  events are written out.
  """
  with tf.Graph().as_default() as g:
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels = cifar10.inputs(eval_data=eval_data)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate predictions.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()

    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

    try:
      while True:
        eval_once(saver, summary_writer, top_k_op, summary_op)
        if FLAGS.run_once:
          break
        time.sleep(FLAGS.eval_interval_secs)
    finally:
      # Without this, events still buffered by the writer could be lost when
      # the process exits right after a run_once evaluation or an interrupt.
      summary_writer.close()


def main(argv=None):  # pylint: disable=unused-argument
  """Prepare the data, reset the evaluation directory and run evaluate().

  Args:
    argv: never read by this function.
  """
  cifar10.maybe_download_and_extract()

  # Always start from an empty directory: whatever an earlier run left behind
  # is deleted before the directory is created again.
  eval_dir = FLAGS.eval_dir
  if tf.gfile.Exists(eval_dir):
    tf.gfile.DeleteRecursively(eval_dir)
  tf.gfile.MakeDirs(eval_dir)

  evaluate()


# Entry point when executed as a script or notebook cell. tf.app.run() is
# expected to parse the flags defined above and then call main() -- see the
# TensorFlow documentation for its exact argument handling.
if __name__ == '__main__':
  tf.app.run()


INFO:tensorflow:Restoring parameters from /tmp/cifar10_train/model.ckpt-12000
2017-05-11 22:28:09.182949: precision @ 1 = 0.836


Result: ~84% accuracy (precision @ 1 = 0.836) after 12,000 training steps, which took roughly 12 minutes on the two GPUs.
Pretty fast!