In [1]:
import numpy as np
import tensorflow as tf
import os
#from riptide.models.cifar_resnet import cifar_resnet20_v1
#from riptide.binary.q_cifar_resnet import cifar_resnet20_v1
from riptide.models.resnetv1b import resnet18_v1b
from riptide.binary.HWGQ_funcs import Quantize, HWGQuantize, load_clusters, load_bits
from riptide.binary.HWGQ_layers import Config
from slim.preprocessing.inception_preprocessing import preprocess_image
from riptide.utils.datasets import imagerecord_dataset
from functools import partial
from official.resnet import resnet_run_loop
import multiprocessing

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
batch_size=128
nproc = multiprocessing.cpu_count()

train_preprocess = partial(preprocess_image, height=224, width=224, is_training=True)
def train_input_fn():
    return imagerecord_dataset(batch_size, is_training=True, preprocess=train_preprocess, num_workers=nproc)

val_preprocess = partial(preprocess_image, height=224, width=224, is_training=False)

def eval_input_fn():
    return imagerecord_dataset(batch_size, is_training=False, preprocess=val_preprocess, num_workers=nproc)

In [4]:
def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    #clusters = tf.constant(load_clusters(2))
    #actQ = HWGQuantize
    #weightQ = Quantize
    #config = Config(actQ=actQ, weightQ=weightQ, clusters=clusters)
    #with config:
    #    model = ()
    # Generate summary for the input images.
    tf.summary.image('images', features, max_outputs=6)
    model = resnet18_v1b()
    logits = model(features)
    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Add weight decay
    weight_decay = 1e-4
    def exclude_batch_norm(name):
        return 'batch_normalization' not in name
    l2_loss = weight_decay * tf.add_n(
        # loss is computed using fp32 for numerical stability.
        [tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()
        if exclude_batch_norm(v.name)])
    loss = cross_entropy + l2_loss 
    
    # Log the model loss
    #tf.identity(cross_entropy, name='cross_entropy')
    #tf.identity(l2_loss, name='l2_loss')
    tf.summary.scalar('l2_loss', l2_loss)
    tf.summary.scalar('cross_entropy', cross_entropy)

    # Create metrics for training accuracy.
    accuracy = tf.metrics.accuracy(labels, predictions['classes'])
    accuracy_top_5 = tf.metrics.mean(tf.nn.in_top_k(predictions=logits,
                                                    targets=tf.reshape(labels, [-1]),
                                                    k=5,
                                                    name='top_5_op'))
    metrics = {'accuracy': accuracy,
               'accuracy_top_5': accuracy_top_5}
    
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        #optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        #lr = tf.train.exponential_decay(0.01, tf.train.get_global_step(), 20000, 0.1)
        global_step = tf.train.get_or_create_global_step()

        learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
            batch_size=batch_size, batch_denom=256, num_images=1281167,
            boundary_epochs=[30, 60, 80, 90], decay_rates=[1, 0.1, 0.01, 0.001, 1e-4],
            warmup=True, base_lr=.128)
        learning_rate = learning_rate_fn(global_step)
        # Create learning rate tensor for logging.
        #tf.identity(learning_rate, name='learning_rate')
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate = learning_rate, momentum=0.9)
        #optimizer = tf.train.AdamOptimizer(learning_rate = lr)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=global_step)                

        # Create a tensor named train_accuracy for logging purposes
        #tf.identity(accuracy[1], name='train_accuracy')
        #tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
        tf.summary.scalar('train_accuracy', accuracy[1])
        tf.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])
        
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=metrics)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metrics)

In [6]:
# Create the Estimator
#strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=3)
#run_config = tf.estimator.RunConfig(train_distribute=strategy)
# Set up runconfig to do less logging for hopefully better performance.
run_config = tf.estimator.RunConfig(
    save_summary_steps=2000, 
    log_step_count_steps=500, 
    save_checkpoints_secs=3600)
mnist_classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn, 
    model_dir="/data/jwfromm/models/resnet18_run3",
    config=run_config)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=None)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)

INFO:tensorflow:Using config: {'_save_checkpoints_secs': 3600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f788747f050>, '_model_dir': '/data/jwfromm/models/resnet18_run3', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_tf_random_seed': None, '_save_summary_steps': 2000, '_device_fn': None, '_experimental_distribute': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 500, '_evaluation_master': '', '_eval_distribute': None, '_train_distribute': None, '_master': ''}


I1030 23:04:04.706338 140155764057920 tf_logging.py:115] Using config: {'_save_checkpoints_secs': 3600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f788747f050>, '_model_dir': '/data/jwfromm/models/resnet18_run3', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_tf_random_seed': None, '_save_summary_steps': 2000, '_device_fn': None, '_experimental_distribute': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 500, '_evaluation_master': '', '_eval_distribute': None, '_train_distribute': None, '_master': ''}


In [7]:
tf.estimator.train_and_evaluate(mnist_classifier, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).


I1030 23:04:05.109523 140155764057920 tf_logging.py:115] Running training and evaluation locally (non-distributed).


INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 3600.


I1030 23:04:05.116319 140155764057920 tf_logging.py:115] Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 3600.


INFO:tensorflow:Calling model_fn.


I1030 23:04:05.319539 140155764057920 tf_logging.py:115] Calling model_fn.


INFO:tensorflow:Done calling model_fn.


I1030 23:04:08.903088 140155764057920 tf_logging.py:115] Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


I1030 23:04:08.907370 140155764057920 tf_logging.py:115] Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


I1030 23:04:09.830441 140155764057920 tf_logging.py:115] Graph was finalized.


INFO:tensorflow:Running local_init_op.


I1030 23:04:10.973629 140155764057920 tf_logging.py:115] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I1030 23:04:11.016076 140155764057920 tf_logging.py:115] Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 0 into /data/jwfromm/models/resnet18_run3/model.ckpt.


I1030 23:04:12.906497 140155764057920 tf_logging.py:115] Saving checkpoints for 0 into /data/jwfromm/models/resnet18_run3/model.ckpt.


INFO:tensorflow:loss = 7.155327, step = 0


I1030 23:04:19.934381 140155764057920 tf_logging.py:115] loss = 7.155327, step = 0


INFO:tensorflow:global_step/sec: 2.49608


I1030 23:07:40.248045 140155764057920 tf_logging.py:115] global_step/sec: 2.49608


INFO:tensorflow:loss = 7.147593, step = 500 (200.317 sec)


I1030 23:07:40.251480 140155764057920 tf_logging.py:115] loss = 7.147593, step = 500 (200.317 sec)


INFO:tensorflow:global_step/sec: 2.55063


I1030 23:10:56.277751 140155764057920 tf_logging.py:115] global_step/sec: 2.55063


INFO:tensorflow:loss = 7.144921, step = 1000 (196.030 sec)


I1030 23:10:56.281236 140155764057920 tf_logging.py:115] loss = 7.144921, step = 1000 (196.030 sec)


KeyboardInterrupt: 

In [None]:
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)