In [1]:
import tensorflow as tf
import numpy as np
import tensorflow.keras.layers as nn
import os
from slim.preprocessing.cifarnet_preprocessing import preprocess_image

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
@tf.custom_gradient
def AlphaClip(x, alpha):
    output = tf.clip_by_value(x, 0, alpha)

    def grad_fn(dy):
        x_grad_mask = tf.cast(tf.logical_and(x >= 0, x <= alpha), tf.float32)
        alpha_grad_mask = tf.cast(x >= alpha, tf.float32)
        alpha_grad = tf.reduce_sum(dy * alpha_grad_mask)
        x_grad = dy * x_grad_mask
        
        return [x_grad, alpha_grad]

    return output, grad_fn

@tf.custom_gradient
def AlphaQuantize(x, alpha, bits):
    output = tf.round(x * ((2**bits - 1) / alpha)) * (alpha / (2**bits - 1))
    
    def grad_fn(dy):
        return [dy, None, None]
    
    return output, grad_fn

class PACT(tf.keras.layers.Layer):
    def __init__(self, quantize=False, bits=2.):
        super(PACT, self).__init__()      
        self.quantize = quantize
        self.bits = bits
        
    def build(self, input_shape):
        self.alpha = self.add_variable(
            'alpha', shape=[], 
            initializer=tf.keras.initializers.Constant([10.], dtype=tf.float32),
            #regularizer=tf.keras.regularizers.l2(0.01))
            regularizer = tf.keras.regularizers.l2(0.0002))
        
    def call(self, inputs):
        outputs = AlphaClip(inputs, self.alpha)
        if self.quantize:
            with tf.name_scope('QA'):
                outputs = AlphaQuantize(outputs, self.alpha, self.bits)
                tf.summary.histogram('activation', inputs)
                tf.summary.histogram('quantized_activation', outputs)
        return outputs
    
    def get_config(self):
        return {'quantize': self.quantize, 'bits': self.bits}
    
    def compute_output_shape(self, input_shape):
        return input_shape

In [3]:
def get_sawb_coefficients(bits):
    bits = int(bits)
    assert bits <= 4, "Currently only supports bitwidths up to 4."
    coefficient_dict = {1: [0., 1.], 2: [3.19, -2.14], 3: [7.40, -6.66], 4: [11.86, -11.68]}
    return coefficient_dict[bits]

@tf.custom_gradient
def SAWBQuantize(x, alpha, bits):
    # Clip between -alpha and alpha
    clipped = tf.clip_by_value(x, -alpha, alpha)
    # Rescale to [0, alpha]
    scaled = (clipped + alpha) / 2.
    # Quantize.
    quantized = tf.round(scaled * ((2**bits - 1) / alpha)) * (alpha / (2**bits - 1))
    # Rescale to negative range.
    output = (2 * quantized) - alpha
    
    def grad_fn(dy):
        return [dy, None, None]
    return output, grad_fn

class SAWBConv2D(tf.keras.layers.Conv2D):
    def __init__(self, *args, bits=2., **kwargs):
        super(SAWBConv2D, self).__init__(*args, **kwargs)
        self.bits = float(bits)
        self.c1, self.c2 = get_sawb_coefficients(bits)
        self.alpha = None
        
    def call(self, inputs):
        # Compute proper scale for our weights.
        alpha = self.c1 * tf.sqrt(tf.reduce_mean(self.kernel**2)) + self.c2 * tf.reduce_mean(tf.abs(self.kernel))
        self.alpha = alpha
        # Quantize kernel
        with tf.name_scope("QW"):
            q_kernel = SAWBQuantize(self.kernel, alpha, self.bits)
            tf.summary.histogram("weight", self.kernel)
            tf.summary.histogram("quantized_weight", q_kernel)
        
        # Invoke convolution
        outputs = self._convolution_op(inputs, q_kernel)
        
        if self.use_bias:
            if self.data_format == 'channels_first':
                outputs = tf.nn.bias_add(
                    outputs, self.bias, data_format='NCHW')
            else:
                outputs = tf.nn.bias_add(
                    outputs, self.bias, data_format='NHWC')

        if self.activation is not None:
            outputs = self.activation(outputs)

        return outputs

In [4]:
def preprocess(image, label):
    image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
    image = tf.random_crop(image, [32, 32, 3])
    image = tf.image.random_flip_left_right(image)
    return image, label

In [5]:
batch_size = 128
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.cifar10.load_data()

train_labels = train_labels.astype(np.int32)
test_labels = test_labels.astype(np.int32)

def train_transform(data, label):
    data = preprocess_image(data, 32, 32, is_training=True)
    return data, label

def test_transform(data, label):
    data = preprocess_image(data, 32, 32, is_training=False)
    return data, label

def train_input_fn():
    ds = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
    ds = ds.prefetch(batch_size)
    ds = ds.shuffle(10000)
    ds = ds.repeat()
    ds = ds.map(train_transform, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)
    return ds

def test_input_fn():
    ds = tf.data.Dataset.from_tensor_slices((test_data, test_labels))
    ds = ds.map(test_transform, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.repeat(1)
    return ds

In [6]:
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}

class VGG(tf.keras.models.Model):
    def __init__(self, name, *args, **kwargs):
        super(VGG, self).__init__(*args, **kwargs)
        self.reg = tf.keras.regularizers.l2(0.0002)
        self.features = self._make_layers(cfg[name])
        self.flatten = nn.Flatten()
        self.classifier = nn.Dense(10, activation=None, kernel_regularizer=self.reg)
    
    def call(self, inputs, training=True):
        features = self.features(inputs, training=training)
        features = self.flatten(features)
        output = self.classifier(features)
        
        return output
    
    def _make_layers(self, cfg):
        layers = [nn.Conv2D(cfg[0], kernel_size=3, padding='same', kernel_regularizer=self.reg), nn.BatchNormalization(), nn.Activation('relu')]#PACT(quantize=True)]
        for x in cfg[1:]:
            if x == 'M':
                layers += [nn.MaxPool2D(pool_size=2, strides=2)]
            else:
                layers += [nn.Conv2D(x, kernel_size=3, padding='same'),
                           #SAWBConv2D(x, kernel_size=3, padding='same', kernel_regularizer=self.reg),
                           nn.BatchNormalization(),
                           #PACT(quantize=True)]
                           nn.Activation('relu')]
        layers += [nn.GlobalAveragePooling2D()]
        
        return tf.keras.models.Sequential(layers)

In [7]:
def model_fn(features, labels, mode):
    tf.summary.image('images', features, max_outputs=4)
    model = VGG('VGG11')
    
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    
    logits = model(features, training=is_training)
    
    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    
    # Cacluate loss for train and eval.
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    reg_losses = model.get_losses_for(None) + model.get_losses_for(features)
    loss = cross_entropy
    if reg_losses:
        loss += tf.math.add_n(reg_losses)
    # Log losses.
    tf.summary.scalar('loss', loss)
    
    # Compute training metrics.
    accuracy = tf.metrics.accuracy(labels, predictions['classes'])
    metrics = {'accuracy': accuracy}
    
    # Set up eval mode spec.
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metrics)
    
    # Otherwise we're doing training.
    global_step = tf.train.get_or_create_global_step()
    #optimizer = tf.train.AdamOptimizer()
    learning_rate = tf.train.exponential_decay(0.1, global_step, 60*NUM_STEPS, 0.1, staircase=True)
    tf.summary.scalar('lr', learning_rate)
    optimizer = tf .train.MomentumOptimizer(learning_rate, 0.9)
    update_ops = model.get_updates_for(features) + model.get_updates_for(None)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss=loss, var_list=model.trainable_variables, global_step=global_step)
    # Keep track of training accuracy.
    tf.summary.scalar('train_accuracy', accuracy[1])
    
    # Log alpha histograms.
    for var in model.trainable_variables:
        if 'alpha' in var.name:
            tf.summary.histogram(var.name, var)
    
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, train_op=train_op, eval_metric_ops=metrics)

In [8]:
experiment_name = 'vgg_baseline_full'
model_path = os.path.join('/data', 'jwfromm', 'cifar_models', experiment_name)

classifier = tf.estimator.Estimator(
    model_fn=model_fn, model_dir=model_path)

NUM_STEPS = np.ceil(len(train_data)/batch_size)
EPOCHS = 200

train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn, max_steps=NUM_STEPS * EPOCHS)
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/data/jwfromm/cifar_models/vgg_baseline_full', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f2f190e6048>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [9]:
tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)

INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /data/jwfromm/cifar_models/vgg_baseline_full/model.ckpt.
INFO:tensorflow:loss = 3.1842318, step = 0
INFO:tensorflow:global_step/sec: 33.9306
INFO:tensorflow:loss = 2.4029074, step = 100 (2.948 sec)
INFO:tensorflow:global_step/sec: 48.6628
INFO:tensorflow:loss = 2.373904, step = 200 (2.055 sec)
INFO:tensorflow:global_step/sec: 48.9751
INFO:tensorflow:loss = 2.3737335, ste

({'accuracy': 0.8874, 'loss': 0.5493577, 'global_step': 78200}, [])