In [0]:
#!pip install imgaug

In [0]:
import os

# Working directory for this competition. The `output` sub-directory is created
# with the standard library instead of a `!mkdir -p` shell escape, so the cell is
# plain Python and the path is spelled only once.
WORK_DIR = '/content/competitions/human-protein-atlas-image-classification'
os.makedirs(os.path.join(WORK_DIR, 'output'), exist_ok=True)
os.chdir(WORK_DIR)


In [0]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf
import time

# Stop immediately if the environment does not expose a TPU address.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
# gRPC endpoint of the TPU worker; reused later to build the cluster resolver.
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Interactive Google authentication.
# NOTE(review): presumably this is what writes /content/adc.json, which is read
# below -- confirm.
from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.57.182.66:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 7851734508588266701),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 15193622113206846522),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_GPU:0, XLA_GPU, 17179869184, 1534203157942202998),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 4526036444191021926),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 16011821613375590284),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 13764882465600603963),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 3261906436674479014),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 10862596430537869516),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 7726

In [0]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.io
from skimage.transform import resize
#from imgaug import augmenters as iaa

import PIL
from PIL import Image
import cv2
from sklearn.utils import class_weight, shuffle
import warnings
# Suppress every Python warning from here on.
warnings.filterwarnings("ignore")
# NOTE(review): SIZE, SEED and THRESHOLD are not referenced by any other cell
# visible in this notebook (the model hard-codes 0.2 as its decision threshold
# and nothing seeds the RNGs) -- confirm before relying on them.
SIZE = 299
SEED = 777
THRESHOLD = 0.2

In [0]:
%matplotlib inline

## GCS access helpers ##
Courtesy of https://stackoverflow.com/a/52106361/7724174

These functions let us get data from GCS into our notebook.

In [0]:
import pandas as pd
# Load dataset info
#DIR = '../input/'
#DIR='gs://human-protein-atlas-kaggle/'
#data = dd.read_csv(DIR+'train.csv')
#data = data.compute()

from tensorflow.python.lib.io import file_io

# Root of the GCS bucket that holds the competition data; other cells append
# sub-paths to it, so the trailing slash matters.
DATA_DIR = 'gs://human-protein-atlas-kaggle/'

# Open the label CSV through TensorFlow's file API and let pandas parse the
# resulting file object.
train_csv_path = DATA_DIR + 'train.csv'
with file_io.FileIO(train_csv_path, 'r') as csv_handle:
    data = pd.read_csv(csv_handle)

In [0]:
# Network input shape (height, width, channels).
SHAPE = (299, 299, 3)
# Length of the label vector parsed from each TFRecord and of the model output.
NUM_CLASSES=28
#epochs = 400;
# Number of training epochs (re-assigned to the same value in the model
# settings cell).
epochs = 30
#batch_size = 256;
# NOTE(review): VAL_RATIO and DEBUG are not read by any visible cell.
VAL_RATIO = .1;
DEBUG = False
# Stain channels decoded per record; their order is the order of the stacked
# image planes fed to the network.
channels = ["green", "blue", "red"]
# Units in each of the two LSTM cells of the RNN head.
lstmUnits=64

## Data input pipeline ##
This isn't fully optimized yet, but it's good enough.

In [0]:
# Directory (under DATA_DIR) containing the sharded TFRecord files.
TF_DIR=DATA_DIR+'train'
# Spatial size of the images stored in the records; also part of the shard
# file name.
DS_DIMS=[512,512]
# Spatial size the images are resized to before entering the network.
NN_DIMS=[299,299]
REC_BUF_SIZE=453762 # This is approximate size for 512x512 images
NUM_PARALLEL_CALLS=8 # number of cores in the system
class HPADataset:
    """Input pipeline over sharded TFRecord files of the HPA training images.

    Every record is turned into an `(image, label)` pair: one PNG per entry of
    `channels` is decoded as a single-plane image, the planes are stacked on
    the last axis, resized to `NN_DIMS` and divided by 255; the label is the
    float vector of length `NUM_CLASSES` stored under `image/label`.

    Args:
        shards: sized iterable of shard numbers; shard `n` is read from
            `{TF_DIR}/hpa_{w}x{h}_{n}.tfrecords`.
        aug: when True, each image gets a random multiple-of-90-degree
            rotation and random horizontal/vertical flips.
    """
    def __init__(self, shards, aug=True):
        self.shards = shards
        self.aug = aug

    def input_fn(self, params):
        """Build the input for an Estimator.

        Args:
            params: dict with `batch_size` (batch size to produce) and `mode`
                (the string given to `do_it`; only `'predict'` is special).

        Returns:
            For `mode == 'predict'` the batched `tf.data.Dataset`; otherwise
            the `(images, labels)` tensors of a one-shot iterator over the
            endlessly repeated dataset.
        """
        batch_size=params['batch_size']

        def _parse_function(example_proto):
            """Decode one serialized example into (float32 image, label)."""
            features = {}
            for c in channels:
                features["image/%s/filename"%c] = tf.FixedLenFeature((), tf.string, default_value="")
                features["image/%s/encoded"%c] = tf.FixedLenFeature((), tf.string, default_value="")
            features["image/label"] = tf.FixedLenFeature((NUM_CLASSES), tf.float32, default_value=[0]*NUM_CLASSES)
            parsed_features = tf.parse_single_example(example_proto, features)

            # Decode each channel to a 2-D plane, then stack them channel-last.
            imgs=[]
            for c in channels:
                img=parsed_features['image/%s/encoded'%c]
                img=tf.image.decode_png(img, channels=1)
                img=tf.reshape(img, DS_DIMS)
                imgs.append(img)
            image=tf.stack(imgs, axis=-1, name='combine_channels')
            image=tf.image.resize_images(image, NN_DIMS)

            if self.aug:
                # Native TF ops are used here. The previous implementation
                # called imgaug (`iaa`) through tf.py_func, but `iaa` is never
                # imported in this notebook, so aug=True failed with NameError.
                # Rotations by k*90 degrees combined with the two flips cover
                # every transform the old `OneOf` list could pick.
                quarter_turns=tf.random_uniform([], minval=0, maxval=4, dtype=tf.int32)
                image=tf.image.rot90(image, k=quarter_turns)
                image=tf.image.random_flip_left_right(image)
                image=tf.image.random_flip_up_down(image)
                # Re-assert the static shape after augmentation, as before.
                image=tf.reshape(image, NN_DIMS+[len(channels)])
            image=tf.cast(image, tf.float32)
            image=image / 255.
            return image, parsed_features["image/label"]

        fnames=['{dir}/hpa_{w}x{h}_{num}.tfrecords'.format(dir=TF_DIR, w=DS_DIMS[0], h=DS_DIMS[1], num=shard) for shard in self.shards]
        # One parallel reader per shard, with a read buffer sized for roughly
        # two records per shard.
        dataset=tf.data.TFRecordDataset(fnames,
                                        buffer_size=REC_BUF_SIZE*2*len(self.shards),
                                        num_parallel_reads=len(self.shards))
        dataset=dataset.map(_parse_function, num_parallel_calls=NUM_PARALLEL_CALLS)
        dataset=dataset.shuffle(1000)
        dataset=dataset.prefetch(batch_size*8)
        # drop_remainder keeps the batch dimension static.
        dataset=dataset.batch(batch_size, drop_remainder=True)
        dataset=dataset.prefetch(2)
        if params['mode']!='predict':
            dataset=dataset.repeat()
            return dataset.make_one_shot_iterator().get_next()
        return dataset
#with tf.Graph().as_default():
#    test=HPADataset([1,2]).input_fn()
#    with tf.Session() as sess:
#        sess.run(test)

In [0]:
# Training input: shards 0-7, augmentation disabled.
tg = HPADataset(range(8), False)
# Validation / prediction input: shards 8 and 9, augmentation disabled.
vg = HPADataset([8, 9], False)


## Model ##

What follows is our model based on pure tensorflow and the inception model present there. Large portions of this code are lifted from https://github.com/tensorflow/tpu/blob/master/models/experimental/inception/inception_v3.py


In [0]:
import tensorflow as tf
from tensorflow.contrib import summary
from tensorflow.contrib.framework.python.ops import arg_scope
from tensorflow.contrib.slim.nets import inception
from tensorflow.contrib.training.python.training import evaluation


In [0]:
### Some model settings
# `precision` is only logged by do_it(); the model function always builds the
# network with 'float32'.
precision='float32'
log_device_placement=False
# True -> batch-norm updates run in place (updates_collections=None) and the
# slim arg scope is created with weight_decay=0.0.
clear_update_collections=True
num_classes=NUM_CLASSES
display_tensors=True
use_tpu=True
train_batch_size=1024
geval_batch_size=1024
# Base learning rate; the model function scales it by train_batch_size / 256.
glearning_rate=0.165
learning_rate_decay=0.94
use_learning_rate_warmup=False
warmup_epochs=7
cold_epochs=2
learning_rate_decay_epochs=6
# True -> no host_call is attached, so no training summaries are written.
skip_host_call=True
# One of 'sgd', 'momentum', 'RMS'.
goptimizer='RMS'
moving_average=True
MOVING_AVERAGE_DECAY = 0.995
# Batchnorm moving mean/variance parameters
BATCH_NORM_DECAY = 0.996
BATCH_NORM_EPSILON = 1e-3

# Coefficient of the explicit L2 penalty added to the loss in the model function.
WEIGHT_DECAY = 0.00004
RMSPROP_DECAY = 0.9                # Decay term for RMSProp.
RMSPROP_MOMENTUM = 0.9             # Momentum in RMSProp.
RMSPROP_EPSILON = 1.0              # Epsilon term for RMSProp.

min_depth=16
# False -> tensor_transform_fn is a no-op and batch_axis stays 0.
transpose_enabled=False
spatial_squeeze=True

_NUM_TRAIN_IMAGES = 24858
_NUM_EVAL_IMAGES = 6214
# Same value as the `epochs` assigned in the shared-constants cell.
epochs=30
# True division: ITERATIONS is a float; train_steps below truncates it.
ITERATIONS=_NUM_TRAIN_IMAGES*epochs/train_batch_size
num_shards=8 # 8 in original..
#model_dir='/content/competitions/human-protein-atlas-image-classification/output'
# Checkpoint directory; do_it() concatenates file names onto it, so the
# trailing slash is required.
model_dir=DATA_DIR+'output/'
save_checkpoints_secs=1000
# Also used by do_it() as iterations_per_loop for every mode except 'eval'.
save_summary_steps=100
eval_timeout=None
train_steps_per_eval=int(_NUM_TRAIN_IMAGES/train_batch_size) # essentially one epoch

dropout_keep_prob=0.8
train_steps=int(ITERATIONS)
print('Will train for {train} steps'.format(train=train_steps))

Will train for 728 steps


In [0]:

def tensor_transform_fn(data, perm):
  """Permute the dimensions of `data` when transposing is switched on.

  The idea is to transpose an image tensor on the host and undo that transpose
  on the TPU, where the inverse operation is effectively elided and therefore
  costs nothing.

  NOTE: the compiler is expected to eventually spot when this is worthwhile and
  apply it on its own, making this helper unnecessary.

  Args:
    data: Tensor to permute.
    perm: New ordering of the dimensions.

  Returns:
    `tf.transpose(data, perm)` if the module-level `transpose_enabled` flag is
    truthy, otherwise `data` unchanged.
  """
  return tf.transpose(data, perm) if transpose_enabled else data


In [0]:
from tensorflow.contrib import layers
from tensorflow.contrib.layers.python.layers import layers as layers_lib
from tensorflow.python.ops import array_ops

def inception_model_fn(features, labels, mode, params):
    """Inception v3 backbone with a row/column LSTM head, as a TPUEstimator model_fn.

    The `Mixed_7a` feature map is unrolled row-major and column-major, each
    sequence is fed to its own peephole LSTM, and the two final cell states
    are concatenated and passed through dropout -> dense(1024, relu) ->
    dropout -> dense(num_classes). The last layer is linear: it produces raw
    logits, which is what `tf.losses.sigmoid_cross_entropy` and the
    `tf.nn.sigmoid` call in the eval metrics expect. Probabilities are exposed
    separately as `end_points['Predictions']`.

    Args:
        features: image batch, or a dict holding it under the key 'feature'.
        labels: multi-hot label batch (already encoded; used as-is).
        mode: a `tf.estimator.ModeKeys` value.
        params: dict with at least 'input_perm' (permutation handed to
            `tensor_transform_fn`) and 'warmup' (when truthy, only the newly
            added head is trained). Both are read only where needed.

    Returns:
        A `tf.contrib.tpu.TPUEstimatorSpec` for the requested mode.

    Raises:
        ValueError: if `goptimizer` names an unknown optimizer (TRAIN mode).
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    is_eval = (mode == tf.estimator.ModeKeys.EVAL)

    if isinstance(features, dict):
        features = features['feature']

    features = tensor_transform_fn(features, params['input_perm'])

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_inception_v3(final_endpoint='Mixed_7c', scope=None):
        with tf.variable_scope(
                scope, 'InceptionV3', [features, num_classes], reuse=None) as scope:
            with arg_scope(
                    [layers_lib.batch_norm, layers_lib.dropout],
                    is_training=is_training):
                net, end_points = inception.inception_v3_base(
                    features,
                    final_endpoint=final_endpoint
                )

                def build_rnn(endpoint):
                    """Summarize a [batch, h, w, c] feature map with two LSTMs.

                    Returns the concatenated final cell states of an LSTM run
                    over the row-major unrolling and one run over the
                    column-major unrolling of `endpoint`.
                    """
                    shape = endpoint.shape.as_list()
                    shape = (shape[0], shape[1] * shape[2], shape[3])
                    # Row-major sequence of spatial positions.
                    input_width = tf.reshape(endpoint, shape)
                    # Swap h and w first to obtain the column-major sequence.
                    input_height = tf.transpose(endpoint, (0, 2, 1, 3))
                    input_height = tf.reshape(input_height, shape)
                    lstmCellW = tf.contrib.rnn.LSTMCell(num_units=lstmUnits, use_peepholes=True)
                    lstmCellH = tf.contrib.rnn.LSTMCell(num_units=lstmUnits, use_peepholes=True)
                    wouts, wstate = tf.nn.dynamic_rnn(lstmCellW, input_width, scope='width', dtype=tf.float32, parallel_iterations=1024)
                    houts, hstate = tf.nn.dynamic_rnn(lstmCellH, input_height, scope='height', dtype=tf.float32, parallel_iterations=1024)
                    # Only the final cell states are used; the per-step
                    # outputs are discarded.
                    return tf.concat([wstate.c, hstate.c], name='Logits', axis=-1)

                rnn_logits = []
                #attachements=['Mixed_7b', 'Mixed_7a', 'Mixed_6e']
                attachements = ['Mixed_7a']
                for att_pt in attachements:
                    with tf.variable_scope(att_pt + '_rnn'):
                        rnn_logits.append(build_rnn(end_points[att_pt]))
                rnn_logits = tf.concat(rnn_logits, name='FinalRNNLogits', axis=-1)

                depth = lambda d: max(d, min_depth)
                # The auxiliary head is deliberately switched off by the
                # trailing `and False`; the code is kept for reference.
                if 'Mixed_6e' in end_points and False:
                    # Auxiliary Head logits
                    with arg_scope(
                            [layers.conv2d, layers_lib.max_pool2d, layers_lib.avg_pool2d],
                            stride=1,
                            padding='SAME'):
                        aux_logits = end_points['Mixed_6e']
                        with tf.variable_scope('AuxLogits'):
                            aux_logits = layers_lib.avg_pool2d(
                                aux_logits, [5, 5],
                                stride=3,
                                padding='VALID',
                                scope='AvgPool_1a_5x5')
                            aux_logits = layers.conv2d(
                                aux_logits, depth(128), [1, 1], scope='Conv2d_1b_1x1')

                            # Shape of feature map before the final layer.
                            kernel_size = [5, 5]
                            aux_logits = layers.conv2d(
                                aux_logits,
                                depth(768),
                                kernel_size,
                                weights_initializer=tf.initializers.truncated_normal(stddev=0.01),
                                padding='VALID',
                                scope='Conv2d_2a_{}x{}'.format(*kernel_size))
                            aux_logits = layers.conv2d(
                                aux_logits,
                                num_classes, [1, 1],
                                activation_fn=None,
                                normalizer_fn=None,
                                weights_initializer=tf.initializers.truncated_normal(stddev=0.001),
                                scope='Conv2d_2b_1x1')
                            if spatial_squeeze:
                                aux_logits = array_ops.squeeze(
                                    aux_logits, name='SpatialSqueeze')
                            end_points['AuxLogits'] = aux_logits

            # Final classifier on top of the RNN features.
            with tf.variable_scope('Logits'):
                net = rnn_logits
                end_points['rnn_logits'] = net
                net = layers_lib.dropout(net, keep_prob=dropout_keep_prob, is_training=is_training, scope="FinalDropout")
                end_points['dropout'] = net
                net = tf.layers.dense(net, 1024, activation=tf.nn.relu, name='PreLogits')
                end_points['PreLogits'] = net
                net = layers_lib.dropout(net, keep_prob=dropout_keep_prob, is_training=is_training, scope="FinalDropout")
                # Linear output: the sigmoid is applied by the loss and the
                # metrics. Applying it here as well (as the code used to)
                # squashed the values twice, leaving the eval "probabilities"
                # in roughly [0.5, 0.73] so every class passed the threshold.
                logits = tf.layers.dense(net, num_classes, activation=None, name='Logits')
                end_points['Logits'] = logits
                end_points['Predictions'] = tf.nn.sigmoid(logits, name='Predictions')
                return logits, end_points

    def build_network(precision):
        if precision == 'bfloat16':
            with tf.contrib.tpu.bfloat16_scope():
                logits, end_points = build_inception_v3()
            logits = tf.cast(logits, tf.float32)
        elif precision == 'float32':
            logits, end_points = build_inception_v3()
        else:
            raise ValueError('Unsupported precision: %s' % precision)
        return logits, end_points

    if clear_update_collections:
        # updates_collections must be set to None in order to use fused batchnorm
        with arg_scope(inception.inception_v3_arg_scope(
                weight_decay=0.0,
                batch_norm_decay=BATCH_NORM_DECAY,
                batch_norm_epsilon=BATCH_NORM_EPSILON,
                updates_collections=None)):
            logits, end_points = build_network('float32')
    else:
        with arg_scope(inception.inception_v3_arg_scope(
                batch_norm_decay=BATCH_NORM_DECAY,
                batch_norm_epsilon=BATCH_NORM_EPSILON)):
            logits, end_points = build_network('float32')

    predictions = {
        'logits': logits,
        # Threshold the probabilities, not the raw logits.
        'classes': tf.math.greater(end_points['Predictions'], 0.2),
        'probabilities': end_points['Predictions'],
        'rnn_logits': end_points['rnn_logits'],
        'dropout': end_points['dropout'],
        'prelogits': end_points['PreLogits'],
        'labels': labels,
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    if mode == tf.estimator.ModeKeys.EVAL and display_tensors and (
            not use_tpu):
        with tf.control_dependencies([
                tf.Print(
                    predictions['classes'], [predictions['classes']],
                    summarize=geval_batch_size,
                    message='prediction: ')
        ]):
            labels = tf.Print(
                labels, [labels], summarize=geval_batch_size, message='label: ')

    # in our case labels come pre-encoded
    one_hot_labels = labels

    if 'AuxLogits' in end_points:
        tf.losses.sigmoid_cross_entropy(
            multi_class_labels=one_hot_labels,
            logits=tf.cast(end_points['AuxLogits'], tf.float32),
            weights=0.4,
            label_smoothing=0.1,
            scope='aux_loss')

    tf.losses.sigmoid_cross_entropy(
        multi_class_labels=one_hot_labels,
        logits=logits,
        reduction=tf.losses.Reduction.MEAN
    )

    losses = tf.add_n(tf.losses.get_losses())
    # Explicit L2 penalty on every non-BatchNorm variable whose name contains
    # 'weights'.
    l2_loss = []
    for v in tf.trainable_variables():
        if 'BatchNorm' not in v.name and 'weights' in v.name:
            l2_loss.append(tf.nn.l2_loss(v))
    loss = losses + WEIGHT_DECAY * tf.add_n(l2_loss)

    initial_learning_rate = glearning_rate * train_batch_size / 256
    if use_learning_rate_warmup:
        # Adjust initial learning rate to match final warmup rate
        warmup_decay = learning_rate_decay**(
            (warmup_epochs + cold_epochs) /
            learning_rate_decay_epochs)
        adj_initial_learning_rate = initial_learning_rate * warmup_decay

    final_learning_rate = 0.0001 * initial_learning_rate

    host_call = None
    train_op = None

    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES / train_batch_size
        global_step = tf.train.get_or_create_global_step()
        current_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=int(learning_rate_decay_epochs * batches_per_epoch),
            decay_rate=learning_rate_decay,
            staircase=True)

        if use_learning_rate_warmup:
            wlr = 0.1 * adj_initial_learning_rate
            wlr_height = tf.cast(
                0.9 * adj_initial_learning_rate /
                (warmup_epochs + learning_rate_decay_epochs - 1),
                tf.float32)
            epoch_offset = tf.cast(cold_epochs - 1, tf.int32)
            exp_decay_start = (warmup_epochs + cold_epochs +
                               learning_rate_decay_epochs)
            lin_inc_lr = tf.add(
                wlr, tf.multiply(
                    tf.cast(tf.subtract(current_epoch, epoch_offset), tf.float32),
                    wlr_height))
            learning_rate = tf.where(
                tf.greater_equal(current_epoch, cold_epochs),
                (tf.where(tf.greater_equal(current_epoch, exp_decay_start),
                          learning_rate, lin_inc_lr)),
                wlr)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(
            learning_rate, final_learning_rate, name='learning_rate')

        if goptimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif goptimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.9)
        elif goptimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate,
                RMSPROP_DECAY,
                momentum=RMSPROP_MOMENTUM,
                epsilon=RMSPROP_EPSILON)
        else:
            # The old code logged the still-unbound local `optimizer` here,
            # which failed with a confusing UnboundLocalError.
            raise ValueError('Unknown optimizer: %s' % goptimizer)

        if use_tpu:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if params['warmup']:
            # Warm-up trains only the newly added head: every trainable
            # variable that is not one of the backbone's model variables.
            # `exclude_patterns` must be a list of regexes; the earlier code
            # passed a bare string and then a list of variables, which ended
            # up selecting every trainable variable.
            backbone_vars = tf.contrib.framework.filter_variables(
                tf.contrib.framework.get_model_variables(),
                exclude_patterns=['.*Mixed_.*_rnn.*'])
            backbone_names = set(v.name for v in backbone_vars)
            var_list = [
                v for v in tf.contrib.framework.get_trainable_variables()
                if v.name not in backbone_names
            ]
            print('Training', var_list)
        else:
            var_list = None
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step, var_list=var_list)
        if moving_average:
            ema = tf.train.ExponentialMovingAverage(
                decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
            variables_to_average = (
                tf.trainable_variables() + tf.moving_average_variables())
            with tf.control_dependencies([train_op]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

        # To log the loss, current learning rate, and epoch for Tensorboard, the
        # summary op needs to be run on the host CPU via host_call. host_call
        # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
        # dimension. These Tensors are implicitly concatenated to
        # [params['batch_size']].
        gs_t = tf.reshape(global_step, [1])
        loss_t = tf.reshape(loss, [1])
        lr_t = tf.reshape(learning_rate, [1])
        ce_t = tf.reshape(current_epoch, [1])

        if not skip_host_call:
            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.
                This function is executed on the CPU and should not directly reference
                any Tensors in the rest of the `model_fn`. To pass Tensors from the
                model to the `metric_fn`, provide them as part of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.
                Arguments should match the list of `Tensor` objects passed as the second
                element in the tuple passed to `host_call`.
                Args:
                  gs: `Tensor with shape `[batch]` for the global_step
                  loss: `Tensor` with shape `[batch]` for the training loss.
                  lr: `Tensor` with shape `[batch]` for the learning_rate.
                  ce: `Tensor` with shape `[batch]` for the current_epoch.
                Returns:
                  List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                with summary.create_file_writer(model_dir).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', tf.reduce_mean(loss), step=gs)
                        summary.scalar('learning_rate', tf.reduce_mean(lr), step=gs)
                        summary.scalar('current_epoch', tf.reduce_mean(ce), step=gs)

                    return summary.all_summary_ops()

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    eval_metrics = None
    if is_eval:
        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates recall, precision and F1.
            This function is executed on the CPU and should not directly reference
            any Tensors in the rest of the `model_fn`. To pass Tensors from the model
            to the `metric_fn`, provide as part of the `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.
            Arguments should match the list of `Tensor` objects passed as the second
            element in the tuple passed to `eval_metrics`.
            Args:
            labels: `Tensor` of multi-hot labels.
            logits: `Tensor` with shape `[batch, num_classes]` of raw logits.
            Returns:
            A dict of the metrics to return from evaluation.
            """
            probs = tf.nn.sigmoid(logits)
            predictions = tf.math.greater(probs, 0.2)
            recall = tf.metrics.recall(labels, predictions)
            precision = tf.metrics.precision(labels, predictions)
            f1 = tf.contrib.metrics.f1_score(labels, probs)

            return {
                'recall': recall,
                'precision': precision,
                'f1': f1
            }

        eval_metrics = (metric_fn, [labels, logits])

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics)


In [0]:
class LoadEMAHook(tf.train.SessionRunHook):
  """Session hook that overwrites variables with their moving averages.

  At graph-construction time it prepares an assignment function from the
  latest checkpoint found in `model_dir`; once the session exists it runs
  that function.
  """

  def __init__(self, model_dir):
    super().__init__()
    self._model_dir = model_dir

  def begin(self):
    # Mapping produced by the EMA helper for restoring averaged values.
    averager = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
    restore_map = averager.variables_to_restore()
    newest_ckpt = tf.train.latest_checkpoint(self._model_dir)
    self._load_ema = tf.contrib.framework.assign_from_checkpoint_fn(
        newest_ckpt, restore_map)

  def after_create_session(self, sess, coord):
    tf.logging.info('Reloading EMA...')
    self._load_ema(sess)


In [0]:
import tensorflow as tf
def _wipe_checkpoints(directory):
  """Delete every non-directory entry directly under `directory`.

  `directory` must end with a path separator because entry names are appended
  to it by plain concatenation.
  """
  for entry in tf.gfile.ListDirectory(directory):
    path = directory + entry
    if not tf.gfile.Stat(path).is_directory:
      tf.gfile.Remove(path)


def do_it(mode):
  """Run one phase of the training / evaluation workflow on the TPU.

  Args:
    mode: one of
      'warmup'           - train for about two epochs with params['warmup'] set;
      'train'            - train for `train_steps` steps;
      'eval'             - evaluate each new checkpoint until one reaches
                           `train_steps`;
      'train_and_eval'   - alternate one training cycle and one evaluation;
      'predict'          - print roughly the first 50 predictions on the
                           validation shards;
      'retrain'          - wipe `model_dir`, then run 'warmup' and 'train';
      'retrain_and_eval' - wipe `model_dir`, then run 'warmup' and
                           'train_and_eval'.
      Any other value is treated like 'train'.
  """
  # Composite modes only orchestrate the basic ones. Return afterwards:
  # without it execution fell through to the generic training branch at the
  # bottom and silently ran `train_steps` additional steps.
  if mode in ('retrain', 'retrain_and_eval'):
    _wipe_checkpoints(model_dir)
    do_it('warmup')
    do_it('train' if mode == 'retrain' else 'train_and_eval')
    return

  params = {
      'input_perm': [0, 1, 2, 3],
      'output_perm': [0, 1, 2, 3],
      'warmup': mode=='warmup',
      'mode': mode,
  }

  # Warm-up is ordinary training with params['warmup'] set.
  if mode == 'warmup':
    mode = 'train'

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

  tf.logging.info('Precision: %s', precision)

  if mode == 'predict':
    batch_axis = None
  else:
    batch_axis = 0
    if transpose_enabled:
      params['input_perm'] = [3, 0, 1, 2]
      params['output_perm'] = [1, 2, 3, 0]
      batch_axis = 3
    # (feature batch axis, label batch axis)
    batch_axis = (batch_axis, 0)

  eval_steps = _NUM_EVAL_IMAGES // geval_batch_size

  iterations = (eval_steps if mode == 'eval' else save_summary_steps)

  eval_batch_size = (None if mode == 'train' else geval_batch_size)

  per_host_input_for_training = (num_shards <= 8 if mode == 'train' else True)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=model_dir,
      save_checkpoints_secs=save_checkpoints_secs,
      save_summary_steps=save_summary_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=log_device_placement),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=iterations,
          num_shards=num_shards,
          per_host_input_for_training=per_host_input_for_training))

  # NOTE(review): get_model_variables() reads the *current default graph*,
  # while the estimator builds the model in a graph of its own. If nothing has
  # created the model in the default graph, `load_vars` is an empty list and
  # the warm start below may load nothing from the pre-trained checkpoint --
  # confirm against the "Warm-starting" lines in the training log.
  trainable_vars = tf.contrib.framework.get_model_variables()
  skip_vars = ['InceptionV3/AuxLogits/Conv2d_2b_1x1/weights']
  load_vars = tf.contrib.framework.filter_variables(trainable_vars, exclude_patterns=skip_vars)
  ws = tf.estimator.WarmStartSettings(
      ckpt_to_initialize_from=DATA_DIR+"pre-trained/inception_v3.ckpt",
      vars_to_warm_start=load_vars
  )
  inception_classifier = tf.contrib.tpu.TPUEstimator(
      model_fn=inception_model_fn,
      use_tpu=use_tpu,
      config=run_config,
      warm_start_from=ws,
      params=params,
      train_batch_size=train_batch_size,
      eval_batch_size=eval_batch_size,
      predict_batch_size=eval_batch_size,
      batch_axis=batch_axis)

  # Training and evaluation read from different shard sets.
  imagenet_train = tg
  imagenet_eval = vg

  if moving_average:
    eval_hooks = [LoadEMAHook(model_dir)]
  else:
    eval_hooks = []

  if mode == 'eval':
    # Run evaluation when there is a new checkpoint
    for checkpoint in evaluation.checkpoints_iterator(
        model_dir, timeout=eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # Includes compilation time
        eval_results = inception_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            hooks=eval_hooks,
            checkpoint_path=checkpoint)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info(
            'Eval results: %s. Elapsed seconds: %d', eval_results, elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(checkpoint).split('-')[1])
        if current_step >= train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', checkpoint)

  elif mode == 'train_and_eval':
    for cycle in range(train_steps // train_steps_per_eval):
      tf.logging.info('Starting training cycle %d.' % cycle)
      inception_classifier.train(
          input_fn=imagenet_train.input_fn, steps=train_steps_per_eval)

      tf.logging.info('Starting evaluation cycle %d .' % cycle)
      eval_results = inception_classifier.evaluate(
          input_fn=imagenet_eval.input_fn, steps=eval_steps, hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s' % eval_results)

  elif mode == 'predict':
    result = inception_classifier.predict(input_fn=imagenet_eval.input_fn)
    # Only a sample of the predictions is printed.
    for i, r in enumerate(result):
      print(r)
      if i>50:
        break

  else:
    tf.logging.info('Starting training ...')
    if params['warmup']:
      steps = train_steps_per_eval*2 # ~2 epochs
      print('warming up for ', steps)
    else:
      steps = train_steps
      print('training for ', steps)
    inception_classifier.train(
        input_fn=imagenet_train.input_fn, steps=steps)

  #if export_dir is not None:
  #  tf.logging.info('Starting to export model.')
  #  inception_classifier.export_saved_model(
  #      export_dir_base=export_dir,
  #      serving_input_receiver_fn=image_serving_input_fn)




In [0]:
import logging
# Raise the root logger to INFO so progress messages are shown in the cell output.
logging.getLogger().setLevel(logging.INFO)
#tf.logging.set_verbosity(tf.logging.INFO)
# 'retrain' removes the files in model_dir and then runs the warm-up and
# training phases (see do_it).
do_it('retrain')

INFO:tensorflow:Precision: float32
INFO:tensorflow:Using config: {'_model_dir': 'gs://human-protein-atlas-kaggle/output/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 1000, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.57.182.66:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f15ce4bc978>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.57.182.66:8470', '_evaluation_master': b'grpc://10.57.182.66:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100

In [0]:
#do_it('eval')

In [0]:
do_it('predict')

INFO:tensorflow:Precision: float32
INFO:tensorflow:Using config: {'_model_dir': 'gs://human-protein-atlas-kaggle/output/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 1000, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.57.182.66:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f15ce4b1ba8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.57.182.66:8470', '_evaluation_master': b'grpc://10.57.182.66:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100

Exception ignored in: <generator object TPUEstimator.predict at 0x7f15ce4bed58>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2446, in predict
    rendezvous.raise_errors()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py", line 132, in raise_errors
    six.reraise(typ, value, traceback)
  File "/usr/local/lib/python3.6/dist-packages/six.py", line 693, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py", line 101, in catch_errors
    yield
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 442, in _run_infeed
    session.run(self._enqueue_ops)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 929, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensor