##Port model to TPUEstimator

### Imports

In [127]:
import os, re, math, json, sys, shutil, pprint, datetime, scipy

import numpy as np
import tensorflow as tf
from tqdm import tqdm
import PIL.Image, PIL.ImageFont, PIL.ImageDraw
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorflow.python.platform import tf_logging
print("Tensorflow version " + tf.__version__)

Tensorflow version 1.13.1


### Parameters

In [128]:
BATCH_SIZE = 64 #@param {type:"integer"}
BUCKET = 'gs://bucket-jupyter/' #@param {type:"string"}
DATA_PATH = 'data/data_4x4_13/'

assert re.search(r'gs://.+', BUCKET), 'You need a GCS bucket for your Tensorboard logs. Head to http://console.cloud.google.com/storage and create one.'


training_images_file   = BUCKET + DATA_PATH + 'train-images-idx3-ubyte'
training_labels_file   = BUCKET + DATA_PATH + 'train-labels-idx1-ubyte'
validation_images_file = BUCKET + DATA_PATH + 't10k-images-idx3-ubyte'
validation_labels_file = BUCKET + DATA_PATH + 't10k-labels-idx1-ubyte'

**IGNORE MEMORY ERRORS**

In [129]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

**TRAINING PARAMETERS**

In [130]:
epochs = 50
epsilon = 1e-9    
iter_routing = 3
mask_with_y = True
stddev = 0.01
regularization_scale = 0
gray=1          # grayscale image input
color=3         # color image input

#### MARGIN LOSS PARAMS

In [131]:
# For Margin Loss
m_plus = 0.9
m_minus = 0.1
lambda_val = 0.5

In [132]:
batch_trn = 8400
batch_val = 600
px=52

### Colab-only auth for this notebook and the TPU

In [133]:
IS_COLAB_BACKEND = 'COLAB_GPU' in os.environ  # this is always set on Colab, the value is 0 or 1 depending on GPU presence
if IS_COLAB_BACKEND:
  from google.colab import auth
  auth.authenticate_user() # Authenticates the backend and also the TPU using your credentials so that they can access your private GCS buckets

### TPU detection

In [134]:
try:
  tpu = tf.contrib.cluster_resolver.TPUClusterResolver() # Picks up a connected TPU on Google's Colab, ML Engine, Kubernetes and Deep Learning VMs accessed through the 'ctpu up' utility
  #tpu = tf.contrib.cluster_resolver.TPUClusterResolver('MY_TPU_NAME') # If auto-detection does not work, you can pass the name of the TPU explicitly (tip: on a VM created with "ctpu up" the TPU has the same name as the VM)
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
  USE_TPU = True
except ValueError:
  tpu = None
  print("Running on GPU or CPU")
  USE_TPU = False

Running on GPU or CPU


**MODEL PARAMS**

In [135]:
class Params(object):
    
    # Dataset Dimensions
    batch_sz=4                # max value allowed by system (recommended: 64 or 128)
    grids=4                   # 5 = 5x5 grid   
    lip_px=13                 # pixel width of each lip image
    pixels=grids*lip_px       # for image h & w 
    channel=gray              # image input (color or gray)
    num_threads=4             # system limit 
    num_labels = 10           # categories
    data_qt=9000              # total input items 
    val_qt=data_qt/15         # items used for validation
    test_qt=data_qt/15        # items used for test
    load_qt=data_qt-test_qt   # items loaded during training
    train_qt=load_qt-val_qt   # items used for training (not validation)
    height=pixels             # input image height
    width=pixels              # input image weight
    
    # Saving Frequency
    train_sum_freq=100
    val_sum_freq=500
    save_freq=3
     
    # Folder names concatenated per input dimensions
    logdir='log_{0}x{0}_{1}'.format(grids,lip_px)
    results='res_{0}x{0}_{1}'.format(grids,lip_px)
    dataset='data_{0}x{0}_{1}'.format(grids,lip_px)
    
    # Path names
    log_path = os.path.join('logs', logdir)        # path to dataset
    res_path = os.path.join('results', results)    # path to dataset
    data_path = os.path.join('data', dataset)      # path to dataset

#### SHUFFLE DATA

In [136]:
def get_batch_data(dataset, batch_size, num_threads):
    
    batch_sz = Params.batch_sz  
    
    trX, trY, num_tr_batch, valX, valY, num_val_batch = load_data(dataset, batch_sz)
    data_queues = tf.train.slice_input_producer([trX, trY])
    
    X, Y = tf.train.shuffle_batch(data_queues, num_threads=Params.num_threads, 
                                  batch_size=batch_sz, 
                                  capacity=batch_sz * 64, 
                                  min_after_dequeue=batch_sz * 32,
                                  allow_smaller_final_batch=False)

    return(X, Y)

#### LOAD DATASET

In [137]:
def load_data(dataset, batch_sz):
    
    # Method Params
    batch_sz = int(Params.batch_sz)
    data_path = Params.data_path
    grey = 1
     
    load_int=int(Params.load_qt)
    train_int=int(Params.train_qt)
    val_int=int(Params.val_qt)
    
    if (mode != tf.estimator.ModeKeys.PREDICT):
        fd = open(os.path.join(data_path, 'train-images-idx3-ubyte'))
        loaded = np.fromfile(file=fd, dtype=np.uint8)
        trainX = loaded[16:].reshape((load_int, 52, 52, 1)).astype(np.float32)

        fd = open(os.path.join(data_path, 'train-labels-idx1-ubyte'))
        loaded = np.fromfile(file=fd, dtype=np.uint8)
        trainY = loaded[8:].reshape((load_int)).astype(np.int32)

        trX = trainX[:train_int] / 255.
        trY = trainY[:train_int] 

        valX = trainX[train_int, ] / 255.
        valY = trainY[train_int:]

        num_tr_batch = train_int // batch_sz
        num_val_batch = val_int // batch_sz

        return trX, trY, num_tr_batch, valX, valY, num_val_batch
    else:
        fd = open(os.path.join(data_path, 't10k-images-idx3-ubyte'))
        loaded = np.fromfile(file=fd, dtype=np.uint8)
        teX = loaded[16:].reshape((Params.test_qt, Params.pixels, Params.pixels, grey)).astype(np.float)

        fd = open(os.path.join(data_path, 't10k-labels-idx1-ubyte'))
        loaded = np.fromfile(file=fd, dtype=np.uint8)
        teY = loaded[8:].reshape((Params.test_qt)).astype(np.int32)

        num_te_batch = Params.test_qt // batch_sz
        return teX / 255., teY, num_te_batch

In [138]:
def get_shape(inputs, name=None):
    
    batch_sz = Params.batch_sz
    
    name = "shape" if name is None else name
    with tf.name_scope(name):
        static_shape = inputs.get_shape().as_list()
        dynamic_shape = tf.shape(inputs)
        shape = []
        for i, dim in enumerate(static_shape):
            dim = dim if dim is not None else dynamic_shape[i]
            shape.append(dim)
        return(shape)

In [139]:
def routing(input, b_IJ, num_outputs=10, num_dims=16):
    
    ''' input: Tensor shaped [batch_sz, num_caps_l=1152, 1, length(u_i)=8, 1]
        num_caps_l: number of  layer l capsules.
        num_outputs: number of output capsules.
        num_dims: output capsule dimensions.
        v_j: vector of capsule j in layer l+1
        u_i: vector of capsule i in layer l   
        W: [1, num_caps_i, num_caps_j * len_v_j, len_u_j, 1]
     '''
    batch_sz = Params.batch_sz
    
    input_shape = get_shape(input)
    
    W = tf.get_variable('Weight', shape=[1, input_shape[1], num_dims * num_outputs] + input_shape[-2:],
                        dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=stddev))
    
    biases = tf.get_variable('bias', shape=(1, 1, num_outputs, num_dims, 1))

    # Element-wise multiply calculates u_hat and reduce_sum
    # reshape ops reduces time of tf.matmul operation
    
    # Matmul [a, b] x [b, c] = element-wise multiply [a*c, b] * [a*c, b]
    # reduce_sum at axis=1 and reshape to [a, c]
    
    input = tf.tile(input, [1, 1, num_dims * num_outputs, 1, 1])
    
    # input.get_shape() == [batch_sz, 1152, 160, 8, 1]
    # u_hat.get_shape() == [batch_sz, 1152, 10, 16, 1]
    
    u_hat = reduce_sum(W * input, axis=3, keepdims=True)
    u_hat = tf.reshape(u_hat, shape=[-1, input_shape[1], num_outputs, num_dims, 1])
    

    # u_hat_stopped = u_hat in forward propagation.  
    # Gradient not passed back u_hat_stopped to u_hat in backpropagation
    
    u_hat_stopped = tf.stop_gradient(u_hat, name='stop_gradient')

    for r_iter in range(iter_routing):
        with tf.variable_scope('iter_' + str(r_iter)):

            # => [batch_sz, 1152, 10, 1, 1]
            c_IJ = softmax(b_IJ, axis=2)

            # Final iter uses u_hat to receive gradients from graph
            if r_iter == iter_routing - 1:

                # weight u_hat with c_IJ, element-wise dimension => [batch_sz, 1152, 10, 16, 1]
                s_J = tf.multiply(c_IJ, u_hat)
                
                # sum second dimension = [batch_sz, 1, 10, 16, 1]
                # s_J.get_shape() == [batch_sz, 1, num_outputs, num_dims, 1]
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases

                # squash v_J.get_shape() == [batch_sz, 1, 10, 16, 1]
                v_J = squash(s_J)
                
            # No backpropagation here
            elif r_iter < iter_routing - 1:  
                s_J = tf.multiply(c_IJ, u_hat_stopped)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)

                # reshape & tile v_j [batch_sz ,1, 10, 16, 1] to [batch_sz, 1152, 10, 16, 1]
                # matmul in the last two dim: [16, 1].T x [16, 1] => [1, 1], reduce mean in the
                # batch_sz dim, resulting in [1, 1152, 10, 1, 1]
                v_J_tiled = tf.tile(v_J, [1, input_shape[1], 1, 1, 1])
                u_produce_v = reduce_sum(u_hat_stopped * v_J_tiled, axis=3, keepdims=True)
                
                # u_produce_v.get_shape() == [batch_sz, 1152, 10, 1, 1]
                # b_IJ += tf.reduce_sum(u_produce_v, axis=0, keep_dims=True)
                b_IJ += u_produce_v

    return(v_J)    # Returns: Tensor shaped [batch_sz, num_caps_l_plus_1, length(v_j)=16, 1]

### CapsLayer

In [140]:
class CapsLayer(object):
    ''' Capsule layer.
    Args:
        input: A 4-D tensor.
        num_outputs: number capsules in layer
        vec_len: integer, the length of the output vector of a capsule.
        layer_type: string, one of 'FC' or "CONV", the type of this layer,
            fully connected or convolution, for the future expansion capability
        with_routing: boolean, this capsule is routing with the lower-level layer capsule.

    Returns:
        A 4-D tensor.
    '''
    
    batch_sz = Params.batch_sz
    
    def __init__(self, num_outputs, vec_len, with_routing=True, layer_type='FC'):
        self.num_outputs = num_outputs
        self.vec_len = vec_len
        self.with_routing = with_routing
        self.layer_type = layer_type

    def __call__(self, input, kernel_size=None, stride=None):
        
        # Parameters 'kernel_size' and 'stride' will be used while layer_type = CONV
                
        batch_sz = Params.batch_sz
        
        if self.layer_type == 'CONV':
            self.kernel_size = kernel_size
            self.stride = stride
            if not self.with_routing:
                # the PrimaryCaps layer, a convolutional layer
                # input: [batch_sz, 20, 20, 256]
                # assert input.get_shape() == [batch_sz, 20, 20, 256]

                # NOTE: I can't find out any words from the paper whether the
                # PrimaryCap convolution does a ReLU activation or not before
                # squashing function, but experiment show that using ReLU get a
                # higher test accuracy. So, which one to use will be your choice
                capsules = tf.contrib.layers.conv2d(input, self.num_outputs * self.vec_len,
                                                    self.kernel_size, self.stride, padding="VALID",
                                                    activation_fn=tf.nn.relu)
                # capsules = tf.contrib.layers.conv2d(input, self.num_outputs * self.vec_len,
                #                                    self.kernel_size, self.stride,padding="VALID",
                #                                    activation_fn=None)
                capsules = tf.reshape(capsules, (batch_sz, -1, self.vec_len, 1))

                # return tensor with shape [batch_sz, 1152, 8, 1]
                capsules = squash(capsules)
                return(capsules)

        if self.layer_type == 'FC':
            if self.with_routing:
                # the DigitCaps layer is fully connected (FC) layer
                # Reshapes to [batch_sz, 1152, 1, 8, 1]
                self.input = tf.reshape(input, shape=(batch_sz, -1, 1, input.shape[-2].value, 1))

                with tf.variable_scope('routing'):
                    # b_IJ: [batch_sz, num_caps_l, num_caps_l_plus_1, 1, 1],
                    # about the reason of using 'batch_sz', see issue #21
                    b_IJ = tf.constant(np.zeros([batch_sz, input.shape[1].value, self.num_outputs, 1, 1], dtype=np.float32))
                    capsules = routing(self.input, b_IJ, num_outputs=self.num_outputs, num_dims=self.vec_len)
                    capsules = tf.squeeze(capsules, axis=1)

            return(capsules)

In [141]:
def squash(vector):
      
    '''
    vector: Tensor shaped [batch_sz, 1, num_caps, vec_len, 1] or [batch_sz, num_caps, vec_len, 1].
    
    Returns squashed tensor with 'vec_len' dimensions in same shape as vector.
    '''
    vec_squared_norm = reduce_sum(tf.square(vector), -2, keepdims=True)
    scalar_factor = vec_squared_norm / (1 + vec_squared_norm) / tf.sqrt(vec_squared_norm + epsilon)
    vec_squashed = scalar_factor * vector  # element-wise
    
    return(vec_squashed)

In [142]:
def save_images(imgs, size, save_path):
    
    batch_sz = Params.batch_sz
    
    '''
    Args:
        imgs: shaped [batch_sz, image_height, image_width]
        size: two integers  [image_height, image_width]
        save_path: path to save data
    '''
    
    imgs = (imgs + 1.) / 2    
    
    return(scipy.misc.imsave(save_path, mergeImgs(imgs, size)))

def mergeImgs(images, size):
    h, w = images.shape[1], images.shape[2]
    imgs = np.zeros((h * size[0], w * size[1], 3))
    for idx, image in enumerate(images):
        i = idx % size[1]
        j = idx // size[1]
        imgs[j * h:j * h + h, i * w:i * w + w, :] = image

    return imgs

In [143]:
def reduce_sum(input_tensor, axis=None, keepdims=False):
    
    batch_sz = Params.batch_sz
    
    try:
        return tf.reduce_sum(input_tensor, axis=axis, keepdims=keepdims)
    except:
        return tf.reduce_sum(input_tensor, axis=axis, keep_dims=keepdims)    # alt for version compatibility
    
def softmax(logits, axis=None):
    try:
        return tf.nn.softmax(logits, axis=axis)
    except:
        return tf.nn.softmax(logits, dim=axis)    # alt for version compatibility

## LipCapsule Model

In [144]:
class LipCapsule(object):
    
    height = Params.height
    width = Params.width
    num_label = Params.num_label
    channel=Params.channel

    def __init__(self, training=(mode == tf.estimator.ModeKeys.TRAIN), height=height, width=width, channels=channel, num_label=num_label):

        self.batch_sz = Params.batch_sz
        self.dataset = Params.dataset
        self.num_threads = Params.num_threads
        self.graph = tf.Graph()
        self.mode = tf.estimator.ModeKeys.TRAIN 
        self.step = tf.train.get_or_create_global_step()
        # features, labels, mode, params
        self.X, self.labels = get_batch_data(self.dataset, self.batch_sz, self.num_threads)
        self.dataset=Params.dataset
        # loss = tf.losses.softmax_cross_entropy(labels, logits)
        # Step increased per GLOBAL_BATCH_SIZE = 8 * BATCH_SIZE
        # Must adjust learning rate schedule accordingly
        lr = 0.0001 + tf.train.exponential_decay(0.01, step, 600//8, 1/math.e)
        # Wrap optimizer in a CrossShardOptimizer for multi-core training
        optimizer = tf.contrib.tpu.CrossShardOptimizer(tf.train.AdamOptimizer(lr))  
        
        # Running averages need updating after each batch.
        # train_op = tf.contrib.training.create_train_op(loss, optimizer)

        self.Y = tf.one_hot(self.labels, depth=self.num_label, axis=1, dtype=tf.float32)

        
        logits = tf.layers.Dense(10)(y)
        predictions = tf.nn.softmax(logits)
        classes = tf.math.argmax(predictions, axis=-1)

        if (mode == tf.estimator.ModeKeys.PREDICT):    
            self.X = tf.placeholder(tf.float32, shape=(batch_sz, self.height, 
                                                       self.width, 
                                                       self.channels))
            self.labels = tf.placeholder(tf.int32, shape=(batch_sz, ))
            self.Y = tf.reshape(self.labels, shape=(batch_sz, self.num_label, 1))
            self.build_arch()
            
        else:
            self.X, self.labels = get_batch_data(self.dataset, batch_sz, 
                                                 self.num_threads)
            self.Y = tf.one_hot(self.labels, depth=self.num_label, axis=1, 
                                dtype=tf.float32)

            self.build_arch()
            self.loss()
            self._summary()

            # t_vars = tf.trainable_variables()
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.train_op = self.optimizer.minimize(self.total_loss, global_step=self.global_step)



    tf.logging.info('Setting up the main structure')

    def build_arch(self):
        
        batch_sz = Params.batch_sz
        
        with tf.variable_scope('Conv1_layer'):
            # Conv1, return tensor with shape [batch_sz, 20, 20, 256]
            conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256,
                                             kernel_size=9, stride=1,
                                             padding='VALID')

        # Primary Capsules layer returns tensor shaped [batch_sz, 1152, 8, 1]
        with tf.variable_scope('PrimaryCaps_layer'):
            primaryCaps = CapsLayer(num_outputs=32, vec_len=8, 
                                    with_routing=False, layer_type='CONV')
            caps1 = primaryCaps(conv1, kernel_size=9, stride=2)

        # DigitCaps layer, return shape [batch_sz, 10, 16, 1]
        with tf.variable_scope('DigitCaps_layer'):
            digitCaps = CapsLayer(num_outputs=self.num_label, vec_len=16, 
                                  with_routing=True, layer_type='FC')
            self.caps2 = digitCaps(caps1)

        with tf.variable_scope('Masking'):
            # calc ||v_c|| and softmax(||v_c||)
            # [batch_sz, 10, 16, 1] => [batch_sz, 10, 1, 1]
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2),
                                               axis=2, keepdims=True) + epsilon)
            self.softmax_v = softmax(self.v_length, axis=1)
            # self.softmax_v.get_shape() == [batch_sz, self.num_label, 1, 1]

            # picks out index of max softmax val of 10 caps
            # [batch_sz, 10, 1, 1] => [batch_sz] (index)
            self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
            # self.argmax_idx.get_shape() == [batch_sz, 1, 1]
            self.argmax_idx = tf.reshape(self.argmax_idx, shape=(batch_sz, ))

            # Not Masking
            if not mask_with_y:
                # 4-dimensional indexing process
                masked_v = []
                for batch_sz in range(batch_sz):
                    v = self.caps2[batch_sz][self.argmax_idx[batch_sz], :]
                    masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))

                self.masked_v = tf.concat(masked_v, axis=0)
                assert self.masked_v.get_shape() == [batch_sz, 1, 16, 1]
                
            # Masking
            else:
                self.masked_v = tf.multiply(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, self.num_label, 1)))
                self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2, 
                                                   keepdims=True) + epsilon)

        # MIRACL-VC1 images reconstructed with three (FC) layers
        # [batch_sz, 1, 16, 1] => [batch_sz, 16] => [batch_sz, 512]
        with tf.variable_scope('Decoder'):
            vector_j = tf.reshape(self.masked_v, shape=(batch_sz, -1))
            fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
            fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024)
            self.decoded = tf.contrib.layers.fully_connected(
                fc2, num_outputs=self.height * self.width * self.channels, 
                activation_fn=tf.sigmoid)

    def loss(self):
        
        # batch_sz = Params.batch_sz
        #  Margin loss [batch_sz, 10, 1, 1]
        # max_l = max(0, m_plus-||v_c||)^2
        max_l = tf.square(tf.maximum(0., m_plus - self.v_length))
        # max_r = max(0, ||v_c||-m_minus)^2
        max_r = tf.square(tf.maximum(0., self.v_length - m_minus))
        assert max_l.get_shape() == [batch_sz, self.num_label, 1, 1]

        # Reshapes [batch_sz, 10, 1, 1] => [batch_sz, 10]
        max_l = tf.reshape(max_l, shape=(batch_sz, -1))
        max_r = tf.reshape(max_r, shape=(batch_sz, -1))

        # calc T_c: [batch_sz, 10]
        T_c = self.Y    # T_c = Y
        # element-wise multiply [batch_sz, 10]
        L_c = T_c * max_l + lambda_val * (1 - T_c) * max_r

        self.margin_loss = tf.reduce_mean(tf.reduce_sum(L_c, axis=1))

        # Reconstruction loss
        orgin = tf.reshape(self.X, shape=(batch_sz, -1))
        squared = tf.square(self.decoded - orgin)
        self.reconstruction_err = tf.reduce_mean(squared)

        # Mean squared error calculated with reduce_mean (reconstruction loss)
        self.total_loss = self.margin_loss + regularization_scale * self.reconstruction_err

    # Summary
    def _summary(self):
        
        batch_sz = Params.batch_sz
        
        train_summary = []
        train_summary.append(tf.summary.scalar('train/margin_loss', self.margin_loss))
        train_summary.append(tf.summary.scalar('train/reconstruction_loss', self.reconstruction_err))
        train_summary.append(tf.summary.scalar('train/total_loss', self.total_loss))
        recon_img = tf.reshape(self.decoded, shape=(batch_sz, self.height, self.width, self.channels))
        train_summary.append(tf.summary.image('reconstruction_img', recon_img))
        self.train_summary = tf.summary.merge(train_summary)

        correct_prediction = tf.equal(tf.to_int32(self.labels), self.argmax_idx)
        self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))
        
model_fn=LipCapsule()


AttributeError: type object 'Params' has no attribute 'num_label'

### Estimator model

In [145]:
# Transforms data to be sent to model input_fn. 
# Produces a Tensorflow graph to be prepended to model graph. 

def serving_input_fn():
    # placeholder for the data received by the API (already parsed, no JSON decoding necessary,
    # but the JSON must contain one or multiple 'image' key(s) with px x px greyscale images  as content.)
    
    inputs = {"serving_input": tf.placeholder(tf.float32, [None, px, px])}  
    # should match shape of JSON
    features = inputs['serving_input']  # no transformation needed
    
    return tf.estimator.export.TensorServingInputReceiver(features, inputs) # needed by model_fn
  

In [156]:
# TPU REFACTORING: model_fn must have a params argument. 
# TPUEstimator passes batch_size and use_tpu into it

def model_fn(features, labels, mode, params):
  
  training = (mode == tf.estimator.ModeKeys.TRAIN)

  x = features
  y = tf.reshape(x, [-1, px, px, 1])

  y = tf.layers.Conv2D(filters=6, kernel_size=3, padding='same', use_bias=False)(y) # no bias necessary before batch norm
  y = tf.layers.BatchNormalization(scale=False, center=True)(y, training=mode) # no batch norm scaling necessary before "relu"
  y = tf.nn.relu(y) # activation after batch norm

  y = tf.layers.Conv2D(filters=12, kernel_size=6, padding='same', use_bias=False, strides=2)(y)
  y = tf.layers.BatchNormalization(scale=False, center=True)(y, training=mode)
  y = tf.nn.relu(y)

  y = tf.layers.Conv2D(filters=24, kernel_size=6, padding='same', use_bias=False, strides=2)(y)
  y = tf.layers.BatchNormalization(scale=False, center=True)(y, training=mode)
  y = tf.nn.relu(y)

  y = tf.layers.Flatten()(y)
  y = tf.nn.relu(y)
  
  logits = tf.layers.Dense(10)(y)
  predictions = tf.nn.softmax(logits)
  classes = tf.math.argmax(predictions, axis=-1)

In [157]:
def save_to():
    res_path = os.path.join('results', results)
    batch_sz = Params.batch_sz        
    loss = res_path + '/loss.csv'
    train_acc = res_path + '/train_acc.csv'
    val_acc = res_path + '/val_acc.csv'
    fd_train_acc = open(train_acc, 'w')
    fd_train_acc.write('step,train_acc\n')
    fd_loss = open(loss, 'w')
    fd_loss.write('step,loss\n')
    fd_val_acc = open(val_acc, 'w')
    fd_val_acc.write('step,val_acc\n')
    return(fd_train_acc, fd_loss, fd_val_acc)

In [158]:
def train(model, estimator, num_labels):
    
    batch_sz=int(Params.batch_sz)
    val_sum_freq=int(Params.val_sum_freq)
    dataset=str(Params.dataset)
    
    trX, trY, num_tr_batch, valX, valY, num_val_batch = load_data(dataset, batch_sz)
    Y = valY[:num_val_batch * batch_sz].reshape((-1, 1))

    fd_train_acc, fd_loss, fd_val_acc = save_to()
    
    with supervisor.managed_session() as sess:
        for epoch in range(epochs):
            print("\n Training.  Epoch %d/%d:" % (epoch, epochs))
            if supervisor.should_stop():
                print('\n Supervisor stoped! \n ')
                break
            for step in tqdm(range(num_tr_batch), total=num_tr_batch, ncols=50, leave=False, unit='b'):
                start = step * batch_sz
                end = start + batch_sz
                global_step = epoch * num_tr_batch + step

                if global_step % Params.train_sum_freq == 0:
                    _, loss, train_acc, summary_str = sess.run([model.train_op, model.total_loss, 
                                                                model.accuracy, model.train_summary])
                    assert not np.isnan(loss), 'LOSS NAN ERROR'
                    
                    supervisor.summary_writer.add_summary(summary_str, global_step)

                    fd_loss.write(str(global_step) + ',' + str(loss) + "\n")
                    fd_loss.flush()
                    fd_train_acc.write(str(global_step) + ',' + str(train_acc / batch_sz) + "\n")
                    fd_train_acc.flush()
                else:
                    sess.run(model.train_op)

                if val_sum_freq != 0 and (global_step) % val_sum_freq == 0:
                    val_acc = 0
                    for i in range(num_val_batch):
                        start = i * batch_sz
                        end = start + batch_sz
                        acc = sess.run(model.accuracy, {model.X: valX[start:end], model.labels: valY[start:end]})
                        val_acc += acc
                    val_acc = val_acc / (batch_sz * num_val_batch)
                    fd_val_acc.write(str(global_step) + ',' + str(val_acc) + '\n')
                    fd_val_acc.flush()

            if (epoch + 1) % save_freq == 0:
                supervisor.saver.save(sess, log_path + '/model_epoch_%04d_step_%02d' % (epoch, global_step))

        fd_val_acc.close()
        fd_train_acc.close()
        fd_loss.close()

### Train and validate the model on TPU

In [159]:
def read_label(tf_bytestring):
    label = tf.decode_raw(tf_bytestring, tf.uint8)
    label = tf.reshape(label, [])
    return label
  
def read_image(tf_bytestring):
    image = tf.decode_raw(tf_bytestring, tf.uint8)
    image = tf.cast(image, tf.float32)/256.0
    image = tf.reshape(image, [px*px])
    return image
  
def load_dataset(image_file, label_file):
    imagedataset = tf.data.FixedLengthRecordDataset(image_file, px*px, header_bytes=16)
    imagedataset = imagedataset.map(read_image, num_parallel_calls=16)
    labelsdataset = tf.data.FixedLengthRecordDataset(label_file, 1, header_bytes=8)
    labelsdataset = labelsdataset.map(read_label, num_parallel_calls=16)
    dataset = tf.data.Dataset.zip((imagedataset, labelsdataset))
    return dataset 
  
def get_training_dataset(image_file, label_file, batch_size):
    dataset = load_dataset(image_file, label_file)
    dataset = dataset.cache()  # Cached in RAM
    dataset = dataset.shuffle(batch_val, reshuffle_each_iteration=True)
    dataset = dataset.repeat() # Mandatory for TPU  
    dataset = dataset.batch(batch_size, drop_remainder=True) # Important on TPU
    dataset = dataset.prefetch(-1)  # Prefetches next batch while training  (-1: prefetch buffer size)
    return dataset

# def get_validation_dataset(image_file, label_file):
def get_validation_dataset(image_file, label_file, batch_size):
    dataset = load_dataset(image_file, label_file)
    dataset = dataset.cache() # cached in RAM

    # dataset = dataset.batch(batch_test, drop_remainder=True) 
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.repeat() # Mandatory for TPU for now
    return dataset

# instantiate the datasets
training_dataset = get_training_dataset(training_images_file, training_labels_file, BATCH_SIZE)
validation_dataset = get_validation_dataset(validation_images_file, validation_labels_file, batch_val)

# For TPU, we will need a function that returns the dataset

# TPU REFACTORING: input_fn's must have a params argument though which TPUEstimator passes params['batch_size']
# training_input_fn = lambda: get_training_dataset(training_images_file, training_labels_file, BATCH_SIZE)
# validation_input_fn = lambda: get_validation_dataset(validation_images_file, validation_labels_file)
training_input_fn = lambda params: get_training_dataset(training_images_file, training_labels_file, params['batch_size'])
validation_input_fn = lambda params: get_validation_dataset(validation_images_file, validation_labels_file, params['batch_size'])

In [160]:
EPOCHS = 50

# uses all 8 cores
GLOBAL_BATCH_SIZE = BATCH_SIZE * 8

# TPUEstimator increments step once per GLOBAL_BATCH_SIZE
steps_per_epoch = batch_trn // GLOBAL_BATCH_SIZE  # batch_trn images in training dataset

MODEL_EXPORT_NAME = "model_export"  # name for exporting saved model

# Mltiple training steps before reporting
TPU_ITERATIONS_PER_LOOP = steps_per_epoch # report back after each epoch

tf_logging.set_verbosity(tf_logging.INFO)
now = datetime.datetime.now()
MODEL_DIR = BUCKET+"lipjobs_1/job" + "-{}-{:02d}-{:02d}-{:02d}:{:02d}:{:02d}".format(now.year, now.month, now.day, now.hour, now.minute, now.second)

# RunConfig changed
training_config = tf.contrib.tpu.RunConfig(
    cluster=tpu, model_dir=MODEL_DIR,
    tpu_config=tf.contrib.tpu.TPUConfig(TPU_ITERATIONS_PER_LOOP))
   
# call export_savedmodel after training
    
estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn, model_dir=MODEL_DIR,
    train_batch_size=GLOBAL_BATCH_SIZE,
    eval_batch_size=batch_val,  

    config=training_config, use_tpu=USE_TPU, export_to_tpu=False) # supported on ML Engine

estimator.train(training_input_fn, steps=steps_per_epoch*EPOCHS)
estimator.evaluate(input_fn=validation_input_fn, steps=1)
  
# call export_savedmodel after training
estimator.export_savedmodel(os.path.join(MODEL_DIR, MODEL_EXPORT_NAME), serving_input_fn)
tf_logging.set_verbosity(tf_logging.WARN)

INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket-jupyter/lipjobs_1/job-2019-08-10-00:52:34', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000025947F9D2B0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=16, num_shards=None, num_cores_per_replica=None, per_host_input_for_training=2, tpu_job_name

TypeError: `pred` must be a Tensor, or a Python bool, or 1 or 0. Found instead: train

In [None]:
def main(_):
    
    model = LipCapsule() 
    num_labels = int(Params.num_labels)

    tf.logging.info('  \n LOADING GRAPH ...  \n ')
    
    sv = tf.train.Supervisor(graph=model.graph, logdir=Params.log_path, save_model_secs=0)
    
    tf.logging.info(' \n GRAPH LOADED.  \n ') 
    
    if (mode != tf.estimator.ModeKeys.PREDICT): 
        tf.logging.info('\n TRAINING INITATED ... \n')
        train(model, sv, num_labels)
        tf.logging.info('\n TRAINING COMPLETE. \n')
        
    else:
        tf.logging.info('\n TEST INITATED ... \n')
        evaluation(model, sv, num_labels)
        tf.logging.info('\n TEST COMPLETE. \n')
        loss = tf.losses.softmax_cross_entropy(labels, logits)
    
        # Default Estimator summaries appear in Tensorboard.
        # metrics_fn needed for TPU
        # metrics = {'accuracy': tf.metrics.accuracy(classes, tf.math.argmax(labels, axis=-1))}
        metric_fn = lambda classes, labels: {'accuracy': tf.metrics.accuracy(classes, tf.math.argmax(labels, axis=-1))}
        tpu_metrics = (metric_fn, [classes, labels])  # metric_fn and list of arguments

            else:
                loss = train_op = metrics = tpu_metrics = None  
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    predictions={"predictions": predictions, "classes": classes},
                    loss=loss, train_op=train_op,
                    # metrics_fn must be passed to eval_metrics field 
                    eval_metrics = tpu_metrics
                )
        
if __name__ == "__main__":
    tf.app.run()

## Deploy the trained model to ML Engine

Push your trained model to production on ML Engine for a serverless, autoscaled, REST API experience.

You will need a GCS bucket and a GCP project for this.
Models deployed on ML Engine autoscale to zero if not used. There will be no ML Engine charges after you are done testing.
Google Cloud Storage incurs charges. Empty the bucket after deployment if you want to avoid these. Once the model is deployed, the bucket is not useful anymore.

### Configuration

In [162]:
PROJECT = "logical-cubist-249306" #@param {type:"string"}
NEW_MODEL = True #@param {type:"boolean"}
MODEL_NAME = "estimator_tpu_lipcapsule" #@param {type:"string"}
MODEL_VERSION = "v1" #@param {type:"string"}

assert PROJECT, 'GCP project'

#TPU REFACTORING: TPUEstimator does not create the 'export' subfolder
#export_path = os.path.join(MODEL_DIR, 'export', MODEL_EXPORT_NAME)
export_path = os.path.join(MODEL_DIR, MODEL_EXPORT_NAME)
last_export = sorted(tf.gfile.ListDirectory(export_path))[-1]
export_path = os.path.join(export_path, last_export)
print('Saved model directory found: ', export_path)

NotFoundError: Could not find directory

### Deploy the model
This uses the command-line interface. You can do the same thing through the ML Engine UI at https://console.cloud.google.com/mlengine/models


In [163]:
# Create the model
if NEW_MODEL:
  !gcloud ml-engine models create {MODEL_NAME} --project={PROJECT} --regions=us-central1

'gcloud' is not recognized as an internal or external command,
operable program or batch file.


In [164]:
# Create a version of this model (you can add --async at the end of the line to make this call non blocking)
# Additional config flags are available: https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions
# You can also deploy a model that is stored locally by providing a --staging-bucket=... parameter
!echo "Deployment takes a couple of minutes. You can watch your deployment here: https://console.cloud.google.com/mlengine/models/{MODEL_NAME}"
!gcloud ml-engine versions create {MODEL_VERSION} --model={MODEL_NAME} --origin={export_path} --project={PROJECT} --runtime-version=1.10

"Deployment takes a couple of minutes. You can watch your deployment here: https://console.cloud.google.com/mlengine/models/estimator_tpu_lipcapsule"


'gcloud' is not recognized as an internal or external command,
operable program or batch file.


### Test the deployed model
Your model is now available as a REST API. Let us try to call it. The cells below use the "gcloud ml-engine"
command line tool but any tool that can send a JSON payload to a REST endpoint will work.

In [0]:
training = (mode == tf.estimator.ModeKeys.PREDICT)

def evaluation(model, supervisor, labels):
    
    batch_sz = Params.batch_sz
    teX, teY, num_te_batch = load_data(dataset, batch_sz)
    
    fd_test_acc = save_test()
    
    with supervisor.managed_session() as sess:
        supervisor.saver.restore(sess, tf.train.latest_checkpoint(log_path))
        tf.logging.info(' \n MODEL RESTORED. \n')

        test_acc = 0
        
        for i in tqdm(range(num_te_batch), total=num_te_batch, ncols=70, leave=False, unit='b'):
            start = i * batch_sz
            end = start + batch_sz
            acc = sess.run(model.accuracy, {model.X: teX[start:end], model.labels: teY[start:end]})
            test_acc += acc
            
        test_acc = test_acc / (batch_sz * num_te_batch)
        print(str(test_acc))
        fd_test_acc.write(str(test_acc))
        
        fd_test_acc.close()
        
        print('  \n TEST ACCURACY SAVED TO ' + results + '/test_acc.csv  \n ')

In [0]:
def save_test():
    
    test_acc = results + '/test_acc.csv'
    
    # if os.path.exists(test_acc):
    #   os.remove(test_acc)
        
    fd_test_acc = open(test_acc, 'w')
    fd_test_acc.write('test_acc\n')
    
    return(fd_test_acc)

## License



---


author: Martin Gorner<br>
twitter: @martin_gorner


---


Copyright 2018 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


---


This is not an official Google product but sample code provided for an educational purpose
