# LipCapsule

        License: Apache-2.0 
                
        Code adapted for dataset, TPU and Jupyter by Michael E Cruz and Oliver A Ellison (2019)
        Email: mecruz@bu.edu, aurelius@bu.edu
        Repo: github.com/lipcapsule/TPU
        
        Adapted from code by Huadong Liao (2017)
        Repo: github.com/naturomics/CapsNet-Tensorflow
        
        Methods for data preprocessing of MIRACL-VC1 dataset based on,
        "Lip reading using CNN and LSTM" (2016) by Amit Garg, Jonathan Noyola, and Sameep Bagadia.
        Stanford Research Paper: http://cs231n.stanford.edu/reports/2016/pdfs/217_Report.pdf

#### IMPORTS -- *GPU or TPU*

In [11]:
import os
import sys
import scipy
import numpy as np
import tensorflow as tf
from tqdm import tqdm

#### UPGRADE GOOGLE API -- *TPU only*

In [None]:
# IPython shell escapes: upgrade the Google API client and OAuth2 client
# packages that the TPU-only imports (googleapiclient, oauth2client) rely on.
!pip3 install --upgrade google-api-python-client
!pip3 install --upgrade oauth2client
# Same installs through `pip` instead of `pip3` (left disabled).
# !pip install --upgrade google-api-python-client
# !pip install --upgrade oauth2client

# Marker set once the installs above have run.
# NOTE(review): nothing else in the visible cells reads this flag.
_GOOGLE_API_CLIENT_INSTALLED = True

#### IMPORTS -- *TPU only*

In [None]:
from tensorflow.contrib import tpu
from tensorflow.python.util import compat
from tensorflow.contrib.tpu.python.tpu import tpu_function
from tensorflow.contrib.cluster_resolver import TPUClusterResolver
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import get_accelerator_devices
from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top

#### ENVIRONMENT VARIABLES -- *TPU only*

In [None]:
# Shell variables consumed by the `gcloud ai-platform jobs submit training`
# cell ($JOB_NAME, $STAGING_BUCKET, $REGION, $DATA_DIR, $OUTPUT_PATH).
# NOTE(review): these are shell assignments, not Python -- run them in a
# terminal or a %%bash cell.
JOB_NAME=lips_4
STAGING_BUCKET=gs://bucket-4x4
REGION=us-central1
DATA_DIR=gs://bucket-4x4/data/miracl
OUTPUT_PATH=gs://bucket-4x4/logdir

#### CONNECT WITH TPU SERVER  -- *TPU only*

In [None]:
# Resolve the gRPC master address of the TPU.
# NOTE(review): the literal string 'TPU_NAME' is passed as the TPU name here,
# not the value of the TPU_NAME environment variable -- confirm intent.
tpu_cluster = TPUClusterResolver(
    tpu=['TPU_NAME']).get_master()

# NOTE(review): FLAGS is only defined in a later cell, and that class has no
# `tpu`, `tpu_zone`, `gcp_project`, `iterations_per_loop` or `num_cores`
# attributes; `tpu_config` is not imported in any visible cell either.
# This cell cannot run as written -- confirm whether it is still needed.
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
    FLAGS.tpu,
    zone=FLAGS.tpu_zone,
    project=FLAGS.gcp_project)

# Estimator run configuration: checkpoints every max(600, iterations_per_loop)
# steps, with one input pipeline per host (PER_HOST_V2).
config = tpu_config.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=FLAGS.model_dir,
    save_checkpoints_steps=max(600, FLAGS.iterations_per_loop),
    tpu_config=tpu_config.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_cores,
        per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V2))

# Open a session against the resolved master and initialise the TPU system.
with tf.Session(tpu_cluster) as sess:
    sess.run(tpu.initialize_system())

#### SUBMIT TRAINING JOB -- *TPU only*

In [None]:
# Submit the `lipcaps` module as an AI Platform training job on a BASIC_TPU
# scale tier (runtime 1.13). Arguments after the bare `--` are forwarded to
# the training module itself.
# NOTE(review): `--package-path STAGING_BUCKET/` has no `$`, so the literal
# text "STAGING_BUCKET/" is passed rather than the variable -- confirm the
# intended package path.
gcloud ai-platform jobs submit training $JOB_NAME \
        --staging-bucket $STAGING_BUCKET \
        --runtime-version 1.13 \
        --scale-tier BASIC_TPU \
        --module-name lipcaps \
        --package-path STAGING_BUCKET/ \
        --region $REGION \
        -- \
        --data_dir=$DATA_DIR \
        --model_dir=$OUTPUT_PATH

#### FLAGS AND RUNCONFIG -- *TPU only*

In [None]:
class FLAGS(object):
    """Static settings read when the TPU run configuration is built."""

    # Connection / storage settings.
    use_tpu = True
    tpu_name = 'TPU_NAME'
    model_dir = 'bucket-4x4'

    # iterations: steps before returning control.
    # num_shards: TPU has 8 shards.
    iterations, num_shards = 100, 8

# `subprocess` is not imported by any other cell, so import it where it is used.
import subprocess

if FLAGS.use_tpu:
    # check_output returns raw bytes terminated by a newline; decode and strip
    # so the resolver receives clean project / zone strings.
    my_project_name = subprocess.check_output(
        ['gcloud', 'config', 'get-value', 'project']).decode('utf-8').strip()
    my_zone = subprocess.check_output(
        ['gcloud', 'config', 'get-value', 'compute/zone']).decode('utf-8').strip()
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        tpu=[FLAGS.tpu_name],
        zone=my_zone,
        project=my_project_name)
    master = tpu_cluster_resolver.get_master()
else:
    # An empty master string makes the estimator use the local runtime.
    master = ''

# Run configuration shared by training and evaluation: soft device placement
# with placement logging, FLAGS.iterations steps per loop on FLAGS.num_shards
# shards.
my_tpu_run_config = tf.estimator.tpu.RunConfig(
    master=master,
    evaluation_master=master,
    model_dir=FLAGS.model_dir,
    session_config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=True),
    tpu_config=tf.estimator.tpu.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
)

#### CHECK CONNECTION -- *TPU only*

In [None]:
# `pprint` is not imported by any other cell, so import it where it is used.
import pprint

# Report whether a TPU is reachable and, if so, list the devices it exposes.
if 'TPU_NAME' not in os.environ:
    print('ERROR: NOT CONNECTED TO TPU!')
else:
    tpu_address = 'grpc://' + os.environ['TPU_NAME']
    print('TPU address is', tpu_address)

    # The session is only needed long enough to enumerate the devices.
    with tf.Session(tpu_address) as session:
        devices = session.list_devices()

    print('TPU devices:')
    pprint.pprint(devices)

#### ESTIMATOR -- *GPU only*

In [None]:
# Build a plain (non-TPU) Estimator.
# NOTE(review): `model_fn` receives a LipCapsule *instance* here, and
# LipCapsule defines no `__call__`; tf.estimator expects a callable model
# function -- confirm how this is meant to be wired up.
my_estimator = tf.estimator.Estimator(model_fn=LipCapsule())

#### TPU ESTIMATOR -- *TPU only*

In [None]:
# Build a TPU Estimator.
# NOTE(review): `my_model_fn` is not defined in any visible cell, and a fresh
# default RunConfig is created here instead of reusing `my_tpu_run_config`
# built in the FLAGS cell -- confirm which configuration is intended.
my_tpu_estimator = tf.estimator.tpu.TPUEstimator(
    model_fn=my_model_fn,
    config=tf.estimator.tpu.RunConfig(),
    use_tpu=True)

#### SET GPU -- *GPU only*

In [469]:
# Expose only GPU 0 to CUDA for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

#### IGNORE MEMORY ERRORS -- *GPU or TPU*

In [470]:
# Raise TensorFlow's C++ minimum log level to 3.
# NOTE(review): this hides the native log messages (including memory
# warnings); it does not change how memory errors are handled.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

#### MODEL PARAMETERS -- *GPU or TPU*

In [7]:
class Params(object):
    """Static configuration shared by the model, the data loader and the training loop."""

    batch_sz = 4      # max value
    num_threads = 2   # system limit
    pixels = 52       # image h & w

    # Dataset
    labels = 10
    data_qt = 9000
    # Floor division keeps the split sizes integral: they are used as slice
    # bounds, reshape dimensions and range() arguments, none of which accept
    # the floats that `/` produces on Python 3.
    val_qt = data_qt // 15
    test_qt = data_qt // 15
    train_qt = data_qt - test_qt - val_qt

    train_sum_freq = 100   # steps between training summaries
    val_sum_freq = 500     # steps between validation passes
    save_freq = 3          # epochs between checkpoints

    # Folders
    logdir = 'logdir'
    dataset = 'miracl'
    results = 'results'
    

#### MARGIN LOSS PARAMETERS -- *GPU or TPU*

In [481]:
# Margin-loss constants read by LipCapsule.loss():
#   L_c = T_c * max(0, m_plus - ||v_c||)^2
#         + lambda_val * (1 - T_c) * max(0, ||v_c|| - m_minus)^2
m_plus = 0.9        # target lower bound on the length of the correct capsule
m_minus = 0.1       # target upper bound on the length of the other capsules
lambda_val = 0.5    # down-weights the loss term of the absent classes

#### TRAINING PARAMETERS -- *GPU or TPU*

In [482]:
epochs = 100                # number of passes of the training loop in train()
epsilon = 1e-9              # added inside the sqrt() calls for numerical stability
iter_routing = 3            # number of routing iterations in routing()
mask_with_y = True          # mask DigitCaps output with the true labels (else with the prediction)
stddev = 0.01               # std-dev of the normal initializer of the routing weights
regularization_scale = 0    # weight of the reconstruction error in the total loss (0 disables it)

## LipCapsule Model

#### IMPLEMENT MODEL -- *GPU or TPU*

In [472]:
class LipCapsule(object):
    """Capsule network built inside its own ``tf.Graph``.

    Architecture: Conv1 -> PrimaryCaps -> DigitCaps -> masking -> three fully
    connected decoder layers.  In training mode the queue-based input
    pipeline, the losses, the summaries and the Adam train op are created as
    well; otherwise ``X`` and ``labels`` are placeholders and only the
    forward pass and the accuracy op are built.
    """

    def __init__(self, is_training=True, height=Params.pixels, width=Params.pixels, channels=1, num_label=10):
        """
        Args:
            is_training: build the training graph when True, a
                placeholder-fed inference graph otherwise
            height: input height integer
            width: input width integer
            channels: input channels integer
            num_label: number categories integer
        """
        batch_sz = Params.batch_sz

        self.height = height
        self.width = width
        self.channels = channels
        self.num_label = num_label

        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                # The dataset name and thread count live on Params; there are
                # no module-level `dataset` / `num_threads` names.
                self.X, self.labels = get_batch_data(Params.dataset, batch_sz, Params.num_threads)
                self.Y = tf.one_hot(self.labels, depth=self.num_label, axis=1, dtype=tf.float32)

                self.build_arch()
                self.loss()
                self._summary()

                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(0.0005)  # original was 0.001
                self.train_op = self.optimizer.minimize(self.total_loss, global_step=self.global_step)
            else:
                self.X = tf.placeholder(tf.float32, shape=(batch_sz, self.height,
                                                           self.width, self.channels))
                self.labels = tf.placeholder(tf.int32, shape=(batch_sz, ))
                # Labels hold one class index per sample, so they must be
                # one-hot encoded; a (batch,) tensor cannot be reshaped to
                # (batch, num_label, 1).
                self.Y = tf.one_hot(self.labels, depth=self.num_label, axis=1, dtype=tf.float32)
                self.build_arch()
                # Evaluation code reads `accuracy`, so define it here too.
                self._accuracy()

        tf.logging.info('Setting up the main structure')

    def build_arch(self):
        """Create the forward pass: convolution, capsule layers, masking and decoder."""
        batch_sz = Params.batch_sz

        with tf.variable_scope('Conv1_layer'):
            # 9x9 VALID convolution, stride 1, 256 feature maps.
            # Output is [batch_size, H-8, W-8, 256]: 20x20 for 28x28 input,
            # 44x44 for the default 52x52 input.
            conv1 = tf.contrib.layers.conv2d(self.X, num_outputs=256,
                                             kernel_size=9, stride=1,
                                             padding='VALID')

        # Primary Capsules layer returns [batch_size, num_caps, 8, 1]
        # (num_caps is 1152 for 28x28 input, 18*18*32 = 10368 for 52x52).
        with tf.variable_scope('PrimaryCaps_layer'):
            primaryCaps = CapsLayer(num_outputs=32, vec_len=8, with_routing=False,
                                    layer_type='CONV')
            caps1 = primaryCaps(conv1, kernel_size=9, stride=2)

        # DigitCaps layer, return shape [batch_size, num_label, 16, 1]
        with tf.variable_scope('DigitCaps_layer'):
            digitCaps = CapsLayer(num_outputs=self.num_label, vec_len=16,
                                  with_routing=True, layer_type='FC')
            self.caps2 = digitCaps(caps1)

        with tf.variable_scope('Masking'):
            # calc ||v_c|| and softmax(||v_c||)
            # [batch_size, num_label, 16, 1] => [batch_size, num_label, 1, 1]
            self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2),
                                               axis=2, keepdims=True) + epsilon)
            self.softmax_v = softmax(self.v_length, axis=1)

            # Index of the longest capsule per sample:
            # [batch_size, num_label, 1, 1] => [batch_size, 1, 1] => [batch_size]
            self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
            self.argmax_idx = tf.reshape(self.argmax_idx, shape=(batch_sz, ))

            if not mask_with_y:
                # Mask with the prediction: keep only the capsule selected by
                # argmax for each sample.
                masked_v = []
                for sample in range(batch_sz):
                    v = self.caps2[sample][self.argmax_idx[sample], :]
                    masked_v.append(tf.reshape(v, shape=(1, 1, 16, 1)))

                self.masked_v = tf.concat(masked_v, axis=0)
                assert self.masked_v.get_shape() == [batch_sz, 1, 16, 1]
            else:
                # Mask with the true label: zero every capsule except the one
                # of the labelled class.
                self.masked_v = tf.multiply(tf.squeeze(self.caps2), tf.reshape(self.Y, (-1, self.num_label, 1)))
                # Same expression as the v_length computed above.
                self.v_length = tf.sqrt(reduce_sum(tf.square(self.caps2), axis=2,
                                                   keepdims=True) + epsilon)

        # Input frames reconstructed with three fully connected (FC) layers:
        # flattened masked capsules => 512 => 1024 => height * width * channels
        with tf.variable_scope('Decoder'):
            vector_j = tf.reshape(self.masked_v, shape=(batch_sz, -1))
            fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=512)
            fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=1024)
            self.decoded = tf.contrib.layers.fully_connected(
                fc2, num_outputs=self.height * self.width * self.channels, activation_fn=tf.sigmoid)

    def loss(self):
        """Define margin loss, reconstruction error and their weighted sum."""
        batch_sz = Params.batch_sz

        # Margin loss terms, each [batch_size, num_label, 1, 1]
        # max_l = max(0, m_plus-||v_c||)^2
        max_l = tf.square(tf.maximum(0., m_plus - self.v_length))
        # max_r = max(0, ||v_c||-m_minus)^2
        max_r = tf.square(tf.maximum(0., self.v_length - m_minus))
        assert max_l.get_shape() == [batch_sz, self.num_label, 1, 1]

        # Reshapes [batch_size, num_label, 1, 1] => [batch_size, num_label]
        max_l = tf.reshape(max_l, shape=(batch_sz, -1))
        max_r = tf.reshape(max_r, shape=(batch_sz, -1))

        # T_c is the one-hot label matrix: [batch_size, num_label]
        T_c = self.Y
        # element-wise multiply [batch_size, num_label]
        L_c = T_c * max_l + lambda_val * (1 - T_c) * max_r

        self.margin_loss = tf.reduce_mean(tf.reduce_sum(L_c, axis=1))

        # Reconstruction loss: mean squared error between decoder output and
        # the flattened input.
        orgin = tf.reshape(self.X, shape=(batch_sz, -1))
        squared = tf.square(self.decoded - orgin)
        self.reconstruction_err = tf.reduce_mean(squared)

        # Total loss: margin loss plus the scaled reconstruction error.
        self.total_loss = self.margin_loss + regularization_scale * self.reconstruction_err

    def _summary(self):
        """Create the merged training summary and the accuracy op."""
        batch_sz = Params.batch_sz

        train_summary = []
        train_summary.append(tf.summary.scalar('train/margin_loss', self.margin_loss))
        train_summary.append(tf.summary.scalar('train/reconstruction_loss', self.reconstruction_err))
        train_summary.append(tf.summary.scalar('train/total_loss', self.total_loss))
        recon_img = tf.reshape(self.decoded, shape=(batch_sz, self.height, self.width, self.channels))
        train_summary.append(tf.summary.image('reconstruction_img', recon_img))
        self.train_summary = tf.summary.merge(train_summary)

        self._accuracy()

    def _accuracy(self):
        """Define `accuracy` as the number of correct predictions in the batch."""
        # This is a count, not a ratio; callers divide by the sample count.
        correct_prediction = tf.equal(tf.to_int32(self.labels), self.argmax_idx)
        self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))

### CapsLayer

#### LAYER CAPSULES -- *GPU or TPU*

In [473]:
class CapsLayer(object):
    ''' Capsule layer.
    Args:
        input: A 4-D tensor.
        num_outputs: the number of capsule in this layer.
        vec_len: integer, the length of the output vector of a capsule.
        layer_type: string, one of 'FC' or "CONV", the type of this layer,
            fully connected or convolution, for the future expansion capability
        with_routing: boolean, this capsule is routing with the lower-level layer capsule.

    Returns:
        A 4-D tensor.

    Raises:
        NotImplementedError: when called with a layer_type / with_routing
            combination other than ('CONV', False) or ('FC', True).
    '''

    batch_size = Params.batch_sz

    def __init__(self, num_outputs, vec_len, with_routing=True, layer_type='FC'):
        self.num_outputs = num_outputs
        self.vec_len = vec_len
        self.with_routing = with_routing
        self.layer_type = layer_type

    def __call__(self, input, kernel_size=None, stride=None):
        '''Build the layer's ops on `input` and return the capsule tensor.

        `kernel_size` and `stride` are only used when layer_type is 'CONV'.
        '''
        batch_sz = Params.batch_sz

        if self.layer_type == 'CONV':
            self.kernel_size = kernel_size
            self.stride = stride

            if not self.with_routing:
                # PrimaryCaps: a convolution whose output channels are
                # regrouped into capsules of length vec_len.
                #
                # NOTE: the paper does not say whether the PrimaryCaps
                # convolution applies ReLU before squashing; experiments
                # showed higher test accuracy with ReLU, so it is used here.
                capsules = tf.contrib.layers.conv2d(input, self.num_outputs * self.vec_len,
                                                    self.kernel_size, self.stride, padding="VALID",
                                                    activation_fn=tf.nn.relu)
                # => [batch_size, num_caps, vec_len, 1]
                capsules = tf.reshape(capsules, (batch_sz, -1, self.vec_len, 1))
                return squash(capsules)

        if self.layer_type == 'FC':
            if self.with_routing:
                # DigitCaps: fully connected capsule layer with routing.
                # Reshape to [batch_size, num_caps_in, 1, vec_len_in, 1]
                self.input = tf.reshape(input, shape=(batch_sz, -1, 1, input.shape[-2].value, 1))

                with tf.variable_scope('routing'):
                    # b_IJ: [batch_size, num_caps_l, num_caps_l_plus_1, 1, 1]
                    b_IJ = tf.constant(np.zeros([batch_sz, input.shape[1].value, self.num_outputs, 1, 1], dtype=np.float32))
                    capsules = routing(self.input, b_IJ, num_outputs=self.num_outputs, num_dims=self.vec_len)
                    # Drop the singleton axis left by the sum over input capsules.
                    capsules = tf.squeeze(capsules, axis=1)

                return capsules

        # Fail loudly here: otherwise these configurations return None or
        # hit an unbound local and break far from the cause.
        raise NotImplementedError(
            'CapsLayer does not support layer_type=%r with with_routing=%r'
            % (self.layer_type, self.with_routing))

   #### DYNAMIC ROUTING BETWEEN CAPSULES -- *GPU or TPU*

In [474]:
def routing(input, b_IJ, num_outputs=Params.labels, num_dims=16):
    
    ''' Routing-by-agreement between a capsule layer l and the layer l+1 above it.

        input: Tensor shaped [batch_size, num_caps_l, 1, length(u_i), 1]
               (e.g. num_caps_l=1152 and length(u_i)=8 in the MNIST-sized network).
        b_IJ: routing logits, shaped [batch_size, num_caps_l, num_outputs, 1, 1].
        num_caps_l: number of  layer l capsules.
        num_outputs: number of output capsules.
        num_dims: output capsule dimensions.
        v_j: vector of capsule j in layer l+1
        u_i: vector of capsule i in layer l   
        W: [1, num_caps_i, num_caps_j * len_v_j, len_u_j, 1]

        Returns v_J shaped [batch_size, 1, num_outputs, num_dims, 1]; the
        caller squeezes axis 1.
     '''
    # Not used in this function.
    batch_sz = Params.batch_sz
    
    input_shape = get_shape(input)
    
    # Transformation weights, shared across the batch (leading dimension 1).
    W = tf.get_variable('Weight', shape=[1, input_shape[1], num_dims * num_outputs] + input_shape[-2:],
                        dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=stddev))
    
    biases = tf.get_variable('bias', shape=(1, 1, num_outputs, num_dims, 1))

    # u_hat is computed with an element-wise multiply followed by reduce_sum
    # instead of tf.matmul:
    # matmul [a, b] x [b, c] == element-wise multiply [a*c, b] * [a*c, b],
    # reduce_sum at axis=1, then reshape to [a, c].
    
    # Repeat every input capsule once per output component.
    input = tf.tile(input, [1, 1, num_dims * num_outputs, 1, 1])
    
    # input.get_shape() == [batch_size, num_caps_l, num_dims * num_outputs, length(u_i), 1]
    # u_hat.get_shape() == [batch_size, num_caps_l, num_outputs, num_dims, 1]
    
    u_hat = reduce_sum(W * input, axis=3, keepdims=True)
    u_hat = tf.reshape(u_hat, shape=[-1, input_shape[1], num_outputs, num_dims, 1])
    

    # u_hat_stopped equals u_hat in the forward pass, but no gradient flows
    # back through it to u_hat.
    
    u_hat_stopped = tf.stop_gradient(u_hat, name='stop_gradient')

    for r_iter in range(iter_routing):
        with tf.variable_scope('iter_' + str(r_iter)):

            # Coupling coefficients: softmax of the logits over the output
            # capsules => [batch_size, num_caps_l, num_outputs, 1, 1]
            c_IJ = softmax(b_IJ, axis=2)

            # Only the final iteration uses u_hat, so gradients reach W
            # through this iteration alone.
            if r_iter == iter_routing - 1:

                # weight u_hat with c_IJ, element-wise
                # => [batch_size, num_caps_l, num_outputs, num_dims, 1]
                s_J = tf.multiply(c_IJ, u_hat)
                
                # sum over the input capsules (axis 1)
                # s_J.get_shape() == [batch_size, 1, num_outputs, num_dims, 1]
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases

                # v_J.get_shape() == [batch_size, 1, num_outputs, num_dims, 1]
                v_J = squash(s_J)
                
            # Earlier iterations only update the routing logits; no
            # backpropagation through them.
            elif r_iter < iter_routing - 1:  
                s_J = tf.multiply(c_IJ, u_hat_stopped)
                s_J = reduce_sum(s_J, axis=1, keepdims=True) + biases
                v_J = squash(s_J)

                # Tile v_J from [batch_size, 1, num_outputs, num_dims, 1] to
                # [batch_size, num_caps_l, num_outputs, num_dims, 1], then take
                # the dot product with u_hat over the num_dims axis (axis 3)
                # to measure the agreement between prediction and output.
                v_J_tiled = tf.tile(v_J, [1, input_shape[1], 1, 1, 1])
                u_produce_v = reduce_sum(u_hat_stopped * v_J_tiled, axis=3, keepdims=True)
                
                # u_produce_v.get_shape() == [batch_size, num_caps_l, num_outputs, 1, 1]
                # The agreement is added per sample (no reduction over the batch axis).
                b_IJ += u_produce_v

    return(v_J)    # Tensor shaped [batch_size, 1, num_outputs, num_dims, 1]

#### SQUASH TENSOR -- *GPU or TPU*

In [475]:
def squash(vector):
    '''Squash capsule vectors so their length lies in [0, 1) while keeping direction.

    vector: Tensor shaped [batch_size, 1, num_caps, vec_len, 1] or [batch_size, num_caps, vec_len, 1].

    Returns squashed tensor with 'vec_len' dimensions in same shape as vector.
    '''
    # Squared length along the vec_len axis (second to last).
    vec_squared_norm = reduce_sum(tf.square(vector), -2, keepdims=True)
    # |v|^2 / (1 + |v|^2) / |v|; epsilon keeps the sqrt (and its gradient)
    # finite for zero-length vectors.
    scalar_factor = vec_squared_norm / (1 + vec_squared_norm) / tf.sqrt(vec_squared_norm + epsilon)
    return scalar_factor * vector  # element-wise

#### LOAD DATASET -- *GPU or TPU*

In [476]:
def load_data(batch_size, is_training=True):
    """Load the image / label files from ``data/miracl`` into NumPy arrays.

    Args:
        batch_size: number of samples per batch, used to compute batch counts.
        is_training: load the train + validation split when True, the test
            split otherwise.

    Returns:
        When is_training: (trX, trY, num_tr_batch, valX, valY, num_val_batch).
        Otherwise: (teX, teY, num_te_batch).
        Images are scaled to [0, 1] and shaped [n, pixels, pixels, 1]; labels
        are int32 vectors.
    """
    path = os.path.join('data', 'miracl')    # path to dataset
    pixels = Params.pixels
    grey = 1    # single channel
    # int() keeps slicing / reshape working even if the split sizes were
    # derived with true division.
    train_qt = int(Params.train_qt)
    val_qt = int(Params.val_qt)
    test_qt = int(Params.test_qt)

    def read_payload(filename, header_bytes):
        # Passing a path lets np.fromfile open and close the file itself, so
        # no handle is leaked.  The leading header bytes are skipped.
        raw = np.fromfile(os.path.join(path, filename), dtype=np.uint8)
        return raw[header_bytes:]

    if is_training:
        load_qt = train_qt + val_qt

        trainX = read_payload('train-images-idx3-ubyte', 16)
        trainX = trainX.reshape((load_qt, pixels, pixels, grey)).astype(np.float32)
        trainY = read_payload('train-labels-idx1-ubyte', 8)
        trainY = trainY.reshape((load_qt)).astype(np.int32)

        trX = trainX[:train_qt] / 255.
        trY = trainY[:train_qt]

        # Everything after the training samples is the validation split
        # (a slice, not the single sample at index train_qt).
        valX = trainX[train_qt:] / 255.
        valY = trainY[train_qt:]

        num_tr_batch = train_qt // batch_size
        num_val_batch = val_qt // batch_size

        return trX, trY, num_tr_batch, valX, valY, num_val_batch

    # np.float64 is what the removed `np.float` alias referred to.
    teX = read_payload('t10k-images-idx3-ubyte', 16)
    teX = teX.reshape((test_qt, pixels, pixels, grey)).astype(np.float64)
    teY = read_payload('t10k-labels-idx1-ubyte', 8)
    teY = teY.reshape((test_qt)).astype(np.int32)

    num_te_batch = test_qt // batch_size
    return teX / 255., teY, num_te_batch

#### SHUFFLE DATA -- *GPU or TPU*

In [477]:
def get_batch_data(dataset, batch_size, num_threads):
    """Return shuffled (X, Y) training batch tensors fed by TF input queues.

    Args:
        dataset: dataset name; not used by this function (load_data reads
            a fixed path).
        batch_size: number of samples per dequeued batch.
        num_threads: number of threads filling the shuffle queue.

    Returns:
        (X, Y): image batch and label batch tensors.
    """
    # Only the training arrays are needed for the input queue.
    trX, trY = load_data(batch_size, is_training=True)[:2]
    data_queues = tf.train.slice_input_producer([trX, trY])

    X, Y = tf.train.shuffle_batch(data_queues, num_threads=num_threads, batch_size=batch_size,
                                  capacity=batch_size * 64, min_after_dequeue=batch_size * 32,
                                  allow_smaller_final_batch=False)

    return X, Y

####  MERGE INVERSE TRANSFORM -- *GPU or TPU*

In [478]:
def save_images(imgs, size, path):
    '''Tile a batch of images into one grid image and write it to `path`.

    Args:
        imgs: shaped [batch_size, image_height, image_width]
        size: two integers giving the grid layout [rows, columns]
        path: path to save data
    '''
    # Rescale from [-1, 1] to [0, 1] before merging.
    imgs = (imgs + 1.) / 2

    # NOTE(review): scipy.misc.imsave is no longer available in recent SciPy
    # releases; confirm the pinned SciPy version or plan a replacement writer.
    return scipy.misc.imsave(path, mergeImgs(imgs, size))

def mergeImgs(images, size):
    """Tile a batch of images into a single 3-channel grid image.

    Args:
        images: array shaped [n, h, w], [n, h, w, 1] or [n, h, w, 3].
        size: (rows, columns) of the grid; images fill it row by row.

    Returns:
        Float array shaped [h * rows, w * columns, 3].
    """
    h, w = images.shape[1], images.shape[2]
    canvas = np.zeros((h * size[0], w * size[1], 3))
    for idx, image in enumerate(images):
        if image.ndim == 2:
            # Give channel-less grayscale images a trailing axis so they
            # broadcast across the three output channels.
            image = image[:, :, np.newaxis]
        row, col = divmod(idx, size[1])
        canvas[row * h:(row + 1) * h, col * w:(col + 1) * w, :] = image

    return canvas

#### SOFTMAX -- *GPU or TPU*

In [479]:
def reduce_sum(input_tensor, axis=None, keepdims=False):
    """Version-tolerant wrapper around tf.reduce_sum.

    Tries the `keepdims` keyword first and falls back to the older
    `keep_dims` spelling when the installed TensorFlow rejects it.
    """
    try:
        return tf.reduce_sum(input_tensor, axis=axis, keepdims=keepdims)
    except TypeError:
        # Only an unknown-keyword TypeError triggers the fallback; any other
        # failure propagates instead of being retried silently.
        return tf.reduce_sum(input_tensor, axis=axis, keep_dims=keepdims)    # alt for version compatibility
    
def softmax(logits, axis=None):
    """Version-tolerant wrapper around tf.nn.softmax.

    Tries the `axis` keyword first and falls back to the older `dim`
    spelling when the installed TensorFlow rejects it.
    """
    try:
        return tf.nn.softmax(logits, axis=axis)
    except TypeError:
        # Only an unknown-keyword TypeError triggers the fallback; any other
        # failure propagates instead of being retried silently.
        return tf.nn.softmax(logits, dim=axis)    # alt for version compatibility

##### OPTIMIZER -- TPU

In [382]:
# Adam optimizer (learning rate 0.0005) wrapped in CrossShardOptimizer for TPU
# training.
# NOTE(review): `updates` is not referenced by any other visible cell, and
# LipCapsule builds its own AdamOptimizer -- confirm where this is meant to be used.
updates = tf.contrib.tpu.CrossShardOptimizer(
            tf.train.AdamOptimizer(0.0005))

#### LIST DIMENSIONS -- *GPU or TPU*

In [480]:
def get_shape(inputs, name=None):
    """Return the shape of `inputs` as a list, preferring static dimensions.

    Each entry is the statically known dimension when there is one, and the
    corresponding element of the dynamic `tf.shape(inputs)` tensor otherwise.

    Args:
        inputs: tensor whose shape is requested.
        name: optional name scope for the shape ops (defaults to "shape").
    """
    name = "shape" if name is None else name
    with tf.name_scope(name):
        static_shape = inputs.get_shape().as_list()
        dynamic_shape = tf.shape(inputs)
        return [dim if dim is not None else dynamic_shape[i]
                for i, dim in enumerate(static_shape)]

#### SAVE ACCURACY PROGRESS (VALIDATION, TRAINING AND LOSS) -- *GPU or TPU*

In [483]:
def save_to():
    """Open the three CSV progress logs in the results folder and write their headers.

    Returns:
        (fd_train_acc, fd_loss, fd_val_acc): open, writable file objects for
        train_acc.csv, loss.csv and val_acc.csv.  The caller must close them.
    """
    # The results folder is configured on Params; there is no module-level
    # `results` name.
    results_dir = Params.results
    # Create the folder on first use so open() does not fail.
    os.makedirs(results_dir, exist_ok=True)

    def open_log(filename, header):
        # Truncate/create the file and write the CSV header line.
        fd = open(os.path.join(results_dir, filename), 'w')
        fd.write(header)
        return fd

    fd_train_acc = open_log('train_acc.csv', 'step,train_acc\n')
    fd_loss = open_log('loss.csv', 'step,loss\n')
    fd_val_acc = open_log('val_acc.csv', 'step,val_acc\n')
    return fd_train_acc, fd_loss, fd_val_acc

#### TRAIN METHOD -- *GPU or TPU*

In [484]:
def train(model, supervisor, num_label):
    """Run the training loop, logging loss / accuracy to CSV and checkpointing.

    Args:
        model: LipCapsule instance providing train_op, total_loss, accuracy,
            train_summary, X and labels.
        supervisor: tf.train.Supervisor managing the session, summary writer
            and saver.
        num_label: not used by this function.
    """
    batch_size = Params.batch_sz

    # Training batches come from the model's own input queue; only the batch
    # counts and the validation arrays are needed here.
    _, _, num_tr_batch, valX, valY, num_val_batch = load_data(batch_size, is_training=True)
    num_tr_batch = int(num_tr_batch)
    num_val_batch = int(num_val_batch)

    fd_train_acc, fd_loss, fd_val_acc = save_to()

    # try/finally closes the log files even when training aborts.
    try:
        with supervisor.managed_session() as sess:
            for epoch in range(epochs):
                print("\n Training.  Epoch %d/%d:" % (epoch, epochs))
                if supervisor.should_stop():
                    print('\n Supervisor stoped! \n ')
                    break
                for step in tqdm(range(num_tr_batch), total=num_tr_batch, ncols=50, leave=False, unit='b'):
                    global_step = epoch * num_tr_batch + step

                    if global_step % Params.train_sum_freq == 0:
                        # Summary step: train and also fetch loss, accuracy
                        # and the merged summaries.
                        _, loss, train_acc, summary_str = sess.run([model.train_op, model.total_loss,
                                                                    model.accuracy, model.train_summary])
                        assert not np.isnan(loss), 'LOSS NAN ERROR'

                        supervisor.summary_writer.add_summary(summary_str, global_step)

                        fd_loss.write(str(global_step) + ',' + str(loss) + "\n")
                        fd_loss.flush()
                        # model.accuracy is a count of correct predictions.
                        fd_train_acc.write(str(global_step) + ',' + str(train_acc / batch_size) + "\n")
                        fd_train_acc.flush()
                    else:
                        sess.run(model.train_op)

                    if Params.val_sum_freq != 0 and global_step % Params.val_sum_freq == 0:
                        # Validation pass: feed the held-out arrays in place
                        # of the queue outputs.
                        val_acc = 0
                        for i in range(num_val_batch):
                            start = i * batch_size
                            end = start + batch_size
                            acc = sess.run(model.accuracy, {model.X: valX[start:end], model.labels: valY[start:end]})
                            val_acc += acc
                        val_acc = val_acc / (batch_size * num_val_batch)
                        fd_val_acc.write(str(global_step) + ',' + str(val_acc) + '\n')
                        fd_val_acc.flush()

                if (epoch + 1) % Params.save_freq == 0:
                    supervisor.saver.save(sess, Params.logdir + '/model_epoch_%04d_step_%02d' % (epoch, global_step))
    finally:
        fd_val_acc.close()
        fd_train_acc.close()
        fd_loss.close()

#### SET TO TRAIN -- *GPU or TPU*

In [8]:
# Select training mode for main(): it reads this module-level flag.
# NOTE(review): running the "SET TO TEST" cell afterwards overrides this value.
is_training=True

#### SET TO TEST -- *GPU or TPU*

In [9]:
# Select test mode for main(): it reads this module-level flag.
# NOTE(review): when both toggle cells are run in order, this assignment wins.
is_training=False

#### TEST ACCURACY -- *GPU or TPU*

In [443]:
def evaluation(model, supervisor, num_label):
    """Restore the latest checkpoint and write the test accuracy to a CSV file.

    Args:
        model: LipCapsule instance providing accuracy, X and labels.
        supervisor: tf.train.Supervisor managing the session and saver.
        num_label: not used by this function.
    """
    batch_size = Params.batch_sz
    teX, teY, num_te_batch = load_data(batch_size, is_training=False)
    num_te_batch = int(num_te_batch)

    fd_test_acc = save_test()

    # try/finally closes the result file even when evaluation aborts.
    try:
        with supervisor.managed_session() as sess:
            supervisor.saver.restore(sess, tf.train.latest_checkpoint(Params.logdir))
            tf.logging.info(' \n MODEL RESTORED. \n')

            test_acc = 0

            for i in tqdm(range(num_te_batch), total=num_te_batch, ncols=70, leave=False, unit='b'):
                # `batch_size` is the local name; `batch_sz` was never
                # defined in this function.
                start = i * batch_size
                end = start + batch_size
                acc = sess.run(model.accuracy, {model.X: teX[start:end], model.labels: teY[start:end]})
                test_acc += acc

            # model.accuracy is a count, so divide by the samples evaluated.
            test_acc = test_acc / (batch_size * num_te_batch)
            print(str(test_acc))
            fd_test_acc.write(str(test_acc))
    finally:
        fd_test_acc.close()

    print('  \n TEST ACCURACY SAVED TO ' + Params.results + '/test_acc.csv  \n ')

#### SAVE TEST ACCURACY -- *GPU or TPU*

In [444]:
def save_test():
    """Open ``test_acc.csv`` in the results folder and write its header.

    Returns:
        An open, writable file object; the caller must close it.
    """
    # The results folder is configured on Params; there is no module-level
    # `results` name.
    results_dir = Params.results
    # Create the folder on first use so open() does not fail.
    os.makedirs(results_dir, exist_ok=True)

    # Mode 'w' truncates any previous result file.
    fd_test_acc = open(os.path.join(results_dir, 'test_acc.csv'), 'w')
    fd_test_acc.write('test_acc\n')

    return fd_test_acc

#### MAIN METHOD -- *GPU or TPU*

In [10]:
def main(_):
    """Build the model and run training or testing, per the `is_training` flag.

    The single positional argument is supplied by tf.app.run() and ignored.
    """
    num_label = 10
    model = LipCapsule()

    tf.logging.info('  \n LOADING GRAPH ...  \n ')

    # Use the configured log directory so the supervisor, the checkpoints
    # written during training and the restore during testing all agree.
    sv = tf.train.Supervisor(graph=model.graph, logdir=Params.logdir, save_model_secs=0)

    tf.logging.info(' \n GRAPH LOADED.  \n ')

    if is_training:
        tf.logging.info(' TRAINING INITATED ...')
        train(model, sv, num_label)
        tf.logging.info('TRAINING COMPLETE.')
    else:
        tf.logging.info(' TEST INITATED ...')
        evaluation(model, sv, num_label)
        tf.logging.info('TEST COMPLETE.')

#### RUN -- *GPU or TPU*

In [None]:
# Script entry point: tf.app.run() parses command-line flags and then calls
# main() with the remaining argv.
if __name__ == "__main__":
    tf.app.run()

INFO:tensorflow:Setting up the main structure


INFO:tensorflow:Setting up the main structure


INFO:tensorflow:  
 LOADING GRAPH ...  
 


INFO:tensorflow:  
 LOADING GRAPH ...  
 


INFO:tensorflow: 
 GRAPH LOADED.  
 


INFO:tensorflow: 
 GRAPH LOADED.  
 


INFO:tensorflow: TRAINING INITATED ...


INFO:tensorflow: TRAINING INITATED ...


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Starting standard services.


INFO:tensorflow:Starting standard services.


INFO:tensorflow:Starting queue runners.


INFO:tensorflow:Starting queue runners.



 Training.  Epoch 0/100:


  0%|                     | 0/1300 [00:00<?, ?b/s]

INFO:tensorflow:Recording summary at step 0.


INFO:tensorflow:Recording summary at step 0.
 26%|██▊        | 338/1300 [01:59<05:41,  2.82b/s]

INFO:tensorflow:Recording summary at step 338.


INFO:tensorflow:Recording summary at step 338.
 58%|██████▍    | 757/1300 [03:59<02:52,  3.15b/s]

INFO:tensorflow:Recording summary at step 757.


INFO:tensorflow:Recording summary at step 757.
 90%|█████████ | 1175/1300 [06:00<00:38,  3.26b/s]

INFO:tensorflow:Recording summary at step 1175.


INFO:tensorflow:Recording summary at step 1175.
                                                  


 Training.  Epoch 1/100:


 22%|██▍        | 292/1300 [01:26<04:59,  3.37b/s]

INFO:tensorflow:Recording summary at step 1592.


INFO:tensorflow:Recording summary at step 1592.
 55%|██████     | 710/1300 [03:26<02:51,  3.43b/s]

INFO:tensorflow:Recording summary at step 2010.


INFO:tensorflow:Recording summary at step 2010.
 90%|████████▉ | 1164/1300 [05:26<00:38,  3.56b/s]

INFO:tensorflow:Recording summary at step 2464.


INFO:tensorflow:Recording summary at step 2464.
                                                  


 Training.  Epoch 2/100:


 22%|██▍        | 282/1300 [01:14<04:28,  3.79b/s]

INFO:tensorflow:Recording summary at step 2882.


INFO:tensorflow:Recording summary at step 2882.
 54%|█████▉     | 699/1300 [03:14<02:47,  3.60b/s]

INFO:tensorflow:Recording summary at step 3299.


INFO:tensorflow:Recording summary at step 3299.
 86%|████████▌ | 1118/1300 [05:14<00:51,  3.55b/s]

INFO:tensorflow:Recording summary at step 3718.


INFO:tensorflow:Recording summary at step 3718.
                                                  

'Tensor' object has no attribute 'to_proto'


'Tensor' object has no attribute 'to_proto'



 Training.  Epoch 3/100:


 17%|█▊         | 220/1300 [01:07<05:31,  3.26b/s]

INFO:tensorflow:Recording summary at step 4120.


INFO:tensorflow:Recording summary at step 4120.
 49%|█████▍     | 642/1300 [03:07<03:12,  3.42b/s]

INFO:tensorflow:Recording summary at step 4542.


INFO:tensorflow:Recording summary at step 4542.
 84%|████████▎ | 1086/1300 [05:04<01:00,  3.56b/s]