# Import

In [1]:
import os
import numpy as np
import tensorflow as tf
import time


from matplotlib import pyplot as plt
%matplotlib inline

# Confirm that this runtime is attached to a TPU and list the devices it exposes.
if 'COLAB_TPU_ADDR' in os.environ:
    # gRPC endpoint of the TPU worker, built from the address in the environment.
    tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
    print('TPU address is', tpu_address)

    # Short-lived session used only to query the worker's device list.
    with tf.Session(tpu_address) as session:
        devices = session.list_devices()

    print('TPU devices:')
    for d in devices:
        print(d)
else:
    # NOTE: `tpu_address` stays undefined on this path.
    print('ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!')

TPU address is grpc://10.73.80.10:8470
TPU devices:
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 8547561617843128079)
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 15808477494867059848)
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 12623579180533311753)
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 1913984400030128670)
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 17589331628087456959)
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 11177981104279709949)
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 1820746118815596828)
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 8848899767820247928)
_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 1444162556629070006)
_DeviceAt

# Connect

In [2]:
import json
# Name of the GCS bucket used by this notebook. GCS bucket names cannot contain
# whitespace, so the value is stripped in case a stray space is typed into the form.
bucket = 'kurnianggoro' #@param {type:"string"}
bucket = bucket.strip()

assert bucket, 'Must specify an existing GCS bucket name'
print('Using bucket: {}'.format(bucket))


from google.colab import auth
auth.authenticate_user()

TF_MASTER = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])

# Load the credentials file before opening the session so that a missing or
# malformed file fails without holding a TPU session open.
# (presumably /content/adc.json is written by auth.authenticate_user() -- confirm)
with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)

# Upload credentials to TPU.
with tf.Session(TF_MASTER) as sess:
    tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)
    # Now credentials are set for all future sessions on this TPU.


Using bucket: kurnianggoro 


W0710 03:48:26.125220 139906365589376 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



# Dataset

## augmentation

In [0]:
def data_augmentation(image, size=112):
    """Randomly flip an image horizontally and resize it to a square.

    Args:
        image: Image tensor; it is cast to float32 before any other op.
        size: Side length, in pixels, of the square output image.

    Returns:
        The float32 image after the random left/right flip, resized to
        ``[size, size]``.
    """
    target_hw = [size, size]

    with tf.name_scope('DataAugmentation'):
        as_float = tf.cast(image, dtype=tf.float32)
        flipped = tf.image.random_flip_left_right(as_float)
        resized = tf.image.resize_images(flipped, target_hw)

    return resized



## Parse

In [0]:
def parser_fn(example):
    """Decode one serialized tf.Example into an augmented (image, label) pair.

    Args:
        example: Scalar string tensor holding a serialized tf.Example with the
            features 'image' (encoded bytes), 'width', 'height' and 'label'.

    Returns:
        Tuple ``(image, label)``: the image after ``data_augmentation`` and the
        int64 'label' feature.
    """
    features_format = {
        'image': tf.FixedLenFeature([], dtype=tf.string),
        'width': tf.FixedLenFeature([], dtype=tf.int64),
        'height': tf.FixedLenFeature([], dtype=tf.int64),
        'label': tf.FixedLenFeature([], dtype=tf.int64),
    }
    features = tf.parse_single_example(example, features_format)
    image = tf.image.decode_image(features['image'], channels=3)

    # decode_image cannot infer a static shape (the bytes could be an animated
    # GIF, which decodes to 4-D), and tf.image.resize_images raises
    # "'images' contains no shape" on tensors of unknown rank. Pin the rank and
    # channel count here. Assumes the records hold still images -- TODO confirm.
    image.set_shape([None, None, 3])

    image = data_augmentation(image)
    return image, features['label']

def input_pipeline(params):
    """Build the TFRecord training input and return one batch of tensors.

    Args:
        params: Dict with the keys 'data_path' (file pattern of the TFRecord
            shards), 'batch_size' and 'train_image_size'.

    Returns:
        Tuple ``(image, label)`` taken from a one-shot iterator. The static
        shapes are set to ``[batch_size, image_size, image_size, 3]`` and
        ``[batch_size]`` respectively.
    """
    data_path = params['data_path']
    batch_size = params['batch_size']
    image_size = params['train_image_size']
    data_format = 'channels_last'

    data_files = tf.data.Dataset.list_files(data_path)

    dataset = tf.data.TFRecordDataset(data_files, num_parallel_reads=8)
    dataset = dataset.shuffle(16000)
    # The original chained `.repeat().cache()`. Caching an endlessly repeated
    # dataset can never finish filling the cache and only grows memory, so the
    # cache is dropped; shuffle-then-repeat still reshuffles every epoch.
    dataset = dataset.repeat()
    dataset = dataset.map(parser_fn, num_parallel_calls=16)
    # drop_remainder makes the batch dimension static.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    # Overlap host-side preprocessing with the training step.
    dataset = dataset.prefetch(1)

    iterator = dataset.make_one_shot_iterator()
    image, label = iterator.get_next()

    # Make the full static shapes explicit. This assumes parser_fn resizes
    # images to `image_size`; a mismatch surfaces here as a shape error.
    if data_format == 'channels_first':
        image.set_shape([batch_size, 3, image_size, image_size])
    else:
        image.set_shape([batch_size, image_size, image_size, 3])
    label.set_shape([batch_size])

    return image, label
    

# Base Network

# VGG 16

In [0]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets as nets

import numpy as np

slim = tf.contrib.slim

def vgg_arg_scope(weight_decay=0.0005):
    """Build the slim arg scope holding the default layer settings for VGG.

    Args:
        weight_decay: The l2 regularization coefficient applied to the weights
            of conv2d and fully_connected layers.

    Returns:
        An arg_scope that gives conv2d/fully_connected ReLU activations, l2
        weight regularization and zero-initialized biases, and gives
        conv2d/max_pool2d 'SAME' padding.
    """
    conv_fc_defaults = dict(
        activation_fn=tf.nn.relu,
        weights_regularizer=slim.l2_regularizer(weight_decay),
        biases_initializer=tf.zeros_initializer(),
    )
    padded_ops = [slim.conv2d, slim.max_pool2d]

    with slim.arg_scope([slim.conv2d, slim.fully_connected], **conv_fc_defaults):
        with slim.arg_scope(padded_ops, padding='SAME') as vgg_scope:
            return vgg_scope

def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_16',
           fc_conv_padding='VALID',
           global_pool=False,
           reuse=False,
           final_endpoint='fc8'):
    """Build VGG-16 with slim, optionally stopping at an intermediate layer.

    Args:
        inputs: Input image tensor fed to the first convolution.
        num_classes: Number of outputs of the 'fc8' layer. If falsy, the
            'dropout7'/'fc8' head is omitted and the 'fc7' features are returned.
        is_training: Whether the dropout layers are active.
        dropout_keep_prob: Keep probability of the dropout layers.
        spatial_squeeze: Whether to squeeze axes 1 and 2 of the 'fc8' output.
        scope: Variable scope for the network.
        fc_conv_padding: Padding used by the 7x7 'fc6' convolution.
        global_pool: Whether to average the 'fc7' output over axes 1 and 2
            before the classification head.
        reuse: Passed to ``tf.variable_scope`` to control variable reuse.
        final_endpoint: Name of the last layer to build. One of 'conv1',
            'pool1', ..., 'conv5', 'pool5', 'fc6', 'fc7'; any other value
            (including the default 'fc8') builds the whole network.

    Returns:
        Tuple ``(net, end_points)``: the output of the last layer built and a
        dict of the layer outputs collected up to that point.
    """
    # (scope, number of stacked 3x3 convolutions, output channels) per stage;
    # stage i is followed by the 2x2 max-pool layer 'pool<i>'.
    conv_stages = (
        ('conv1', 2, 64),
        ('conv2', 2, 128),
        ('conv3', 3, 256),
        ('conv4', 3, 512),
        ('conv5', 3, 512),
    )

    with tf.variable_scope(scope, 'vgg_16', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'

        def collect_end_points():
            """Snapshot the layer outputs registered so far as a dict."""
            return slim.utils.convert_collection_to_dict(end_points_collection)

        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                            outputs_collections=end_points_collection):
            net = inputs
            for stage, (conv_scope, num_convs, depth) in enumerate(conv_stages, start=1):
                net = slim.repeat(net, num_convs, slim.conv2d, depth, [3, 3],
                                  scope=conv_scope)
                if final_endpoint == conv_scope:
                    return net, collect_end_points()

                pool_scope = 'pool%d' % stage
                # stride=2 is slim.max_pool2d's default; it is spelled out here
                # so that all five pooling layers are built identically.
                net = slim.max_pool2d(net, [2, 2], stride=2, scope=pool_scope)
                if final_endpoint == pool_scope:
                    return net, collect_end_points()

            # Use conv2d instead of fully_connected layers.
            net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
            if final_endpoint == 'fc6':
                return net, collect_end_points()

            net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                               scope='dropout6')
            net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
            if final_endpoint == 'fc7':
                return net, collect_end_points()

            # Convert end_points_collection into a end_point dict.
            end_points = collect_end_points()
            if global_pool:
                # `keepdims` replaces the deprecated `keep_dims` argument.
                net = tf.reduce_mean(net, [1, 2], keepdims=True, name='global_pool')
                end_points['global_pool'] = net
            if num_classes:
                net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                                   scope='dropout7')
                net = slim.conv2d(net, num_classes, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='fc8')
                if spatial_squeeze:
                    net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
                end_points[sc.name + '/fc8'] = net

            return net, end_points

# Constants attached to the builder: default input resolution and per-channel means.
vgg_16.default_image_size = 224
vgg_16.R_MEAN = 123.
vgg_16.G_MEAN = 117.
vgg_16.B_MEAN = 104.

# Model

## Initializer

In [0]:
def basenet_initializer(pretrained_vars, scope_in_model, scope_in_ckpt, checkpoint_path):
    """Create a Scaffold-style init callback that restores pretrained weights.

    Args:
        pretrained_vars: Iterable of variables to restore from the checkpoint.
        scope_in_model: Substring of each variable's op name to be replaced.
        scope_in_ckpt: Replacement text, so the resulting name matches the name
            stored in the checkpoint.
        checkpoint_path: Path of the checkpoint to restore from.

    Returns:
        A function ``callback(scaffold, session)`` that restores the variables
        into ``session``.
    """
    # Saver expects {name_in_checkpoint: variable}; derive each checkpoint name
    # by rewriting the model scope to the checkpoint scope.
    ckpt_name_to_var = {}
    for variable in pretrained_vars:
        ckpt_name = variable.op.name.replace(scope_in_model, scope_in_ckpt)
        ckpt_name_to_var[ckpt_name] = variable

    loader = tf.train.Saver(ckpt_name_to_var)
    loader.build()

    def callback(scaffold, session):
        loader.restore(session, checkpoint_path)

    return callback

# Base Model

In [0]:
def model(inputs, n_class, reuse=tf.AUTO_REUSE):
    """Build the classifier: VGG-16 up to 'pool5', an embedding layer, and logits.

    Args:
        inputs: Batch of input images fed to ``vgg_16``.
        n_class: Number of output classes.
        reuse: Variable-reuse setting forwarded to ``vgg_16``.

    Returns:
        Tuple ``(logits, basenet_vars)``: logits reshaped to ``[-1, n_class]``
        and the list of global variables created while building the VGG
        backbone in this call.
    """
    # Record the variable names that exist before the backbone is built, so the
    # variables it creates can be identified afterwards.
    start_vars = set(x.name for x in tf.global_variables())
    with slim.arg_scope(vgg_arg_scope()):
        net, _ = vgg_16(inputs, 1000, is_training=True, reuse=reuse, final_endpoint='pool5')
    basenet_vars = [x for x in tf.global_variables() if x.name not in start_vars]

    emb = tf.layers.conv2d(net, 512, (4, 4), activation=tf.nn.relu, name='embeddings')
    # The logits layer must stay linear. The original applied ReLU here, which
    # clamps every logit to >= 0 before the softmax cross-entropy and so limits
    # what the classifier can express. The layer is left unnamed as before, so
    # its variable names are unchanged.
    logits = tf.layers.conv2d(emb, n_class, (1, 1), activation=None)
    logits = tf.reshape(logits, [-1, n_class], name='logits')

    return logits, basenet_vars
    

## TPU Model

In [0]:
def tpu_model(features, labels, mode, params):
    """TPUEstimator model_fn: build the network, the loss and the training op.

    Args:
        features: Batch of input images passed to ``model``.
        labels: Integer class labels for the sparse softmax cross-entropy.
        mode: Estimator mode key, forwarded unchanged to the returned spec.
        params: Dict with the keys 'model_scope', 'num_classes', 'weight_decay',
            'learning_rate', 'momentum' and 'checkpoint_path'. The optional key
            'use_tpu' (default True) controls whether the optimizer is wrapped
            in ``CrossShardOptimizer``.

    Returns:
        A ``tf.contrib.tpu.TPUEstimatorSpec`` carrying the predictions, the
        total loss, the training op, and a scaffold_fn that restores the
        pretrained backbone weights.
    """
    model_scope = params['model_scope']
    with tf.variable_scope(model_scope, reuse=tf.AUTO_REUSE):
        logits, basenet_vars = model(features, params['num_classes'], reuse=tf.AUTO_REUSE)

    predictions = tf.argmax(logits, axis=-1)

    # Debug listing of every variable in the graph.
    print('-'*50)
    for v in tf.global_variables():
        print(v)
    print('-'*50)

    with tf.name_scope('Losses'):
        with tf.name_scope('softmax_loss'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
            softmax_loss = tf.reduce_mean(loss)

        # Explicit L2 penalty over all trainable variables.
        with tf.name_scope('regularizer'):
            l2_loss_vars = [tf.nn.l2_loss(trainable_var)
                            for trainable_var in tf.trainable_variables()]
        l2_loss = tf.multiply(params['weight_decay'], tf.add_n(l2_loss_vars), name='l2_loss')

        with tf.name_scope('total_loss'):
            total_loss = tf.add(softmax_loss, l2_loss, name='total_loss')

    with tf.name_scope('Optimizer'):
        global_step = tf.train.get_or_create_global_step()

        optimizer = tf.train.MomentumOptimizer(learning_rate=params['learning_rate'], momentum=params['momentum'])

        # Use .get(): the training cell's params dict has no 'use_tpu' key, so
        # the original `params['use_tpu']` lookup raised KeyError. The default
        # is True because this model_fn is written for TPUEstimator.
        if params.get('use_tpu', True):
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer, tf.losses.Reduction.MEAN)

        train_op = optimizer.minimize(total_loss, global_step)

    def scaffold_fn():
        # Take the scope from `params`, not the notebook-level `flags` dict, so
        # the model_fn does not depend on hidden global state.
        return tf.train.Scaffold(init_fn=basenet_initializer(
            basenet_vars, model_scope + '/vgg_16', 'vgg_16', params['checkpoint_path']))

    return tf.contrib.tpu.TPUEstimatorSpec(
                              mode=mode,
                              predictions=predictions,
                              loss=total_loss,
                              train_op=train_op,
                              scaffold_fn=scaffold_fn
                    )

# Training

In [15]:
tf.reset_default_graph()
tf.logging.set_verbosity(tf.logging.INFO)

# Run configuration. Every tunable value lives in `flags` so it is printed below.
flags = {}
flags['train_image_size'] = 112
flags['data_path'] = 'gs://kurnianggoro/casia112_tfrecord/train-*'
flags['model_dir'] = 'gs://kurnianggoro/training/testvgg/'
flags['save_checkpoints_secs'] = 60
flags['save_summary_steps'] = 500
flags['tf_random_seed'] = 54334
flags['log_every_n_steps'] = 100

flags['batch_size_base'] = 512

flags['model_scope'] = 'VGGmodel'
flags['weight_decay'] = 5e-4
flags['momentum'] = 0.9
flags['learning_rate'] = 1e-3
flags['checkpoint_path'] = 'gs://kurnianggoro/vgg16_standard/vgg_16.ckpt'
flags['num_classes'] = 10572

flags['max_number_of_steps'] = 30000
flags['use_tpu'] = True

for k,v in flags.items():
    print(k,v)

config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)


tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu_address,)

# Set up a RunConfig that checkpoints on a time interval (save_checkpoints_secs)
# rather than a step interval, and keeps only the five most recent checkpoints.
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=flags['model_dir'],
).replace(
    save_checkpoints_secs=flags['save_checkpoints_secs'],
    save_checkpoints_steps=None,
    save_summary_steps=flags['save_summary_steps'],
    keep_checkpoint_max=5,
    tf_random_seed=flags['tf_random_seed'],
    log_step_count_steps=flags['log_every_n_steps'],
    session_config=config,
    tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=100, num_shards=8),
)

trainer = tf.contrib.tpu.TPUEstimator(
    model_fn=tpu_model,
    use_tpu=flags['use_tpu'],
    model_dir=flags['model_dir'],
    config=run_config,
    train_batch_size=flags['batch_size_base'],
    params={
        'model_scope': flags['model_scope'],
        'num_classes': flags['num_classes'],
        'weight_decay': flags['weight_decay'],
        'momentum': flags['momentum'],
        'learning_rate': flags['learning_rate'],
        'checkpoint_path': flags['checkpoint_path'],
        'data_path': flags['data_path'],
        'is_training': True,
        'train_image_size': flags['train_image_size'],
        # tpu_model reads this key to decide whether to wrap the optimizer in
        # CrossShardOptimizer; it was missing from the original dict.
        'use_tpu': flags['use_tpu'],
    })


print('Starting a training cycle.')
trainer.train(input_fn=input_pipeline,
              max_steps=flags['max_number_of_steps'])

train_image_size 112
data_path gs://kurnianggoro/casia112_tfrecord/train-*
model_dir gs://kurnianggoro/training/testvgg/
save_checkpoints_secs 60
save_summary_steps 500
tf_random_seed 54334
log_every_n_steps 100
batch_size_base 512
model_scope VGGmodel
weight_decay 0.0005
momentum 0.9
learning_rate 0.001
checkpoint_path gs://kurnianggoro/vgg16_standard/vgg_16.ckpt
num_classes 10572
max_number_of_steps 30000


I0710 03:50:53.822002 139906365589376 estimator.py:209] Using config: {'_model_dir': 'gs://kurnianggoro/training/testvgg/', '_tf_random_seed': 54334, '_save_summary_steps': 500, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 60, '_session_config': allow_soft_placement: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3e32a24748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.73.80.10:8470', '_evaluation_master': 'grpc://10.73.80.10:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=8, num_cores_per_replica=None, per_host_inpu

Starting a training cycle.


I0710 03:50:54.488768 139906365589376 tpu_system_metadata.py:78] Querying Tensorflow master (grpc://10.73.80.10:8470) for TPU system metadata.
I0710 03:50:54.497261 139906365589376 tpu_system_metadata.py:148] Found TPU system:
I0710 03:50:54.498866 139906365589376 tpu_system_metadata.py:149] *** Num TPU Cores: 8
I0710 03:50:54.505394 139906365589376 tpu_system_metadata.py:150] *** Num TPU Workers: 1
I0710 03:50:54.506703 139906365589376 tpu_system_metadata.py:152] *** Num TPU Cores Per Worker: 8
I0710 03:50:54.507949 139906365589376 tpu_system_metadata.py:154] *** Available Device: _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 8547561617843128079)
I0710 03:50:54.510479 139906365589376 tpu_system_metadata.py:154] *** Available Device: _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 12623579180533311753)
I0710 03:50:54.512295 139906365589376 tpu_system_metadata.py:154] *** Available Device: _DeviceAttributes(/job:tpu_worker/r

--------------------------------------------------
<tf.Variable 'global_step:0' shape=() dtype=int64>
<tf.Variable 'VGGmodel/vgg_16/conv1/conv1_1/weights:0' shape=(3, 3, 3, 64) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv1/conv1_1/biases:0' shape=(64,) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv1/conv1_2/weights:0' shape=(3, 3, 64, 64) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv1/conv1_2/biases:0' shape=(64,) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv2/conv2_1/weights:0' shape=(3, 3, 64, 128) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv2/conv2_1/biases:0' shape=(128,) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv2/conv2_2/weights:0' shape=(3, 3, 128, 128) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv2/conv2_2/biases:0' shape=(128,) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv3/conv3_1/weights:0' shape=(3, 3, 128, 256) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/conv3/conv3_1/biases:0' shape=(256,) dtype=float32>
<tf.Variable 'VGGmodel/vgg_16/c

I0710 03:50:57.291239 139906365589376 basic_session_run_hooks.py:541] Create CheckpointSaverHook.
I0710 03:50:57.356161 139906365589376 estimator.py:1147] Done calling model_fn.
I0710 03:50:57.357641 139906365589376 tpu_estimator.py:499] TPU job name tpu_worker
I0710 03:50:57.596233 139906365589376 monitored_session.py:240] Graph was finalized.
I0710 03:50:57.647992 139906365589376 saver.py:1280] Restoring parameters from gs://kurnianggoro/training/testvgg/model.ckpt-0
W0710 03:51:00.168092 139906365589376 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1066: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.
I0710 03:51:00.406972 139906365589376 session_manager.py:500] Running local_init_op.
I0710 03:51:00.497389 139906365589376 session_manager.py:502] Done running local_init_op.
I071

<tensorflow_estimator.python.estimator.tpu.tpu_estimator.TPUEstimator at 0x7f3e32a24048>

# Dataset pipeline without TFRecord

In [0]:
# Load CIFAR-10 through the Keras dataset helper.
dataset = tf.keras.datasets.cifar10
(x_train, y_train), (x_test, y_test) = dataset.load_data()

# Divide by 255 and shift by -0.5, then store the images as float32.
x_train = (x_train / 255.0 - 0.5).astype(np.float32)
x_test = (x_test / 255.0 - 0.5).astype(np.float32)

def parser_fn(example):
    """Turn one in-memory CIFAR example into an augmented (image, label) pair.

    NOTE: this redefines the TFRecord-based ``parser_fn`` declared earlier in
    the notebook; whichever cell ran last wins.

    Args:
        example: Dict with the keys 'image' (reshaped here to 32x32x3) and
            'label'.

    Returns:
        Tuple ``(image, label)``: a float32 32x32x3 image after pad-and-random-
        crop augmentation, and the label as a scalar int64 tensor.
    """
    image = tf.cast(example['image'], dtype=tf.float32)
    image = tf.reshape(image, [32, 32, 3])

    # Pad to 40x40, then take a random 32x32 crop.
    image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
    image = tf.image.random_crop(image, [32, 32, 3])
    image = tf.reshape(image, [32, 32, 3])

    # The Keras CIFAR-10 label array has shape (N, 1), so each example's label
    # arrives with shape (1,). Flatten it to a scalar so batched labels are
    # rank 1, as sparse softmax cross-entropy requires.
    label = tf.reshape(tf.cast(example['label'], tf.int64), [])

    return image, label

def input_pipeline(params):
    """Build the in-memory CIFAR-10 training input and return one batch.

    NOTE: this redefines the TFRecord-based ``input_pipeline`` declared earlier
    in the notebook; whichever cell ran last wins.

    Args:
        params: Dict that may contain 'batch_size'. When the key is absent (or
            ``params`` is None) the previous hard-coded batch size of 10 is used.

    Returns:
        Tuple ``(features, labels)`` taken from a one-shot iterator over the
        shuffled, repeated, augmented and batched dataset.
    """
    # Honor the batch size supplied by the caller instead of always using 10.
    batch_size = (params or {}).get('batch_size', 10)
    train = True

    # NOTE(review): this embeds the whole training set in the graph as constants.
    x_data = tf.convert_to_tensor(x_train, name='x_data')
    y_data = tf.convert_to_tensor(y_train, name='y_data')

    dataset = tf.data.Dataset.from_tensor_slices({'image': x_data, 'label': y_data})

    # Cache BEFORE shuffling. With shuffle -> cache, the first epoch's order is
    # stored in the cache and replayed unchanged on every later epoch.
    dataset = dataset.cache()
    dataset = dataset.shuffle(buffer_size=128)
    if train:
        dataset = dataset.repeat()
    dataset = dataset.map(parser_fn, num_parallel_calls=8)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()

    return features, labels