In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow_hub as hub

import numpy as np
import os
from tqdm import tqdm

In [2]:
import tensorflow_datasets as tfds

In [3]:
splits = ['train[:80%]', 'train[80%:90%]', 'train[90%:]']

(train_examples, validation_examples, test_examples), info = tfds.load('oxford_flowers102', with_info=True, as_supervised=True, 
                                                                       split = splits, data_dir='data/')

num_examples = info.splits['train'].num_examples
num_classes = info.features['label'].num_classes

[1mDownloading and preparing dataset oxford_flowers102/2.1.1 (download: 328.90 MiB, generated: 331.34 MiB, total: 660.25 MiB) to data/oxford_flowers102/2.1.1...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to data/oxford_flowers102/2.1.1.incompleteB18GG9/oxford_flowers102-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1020.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to data/oxford_flowers102/2.1.1.incompleteB18GG9/oxford_flowers102-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=6149.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to data/oxford_flowers102/2.1.1.incompleteB18GG9/oxford_flowers102-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1020.0), HTML(value='')))

[1mDataset oxford_flowers102 downloaded and prepared to data/oxford_flowers102/2.1.1. Subsequent calls will reuse this data.[0m


In [4]:
strategy = tf.distribute.MirroredStrategy()





INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [5]:
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

Number of devices: 1


In [6]:
# input pipeline
BUFFER_SIZE = num_examples
EPOCHS = 10
pixels = 224
MODULE_HANDLE = 'data/resnet_50_feature_vector'
IMAGE_SIZE = (pixels, pixels)
print("Using {} with input size {}".format(MODULE_HANDLE, IMAGE_SIZE))

Using data/resnet_50_feature_vector with input size (224, 224)


In [7]:
def format_image(image, label):
    image = tf.image.resize(image, IMAGE_SIZE)/255.0
    return  image, label

In [8]:
def set_global_batch_size(batch_size_per_replica, strategy):
    '''
    Args:
        batch_size_per_replica (int) - batch size per replica
        strategy (tf.distribute.Strategy) - distribution strategy
    '''    
    global_batch_size = batch_size_per_replica * strategy.num_replicas_in_sync    
    return global_batch_size

In [9]:
BATCH_SIZE_PER_REPLICA = 64
GLOBAL_BATCH_SIZE = set_global_batch_size(BATCH_SIZE_PER_REPLICA, strategy)

print(GLOBAL_BATCH_SIZE)

64


In [10]:
train_batches = train_examples.shuffle(num_examples//4).map(format_image).batch(GLOBAL_BATCH_SIZE).prefetch(1)
validation_batches = validation_examples.map(format_image).batch(GLOBAL_BATCH_SIZE).prefetch(1)
test_batches = test_examples.map(format_image).batch(1)

In [11]:
def distribute_datasets(strategy, train_batches, validation_batches, test_batches):  
    train_dist_dataset = strategy.experimental_distribute_dataset(train_batches)
    val_dist_dataset = strategy.experimental_distribute_dataset(validation_batches)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_batches)    
    return train_dist_dataset, val_dist_dataset, test_dist_dataset

In [12]:
train_dist_dataset, val_dist_dataset, test_dist_dataset = distribute_datasets(strategy, train_batches, validation_batches, test_batches)

In [13]:
print(type(train_dist_dataset))
print(type(val_dist_dataset))
print(type(test_dist_dataset))

<class 'tensorflow.python.distribute.input_lib.DistributedDataset'>
<class 'tensorflow.python.distribute.input_lib.DistributedDataset'>
<class 'tensorflow.python.distribute.input_lib.DistributedDataset'>


In [14]:
x = iter(train_dist_dataset).get_next()
    
print(f"x is a tuple that contains {len(x)} values ")
print(f"x[0] contains the features, and has shape {x[0].shape}")
print(f"  so it has {x[0].shape[0]} examples in the batch, each is an image that is {x[0].shape[1:]}")
print(f"x[1] contains the labels, and has shape {x[1].shape}")

x is a tuple that contains 2 values 
x[0] contains the features, and has shape (64, 224, 224, 3)
  so it has 64 examples in the batch, each is an image that is (224, 224, 3)
x[1] contains the labels, and has shape (64,)


In [15]:
# Model
MODULE_HANDLE = "https://tfhub.dev/tensorflow/resnet_50/feature_vector/1"

class ResNetModel(tf.keras.Model):
    def __init__(self, classes):
        super().__init__()
        self._feature_extractor = hub.KerasLayer(MODULE_HANDLE, trainable=False)
        self.dropout = tf.keras.layers.Dropout(0.5) 
        self._classifier = tf.keras.layers.Dense(classes, activation='softmax')

    def call(self, inputs):
        x = self._feature_extractor(inputs)
        x = self.dropout(x)
        x = self._classifier(x)
        return x

In [16]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

In [17]:
with strategy.scope():
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    # or loss_fn = tf.keras.losses.sparse_categorical_crossentropy
    def compute_loss(labels, predictions):
        per_example_loss = loss_object(labels, predictions)
        return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

    test_loss = tf.keras.metrics.Mean(name='test_loss')

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


In [18]:
with strategy.scope():
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='test_accuracy')

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


In [19]:
with strategy.scope():
    model = ResNetModel(classes=num_classes)
    optimizer = tf.keras.optimizers.Adam()
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

In [20]:
# Training loop
def train_test_step_fns(strategy, model, compute_loss, optimizer, train_accuracy, loss_object, test_loss, test_accuracy):
    with strategy.scope():
        def train_step(inputs):
            images, labels = inputs

            with tf.GradientTape() as tape:                
                predictions = model(images)
                loss = compute_loss(labels, predictions)                

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            train_accuracy.update_state(labels, predictions)
            return loss 

        def test_step(inputs):
            images, labels = inputs            
            
            predictions = model(images, training=False)
            t_loss = loss_object(labels, predictions)           

            test_loss.update_state(t_loss)
            test_accuracy.update_state(labels, predictions)
        
        return train_step, test_step

In [21]:
train_step, test_step = train_test_step_fns(strategy, model, compute_loss, optimizer, train_accuracy, loss_object, test_loss, test_accuracy)

In [22]:
# Distributed training and testing

def fun1(args=()):
    print(f"number of arguments passed is {len(args)}")
    
    
list_of_inputs = [1,2]
print("When passing in args=list_of_inputs:")
fun1(args=list_of_inputs)
print()
print("When passing in args=(list_of_inputs)")
fun1(args=(list_of_inputs))
print()
print("When passing in args=(list_of_inputs,)")
fun1(args=(list_of_inputs,))

When passing in args=list_of_inputs:
number of arguments passed is 2

When passing in args=(list_of_inputs)
number of arguments passed is 2

When passing in args=(list_of_inputs,)
number of arguments passed is 1


In [23]:
def distributed_train_test_step_fns(strategy, train_step, test_step, model, compute_loss, optimizer, train_accuracy, loss_object, test_loss, test_accuracy):
    with strategy.scope():
        @tf.function
        def distributed_train_step(dataset_inputs):            
            per_replica_losses = strategy.run(train_step, args = (dataset_inputs,))            
            return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                                   axis=None)

        @tf.function
        def distributed_test_step(dataset_inputs):            
            strategy.run(test_step, args=(dataset_inputs,))            
    
        return distributed_train_step, distributed_test_step

In [24]:
distributed_train_step, distributed_test_step = distributed_train_test_step_fns(strategy, train_step, test_step, model, compute_loss, optimizer, train_accuracy, loss_object, test_loss, test_accuracy)

In [25]:
with strategy.scope():
    for epoch in range(EPOCHS):
        # TRAIN LOOP
        total_loss = 0.0 #epoch loss.
        num_batches = 0 #steps
        for x in tqdm(train_dist_dataset): #display a progress bar.
            total_loss += distributed_train_step(x)
            num_batches += 1
        train_loss = total_loss/num_batches #average loss on all batches in this epoch.

        # TEST LOOP
        for x in test_dist_dataset:
            distributed_test_step(x)

        template = ("Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, "
                    "Test Accuracy: {}")
        print(template.format(epoch+1, train_loss,
                               train_accuracy.result()*100, test_loss.result(),
                               test_accuracy.result()*100))

        test_loss.reset_states()
        train_accuracy.reset_states()
        test_accuracy.reset_states()

13it [00:45,  3.52s/it]
0it [00:00, ?it/s]

Epoch 1, Loss: 4.580155372619629, Accuracy: 4.901960849761963, Test Loss: 3.898120164871216, Test Accuracy: 13.72549057006836


13it [00:02,  4.56it/s]
0it [00:00, ?it/s]

Epoch 2, Loss: 2.5349416732788086, Accuracy: 51.960784912109375, Test Loss: 2.782794952392578, Test Accuracy: 47.05882263183594


13it [00:02,  4.58it/s]
0it [00:00, ?it/s]

Epoch 3, Loss: 1.4154744148254395, Accuracy: 85.04901885986328, Test Loss: 2.1478111743927, Test Accuracy: 59.80392074584961


13it [00:02,  4.55it/s]
0it [00:00, ?it/s]

Epoch 4, Loss: 0.8259691596031189, Accuracy: 95.22058868408203, Test Loss: 1.8255575895309448, Test Accuracy: 61.764705657958984


13it [00:02,  4.55it/s]
0it [00:00, ?it/s]

Epoch 5, Loss: 0.5412363409996033, Accuracy: 97.91667175292969, Test Loss: 1.6160001754760742, Test Accuracy: 65.68627166748047


13it [00:02,  4.51it/s]
0it [00:00, ?it/s]

Epoch 6, Loss: 0.37563830614089966, Accuracy: 98.7745132446289, Test Loss: 1.5011917352676392, Test Accuracy: 67.64705657958984


13it [00:02,  4.50it/s]
0it [00:00, ?it/s]

Epoch 7, Loss: 0.2796125113964081, Accuracy: 99.38725280761719, Test Loss: 1.4133518934249878, Test Accuracy: 67.64705657958984


13it [00:05,  2.52it/s]
0it [00:00, ?it/s]

Epoch 8, Loss: 0.21771200001239777, Accuracy: 99.63235473632812, Test Loss: 1.3702000379562378, Test Accuracy: 69.60784149169922


13it [00:02,  4.43it/s]
0it [00:00, ?it/s]

Epoch 9, Loss: 0.17278926074504852, Accuracy: 99.75489807128906, Test Loss: 1.3208794593811035, Test Accuracy: 70.5882339477539


13it [00:02,  4.38it/s]


Epoch 10, Loss: 0.14192533493041992, Accuracy: 99.87745666503906, Test Loss: 1.279017448425293, Test Accuracy: 71.5686264038086


In [26]:
#model_save_path = "./tmp/mymodel/1/"
#tf.saved_model.save(model, model_save_path)

In [27]:
# import os
# import zipfile

# def zipdir(path, ziph):
#     # ziph is zipfile handle
#     for root, dirs, files in os.walk(path):
#         for file in files:
#             ziph.write(os.path.join(root, file))

# zipf = zipfile.ZipFile('./mymodel.zip', 'w', zipfile.ZIP_DEFLATED)
# zipdir('./tmp/mymodel/1/', zipf)
# zipf.close()

# from google.colab import files
# files.download("mymodel.zip")