<a href="https://colab.research.google.com/github/harry9879/Layers-weights-and-bias/blob/main/Distributed_training_in_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Explained by harry
import tensorflow_datasets as tfds
import tensorflow as tf
import os

#Load the TensorBoard notebook extension for visualizing
%load_ext tensorboard

In [2]:
# Load MNIST through TensorFlow Datasets.
#   with_info=True     -> also return the dataset metadata object (`info`)
#   as_supervised=True -> every example is an (image, label) tuple
datasets, info = tfds.load(
    name="mnist",
    with_info=True,
    as_supervised=True,
)
mnist_train = datasets["train"]
mnist_test = datasets["test"]

Downloading and preparing dataset 11.06 MiB (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...


Dl Completed...:   0%|          | 0/5 [00:00<?, ? file/s]

Dataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.


In [3]:
# MirroredStrategy distributes training across the GPUs available on this
# machine; everything created under strategy.scope() is handled by it.
strategy = tf.distribute.MirroredStrategy()

In [4]:
# Report how many devices (replicas) the strategy will train on.
print(f"Number of devices: {strategy.num_replicas_in_sync}")

Number of devices: 1


# Set up the input pipeline

In [5]:
# Per-split example counts, read from the dataset metadata.
# (info.splits.total_num_examples gives the total number of examples instead.)
num_train_examples, num_test_examples = (
    info.splits[split].num_examples for split in ("train", "test")
)

# Size of the shuffle buffer used by the training pipeline.
BUFFER_SIZE = 10000

# Global batch size = per-replica batch size x number of replicas in sync.
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

In [6]:
# Rescale by divided the max_value 255 to get b/w [0,1]
def scale(image, label):
  """Cast `image` to float32 and divide it by 255; `label` is returned unchanged.

  Intended for pixel values in 0..255, which end up in the range [0, 1].
  """
  return tf.cast(image, tf.float32) / 255, label

In [17]:
# Training pipeline:
#   map(scale)           -> rescale every image
#   cache()              -> keep the rescaled elements so they are not recomputed
#   shuffle(BUFFER_SIZE) -> shuffle using a buffer of BUFFER_SIZE elements
#   batch(BATCH_SIZE)    -> group into global batches
train_dataset = (
    mnist_train
    .map(scale)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Evaluation pipeline: rescale and batch only (no caching or shuffling).
eval_dataset = (
    mnist_test
    .map(scale)
    .batch(BATCH_SIZE)
)

In [18]:
# Build AND compile the model inside the strategy scope so that the model
# variables, the optimizer and the metrics are all created under the
# distribution strategy (same pattern as the replicated reload further down).
with strategy.scope():
  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10)  # no activation: outputs raw logits
  ])

  # from_logits=True matches the final Dense layer, which has no activation.
  model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [19]:
#Directory in which the training checkpoints are stored
checkpoint_dir = "./training_checkpoints"
#File name template for the checkpoint files. "{epoch}" is NOT formatted here
#(see the printed value); the template is passed as-is to ModelCheckpoint(filepath=...).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
print(checkpoint_prefix)

./training_checkpoints/ckpt_{epoch}


In [20]:
#Define a function for decaying in the learning rate
#You can define any decay function you need
def decay(epoch):
  """Return the learning rate to use for the given 0-based epoch index.

  Step schedule: 1e-3 for epochs 0-2, 1e-4 for epochs 3-6, 1e-5 afterwards.
  """
  if epoch < 3:
    return 1e-3
  # Reaching this point already implies epoch >= 3.
  if epoch < 7:
    return 1e-4
  return 1e-5

In [None]:
#Define a callback for printing the learning rate at the end of each epoch
class PrintLR(tf.keras.callbacks.Callback):
  """Keras callback that prints the optimizer's learning rate after each epoch."""

  def on_epoch_end(self, epoch, logs=None):
    """Print the learning rate currently set on the model's optimizer.

    Args:
      epoch: 0-based index of the epoch that just finished (printed 1-based).
      logs: metrics dict supplied by Keras; unused.
    """
    # Read the optimizer from the model Keras attached to this callback
    # (self.model) instead of the notebook-global `model`, so the callback
    # reports the right value whichever model it is passed to.
    current_lr = self.model.optimizer.learning_rate.numpy()
    print("\nLearning rate for epoch {} is {}".format(epoch + 1, current_lr))

In [None]:
#Put all the callbacks together 
callbacks = [
    # Write TensorBoard logs to ./logs
    tf.keras.callbacks.TensorBoard(log_dir = "./logs"),
    # Save the model weights only, using checkpoint_prefix as the file path template
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                       save_weights_only = True),
    # Take the learning rate for each epoch from decay(epoch)
    tf.keras.callbacks.LearningRateScheduler(decay),
    # Print the learning rate at the end of every epoch
    PrintLR()
]

In [None]:
# Train on train_dataset for EPOCHS epochs, running the callbacks list
# (TensorBoard logging, checkpointing, LR schedule, LR printing) during training.
EPOCHS = 12
model.fit(train_dataset, epochs=EPOCHS, callbacks=callbacks)

Epoch 1/12
  1/938 [..............................] - ETA: 1:20 - loss: 0.0483 - accuracy: 0.9844



Learning rate for epoch 1 is 0.0010000000474974513
Epoch 2/12
Learning rate for epoch 2 is 0.0010000000474974513
Epoch 3/12
Learning rate for epoch 3 is 0.0010000000474974513
Epoch 4/12
Learning rate for epoch 4 is 9.999999747378752e-05
Epoch 5/12
Learning rate for epoch 5 is 9.999999747378752e-05
Epoch 6/12
Learning rate for epoch 6 is 9.999999747378752e-05
Epoch 7/12
Learning rate for epoch 7 is 9.999999747378752e-05
Epoch 8/12
Learning rate for epoch 8 is 9.999999747378752e-06
Epoch 9/12
Learning rate for epoch 9 is 9.999999747378752e-06
Epoch 10/12
Learning rate for epoch 10 is 9.999999747378752e-06
Epoch 11/12
Learning rate for epoch 11 is 9.999999747378752e-06
Epoch 12/12
Learning rate for epoch 12 is 9.999999747378752e-06


<keras.callbacks.History at 0x7fcc27458160>

In [None]:
# Restore the weights from the most recent checkpoint found in checkpoint_dir,
# then measure loss and accuracy on the evaluation dataset.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

eval_loss, eval_acc = model.evaluate(eval_dataset)

print(f"Eval loss : {eval_loss}, Eval accuracy : {eval_acc} ")

Eval loss : 0.03580893203616142, Eval accuracy : 0.9869999885559082 


In [None]:
# Open TensorBoard on the "logs" directory that the TensorBoard callback writes to.
%tensorboard --logdir=logs

In [None]:
ls -sh ./plogs

total 4.0K
4.0K [0m[01;34mtrain[0m/


In [None]:
path = "saved_model" # directory the trained model is saved to

# Save the model to `path` using the "tf" save format.
model.save(path, save_format="tf")



In [None]:
# Load the saved model WITHOUT a strategy scope (a plain, unreplicated model).
# This is how the model would be loaded when no distribution strategy is used,
# for example when only a single device is available.
unreplicated_model = tf.keras.models.load_model(path)

unreplicated_model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)
eval_loss, eval_acc = unreplicated_model.evaluate(eval_dataset)
# One placeholder per value, so that the accuracy is reported as well as the loss.
print("Eval loss: {}, Eval Accuracy: {}".format(eval_loss, eval_acc))

Eval loss: 0.03580893203616142


In [None]:
# with distributed system
with strategy.scope():
  # Load and compile under the strategy scope so the restored model is
  # created under the distribution strategy.
  replicated_model = tf.keras.models.load_model(path)
  replicated_model.compile(
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      optimizer=tf.keras.optimizers.Adam(),
      metrics=["accuracy"],
  )

eval_loss, eval_acc = replicated_model.evaluate(eval_dataset)
print(f"Eval loss: {eval_loss}, Eval Accuracy: {eval_acc}")

Eval loss: 0.03580893203616142, Eval Accuracy: 0.9869999885559082
