<a href="https://colab.research.google.com/github/iypc-team/CoLab/blob/master/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, shutil
from google.colab import drive

# Locations used by this setup cell.
pythonFilesPth = '/content/drive/My Drive/PythonFiles'
deletionPth = '/content/sample_data'

# Mount Google Drive and remember the starting directory so that it can be
# restored once the Drive-hosted helper modules have been imported.
drive.mount('/content/drive', force_remount=True)
contentPth = os.getcwd()
print(contentPth)

# Delete the sample_data directory when it is present.
if os.path.exists(deletionPth):
    shutil.rmtree(deletionPth)

# Work from the PythonFiles folder when it exists.
# NOTE(review): the helper imports below presumably resolve from this
# directory -- confirm that CleanDrive/ImportDriveFiles/FunctionTimer live there.
if os.path.exists(pythonFilesPth):
    os.chdir(pythonFilesPth)
    print(f'cwd: {os.getcwd()}')

from CleanDrive import cd
cd.cleanDrive()
import ImportDriveFiles

# Return to the original working directory before importing the timer helper.
os.chdir(contentPth)
from FunctionTimer import ft

In [None]:
from __future__ import absolute_import
import os, time
try:
    import tensorflow as tf
    print(f'tf version: {tf.__version__}')
except ModuleNotFoundError as err:
    print(err)
    %pip install tensorflow
    import tensorflow as tf

try:
    import tensorflow_datasets as tfds
    print(f'tfds version: {tfds.__version__}')
except ModuleNotFoundError as err:
    print(err)
    %pip install tensorflow_datasets
    import tensorflow_datasets as tfds

In [None]:
startTime = time.time()

# The TPU address is read from the COLAB_TPU_ADDR environment variable; a
# missing variable raises KeyError here.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu=f"grpc://{os.environ['COLAB_TPU_ADDR']}")
tf.config.experimental_connect_to_cluster(resolver)

# TPU initialization has to happen at the beginning, before other TPU work.
tf.tpu.experimental.initialize_tpu_system(resolver)
# Optional diagnostic:
# print("All devices: ", tf.config.list_logical_devices('TPU'))

ft.functionTimer(startTime, 2)

In [None]:
# Print every logical device, one per line in sorted order, then a blank line.
logicalDevices = tf.config.list_logical_devices()
for logicalDevice in sorted(logicalDevices):
    print(logicalDevice)
print()

# Print every physical device, one per line in sorted order.
physicalDevices = tf.config.list_physical_devices()
for physicalDevice in sorted(physicalDevices):
    print(physicalDevice)

In [None]:
# Two small constant matrices: a is 2x3 and b is 3x2, so their product is 2x2.
a = tf.constant([[1.0, 2.0, 3.0],
                 [4.0, 5.0, 6.0]])
b = tf.constant([[1.0, 2.0],
                 [3.0, 4.0],
                 [5.0, 6.0]])

# Request that the multiplication be placed on device '/TPU:0'.
with tf.device('/TPU:0'):
    c = tf.matmul(a, b)

# Show which device the result reports, then the result itself.
print("c device: ", c.device)
print(c)

In [None]:
# Distribution strategy built from the TPU cluster resolver; later cells open
# `strategy.scope()` to create models and call `strategy.run` to execute steps.
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
startTime = time.time()


@tf.function
def matmul_fn(x, y):
    """Return the matrix product of `x` and `y` (computed with tf.matmul)."""
    return tf.matmul(x, y)


# Execute matmul_fn through the distribution strategy with the constants
# `a` and `b`, print whatever strategy.run returns, then report elapsed time.
z = strategy.run(matmul_fn, args=(a, b))
print(z)
ft.functionTimer(startTime, 2)

In [None]:
def create_model():
    """Build an uncompiled Keras Sequential classifier.

    The stack is two 3x3 convolutions with 256 filters each, a flatten step,
    two ReLU dense layers (256 and 128 units) and a final 10-unit dense layer
    with no activation. The first layer declares a 28x28x1 input shape.

    Returns:
        A new tf.keras.Sequential instance.
    """
    layers = tf.keras.layers
    stack = [
        layers.Conv2D(256, 3, activation='relu', input_shape=(28, 28, 1)),
        layers.Conv2D(256, 3, activation='relu'),
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dense(128, activation='relu'),
        # No activation on the last layer.
        layers.Dense(10),
    ]
    return tf.keras.Sequential(stack)

In [None]:
def get_dataset(batch_size, is_training=True):
    """Load MNIST through tensorflow_datasets as a batched tf.data pipeline.

    Args:
        batch_size: Number of examples per emitted batch.
        is_training: When true, load the 'train' split, shuffle it with a
            10000-element buffer and repeat it indefinitely. Otherwise load
            the 'test' split as a single unshuffled pass.

    Returns:
        A dataset of (image, label) batches whose images have been cast to
        float32 and divided by 255.
    """
    split = 'train' if is_training else 'test'
    # Only the dataset is needed, so the DatasetInfo is not requested.
    dataset = tfds.load(name='mnist', split=split, as_supervised=True,
                        try_gcs=True)

    def scale(image, label):
        """Cast the image to float32 and divide it by 255; pass the label through."""
        image = tf.cast(image, tf.float32)
        image /= 255.0
        return image, label

    dataset = dataset.map(scale)

    # Only shuffle and repeat the dataset in training. The advantage of having
    # an infinite dataset for training is to avoid the potential last partial
    # batch in each epoch, so users don't need to think about scaling the
    # gradients based on the actual batch size.
    if is_training:
        dataset = dataset.shuffle(10000)
        dataset = dataset.repeat()

    return dataset.batch(batch_size)

In [None]:
startTime = time.time()

# Batch size and the step counts derived from it (60000 and 10000 are the
# example counts this cell assumes for the train and test splits).
batch_size = 200
steps_per_epoch = 60000 // batch_size
validation_steps = 10000 // batch_size

# Create and compile the model under the strategy scope.
# Author's timing note: roughly 1 minute 0.6 seconds.
with strategy.scope():
    model = create_model()
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['sparse_categorical_accuracy'],
    )

train_dataset = get_dataset(batch_size, is_training=True)
test_dataset = get_dataset(batch_size, is_training=False)

# Train for 5 epochs, validating against the test split.
model.fit(
    train_dataset,
    epochs=5,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_dataset,
    validation_steps=validation_steps,
)
ft.functionTimer(start=startTime, roundedTo=2)

In [None]:
startTime = time.time()

# Same training run as the previous cell, but compiled with
# steps_per_execution. Author's timing notes: roughly 35.4 seconds under the
# strategy scope, versus roughly 11 minutes 26 seconds with
# `with tf.device('CPU:0'):` instead.
with strategy.scope():
    model = create_model()
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['sparse_categorical_accuracy'],
        # Anything between 2 and `steps_per_epoch` could help here.
        steps_per_execution=50,
    )

# Reuses train_dataset, test_dataset and the step counts set up earlier.
model.fit(
    train_dataset,
    epochs=5,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_dataset,
    validation_steps=validation_steps,
)

ft.functionTimer(start=startTime, roundedTo=2)

In [None]:
# The model, the optimizer and both metrics are created inside the strategy
# scope so that their variables can be mirrored on each device.
with strategy.scope():
    model = create_model()
    optimizer = tf.keras.optimizers.Adam()
    training_loss = tf.keras.metrics.Mean(
        name='training_loss', dtype=tf.float32)
    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='training_accuracy', dtype=tf.float32)

# Split the global batch size evenly across replicas (integer division), then
# let the strategy build the training dataset from a factory function. The
# factory ignores the argument the strategy passes to it.
per_replica_batch_size = batch_size // strategy.num_replicas_in_sync

train_dataset = strategy.distribute_datasets_from_function(
    lambda _input_context: get_dataset(per_replica_batch_size,
                                       is_training=True))

@tf.function
def train_step(iterator):
    """Run a single training step on the next batch from `iterator`.

    The batch is dispatched through `strategy.run`. The module-level `model`,
    `optimizer`, `training_loss` and `training_accuracy` are updated as a
    side effect; nothing is returned.

    Args:
        iterator: Iterator over the distributed training dataset; exactly one
            element is consumed per call.
    """

    def replica_step(batch):
        """The computation to run on each TPU device."""
        images, labels = batch
        with tf.GradientTape() as tape:
            logits = model(images, training=True)
            per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True)
            # Averaged against the global batch size, not the local one.
            loss = tf.nn.compute_average_loss(
                per_example_loss, global_batch_size=batch_size)

        variables = model.trainable_variables
        grads = tape.gradient(loss, variables)
        optimizer.apply_gradients(list(zip(grads, variables)))

        # The recorded loss is multiplied by the replica count before it is
        # added to the running mean.
        training_loss.update_state(loss * strategy.num_replicas_in_sync)
        training_accuracy.update_state(labels, logits)

    strategy.run(replica_step, args=(next(iterator),))

In [None]:
# Not used in this cell; kept because other cells may read it.
steps_per_eval = 10000 // batch_size
train_iterator = iter(train_dataset)

# Single source of truth for the epoch count (used by the loop and the banner).
num_epochs = 5

startTime = time.time()
for epoch in range(num_epochs):
    epochTime = time.time()
    # `epoch` is 0-based; show it 1-based so the banner runs 1/5 .. 5/5.
    print('Epoch: {}/{}'.format(epoch + 1, num_epochs))

    for step in range(steps_per_epoch):
        train_step(train_iterator)

    # optimizer.iterations is cumulative across epochs; loss and accuracy
    # cover only the current epoch because the metrics are reset below.
    print('step: {}, loss: {}, accuracy: {}%'.format(
        optimizer.iterations.numpy(),
        round(float(training_loss.result()), 4),
        round(float(training_accuracy.result()) * 100, 1)))

    print('this Epoch')
    ft.functionTimer(epochTime)

    # Start the next epoch with fresh metric accumulators.
    training_loss.reset_states()
    training_accuracy.reset_states()

print('total time')
ft.functionTimer(startTime, 2)

In [None]:
@tf.function
def train_multiple_steps(iterator, steps):
    """Run `steps` consecutive training steps inside one tf.function call.

    Each iteration pulls the next batch from `iterator` and dispatches it
    through `strategy.run`. The module-level `model`, `optimizer`,
    `training_loss` and `training_accuracy` are updated as a side effect;
    nothing is returned.

    Args:
        iterator: Iterator over the distributed training dataset; one element
            is consumed per step.
        steps: Number of steps to run; it is passed to `tf.range`.
    """

    def replica_step(batch):
        """The computation to run on each TPU device."""
        images, labels = batch
        with tf.GradientTape() as tape:
            logits = model(images, training=True)
            per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True)
            # Averaged against the global batch size, not the local one.
            loss = tf.nn.compute_average_loss(
                per_example_loss, global_batch_size=batch_size)

        variables = model.trainable_variables
        grads = tape.gradient(loss, variables)
        optimizer.apply_gradients(list(zip(grads, variables)))

        # The recorded loss is multiplied by the replica count before it is
        # added to the running mean.
        training_loss.update_state(loss * strategy.num_replicas_in_sync)
        training_accuracy.update_state(labels, logits)

    for _ in tf.range(steps):
        strategy.run(replica_step, args=(next(iterator),))

# `steps_per_epoch` is handed over as a `tf.Tensor` rather than a Python int
# so the `tf.function` won't get retraced if the value changes.
train_multiple_steps(
    train_iterator,
    tf.convert_to_tensor(steps_per_epoch),
)

# Report the optimizer's iteration count together with the current values of
# the running loss (4 decimals) and accuracy (percent, 2 decimals).
print(
    'step: {}, loss: {}, accuracy: {}%'.format(
        optimizer.iterations.numpy(),
        round(float(training_loss.result()), 4),
        round(float(training_accuracy.result()) * 100, 2),
    )
)