In [1]:
import logging
import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.python.client import device_lib

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
logging.info("Tensorflow Version: {}".format(tf.__version__))
logging.info("GPU {} available.".format(\
  "is" if tf.config.experimental.list_physical_devices("xGPU") else "is not"))

tfds.disable_progress_bar()

device_lib.list_local_devices()

2020-11-11 07:31:30,930 - INFO: Tensorflow Version: 2.3.0
2020-11-11 07:31:31,018 - INFO: GPU is not available.


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 15829556270385733425, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 16022850918267738524
 physical_device_desc: "device: XLA_CPU device"]

A way of checking whether using the cloud TPU runtime or not.

In [2]:
TPUFLAG = 'COLAB_TPU_ADDR'

if TPUFLAG not in os.environ:
  print("Error: Not connected to a TPU runtime.")
else:
  tpuAddress = "grpc://{}".format(os.environ[TPUFLAG])
  print("Cloud TPU Address: {}".format(tpuAddress))

Cloud TPU Address: grpc://10.0.7.226:8470


# A Demo using Tensorflow

## Using TFDS (**not available**)

Now the TPU runtime can't support TFDS well. Some operations in TFDS are not implemented or not fully supported in the TPU runtime. The following scripts are supported in the GPU runtime but not in TPU.

In [None]:
datasets, meta = tfds.load("fashion_mnist", as_supervised=True, with_info=True)

In [None]:
train, test = datasets["train"], datasets["test"]
for _img, _label in train.take(1):
  print("Image value ranges: {} to {}".format(np.min(_img), np.max(_img)))
  print("Image shape: {}".format(_img.shape))
  print("Image label: {}".format(_label))

print("Number of Train: {}".format(meta.splits["train"].num_examples))
print("Number of Test: {}".format(meta.splits["test"].num_examples))

In [None]:
def normalize(imgs, labels):
  imgs = tf.cast(imgs, tf.float32)
  imgs = (imgs - 127.5) / 127.5
  return imgs, labels

_train = train.map(normalize)
for _img, _label in _train.take(1):
  print("Image value ranges: {} to {}".format(np.min(_img), np.max(_img)))
  print("Image shape: {}".format(_img.shape))
  print("Image label: {}".format(_label))

In [None]:
trainDS = train.map(normalize).cache().shuffle(10240).batch(512)
testDS = test.map(normalize).cache().batch(512)

In [None]:
for _img, _label in trainDS.take(1):
  print("Image value ranges: {} to {}".format(np.min(_img), np.max(_img)))
  print("Image shape: {}".format(_img.shape))
  print("Image label: {}".format(_label))

## Using in-memory Datasets

Instead of using TFDS as the data pipeline, you can preprocess and construct the data pipeline from a CPU-based data generator, including the in-memory loader, etc.

In [3]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [4]:
# normalize the data
x_train = np.expand_dims((x_train - 127.5) / 127.5, axis=-1)
x_test = np.expand_dims((x_test - 127.5) / 127.5, axis=-1)

In [5]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
print(np.max(x_train), np.min(x_train))

(60000, 28, 28, 1) (60000,) (10000, 28, 28, 1) (10000,)
1.0 -1.0


## Define the Model

In [6]:
def buildModel():
  def modelBody(inputs):
    x = tf.keras.layers.Conv2D(64, (3, 3), (1, 1), padding="same")(inputs)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), padding="same")(x)
    x = tf.keras.layers.Activation('relu')(x)

    x = tf.keras.layers.Conv2D(128, (5, 5), (1, 1), padding="same")(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), padding="same")(x)
    x = tf.keras.layers.Activation('relu')(x)

    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(10)(x)
    y = tf.keras.layers.Activation('softmax')(x)
    return y

  inputs = tf.keras.layers.Input(shape=(28, 28, 1))
  outputs = modelBody(inputs)
  model = tf.keras.Model(inputs, outputs)
  return model

cnnModel = buildModel()
cnnModel.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 28, 28, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 28, 28, 64)        640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 64)        0         
_________________________________________________________________
activation (Activation)      (None, 14, 14, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 14, 14, 128)       204928    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 7, 7, 128)         0         
_________________________________________________________________
activation_1 (Activation)    (None, 7, 7, 128)        

In [7]:
# an simple inference
y = cnnModel.predict(x_train[:32, ...])
print(y.shape)

(32, 10)


## Define the Strategy

In this step, we are going to define the strategy of how to use the TPU system as our training runtime.

In [8]:
# connect to the Cloud TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

# connect to and initialize the TPU system
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# start a TPU-based strategy
strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("There is(are) {} TPU runtime(s).".format(strategy.num_replicas_in_sync))

2020-11-11 07:31:50,167 - INFO: Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.0.7.226:8470


2020-11-11 07:31:50,169 - INFO: Initializing the TPU system: grpc://10.0.7.226:8470


INFO:tensorflow:Clearing out eager caches


2020-11-11 07:32:06,585 - INFO: Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


2020-11-11 07:32:06,592 - INFO: Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


2020-11-11 07:32:06,603 - INFO: Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


2020-11-11 07:32:06,606 - INFO: *** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


2020-11-11 07:32:06,608 - INFO: *** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


2020-11-11 07:32:06,613 - INFO: *** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


2020-11-11 07:32:06,615 - INFO: *** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


2020-11-11 07:32:06,618 - INFO: *** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


2020-11-11 07:32:06,623 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


2020-11-11 07:32:06,625 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


2020-11-11 07:32:06,627 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


2020-11-11 07:32:06,628 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


2020-11-11 07:32:06,630 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


2020-11-11 07:32:06,633 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


2020-11-11 07:32:06,634 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


2020-11-11 07:32:06,636 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


2020-11-11 07:32:06,637 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


2020-11-11 07:32:06,639 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


2020-11-11 07:32:06,640 - INFO: *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


There is(are) 8 TPU runtime(s).


## Training via TF.Keras

In tensorflow, you can build the model under the scope of the TPU strategy to make the training using the TPU system. The following is an example using Tensorflow.Keras to train a model under the TPU runtime.

In [24]:
with strategy.scope():
  model = buildModel()
  model.compile(optimizer=tf.keras.optimizers.Adam(),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [25]:
model.fit(x=x_train.astype(np.float32), 
          y=y_train.astype(np.float32), 
          epochs=10)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff1ca177da0>

## Training via TF.Core

You can also train a model using the native Tensorflow Core. **Notice the training using the TPU is similar to the one using multiple workers. You have to customize the operation calculating the loss value.**

In [18]:
EPOCHS = 15
BATCHSIZE = 1000 * strategy.num_replicas_in_sync
TRAINSTEP = x_train.shape[0] // BATCHSIZE
TESTSTEP = x_test.shape[0] // BATCHSIZE

# TPU doesn't support the labels in type uint8
y_train = y_train.astype('int')
y_test = y_test.astype('int')

logging.info("There are {} workers.".format(strategy.num_replicas_in_sync))
logging.info("Batch size: {}".format(BATCHSIZE))

2020-11-11 07:42:22,402 - INFO: There are 8 workers.
2020-11-11 07:42:22,404 - INFO: Batch size: 8000


In [19]:
with strategy.scope():
  lossObject = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE)
  
  def computeOneWorkerLoss(labels, predictions):
    lossPerExample = lossObject(labels, predictions)
    return tf.nn.compute_average_loss(
      per_example_loss=lossPerExample,
      global_batch_size=BATCHSIZE)

  lossTrain = tf.keras.metrics.Mean()
  lossTest = tf.keras.metrics.Mean()

  accTrain = tf.keras.metrics.SparseCategoricalAccuracy()
  accTest = tf.keras.metrics.SparseCategoricalAccuracy()

  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  modelCore = buildModel()

In [20]:
with strategy.scope():
  @tf.function
  def trainStep(inputs, labels):
    with tf.GradientTape() as tape:
      outputs = modelCore(inputs, training=True)
      losses = computeOneWorkerLoss(labels, outputs)
    
    grads = tape.gradient(losses, modelCore.trainable_variables)
    optimizer.apply_gradients(zip(grads, modelCore.trainable_variables))

    accTrain.update_state(labels, outputs)
    return losses
  
  @tf.function
  def testStep(inputs, labels):
    outputs = modelCore(inputs, training=False)
    losses = lossObject(labels, outputs)

    accTest.update_state(labels, outputs)
    return losses

  @tf.function
  def distributeTrainStep(inputs, labels):
    lossPerExample = strategy.run(fn=trainStep,
                                  args=(inputs, labels))
    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                           lossPerExample, 
                           axis=None)
    
  @tf.function
  def distributeTestStep(inputs, labels):
    lossPerExample = strategy.run(fn=testStep,
                                  args=(inputs, labels))
    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                           lossPerExample, 
                           axis=None)
  
  for epoch in range(EPOCHS):

    lossTrain.reset_states()
    lossTest.reset_states()
    accTrain.reset_states()
    accTest.reset_states()

    # train step
    for step in range(TRAINSTEP):

      trainStart = 0 + BATCHSIZE * step
      trainEnd = BATCHSIZE* (step + 1)
      trainDataBatch = x_train[trainStart:trainEnd, ...]
      trainLabelBatch = y_train[trainStart:trainEnd, ...]

      trainLosses = distributeTrainStep(trainDataBatch, trainLabelBatch)
      lossTrain.update_state(trainLosses)

    if (epoch + 1) % 2 == 0:
      # test step
      for step in range(TESTSTEP):

        testStart = 0 + BATCHSIZE * step
        testEnd = BATCHSIZE* (step + 1)
        testDataBatch = x_test[testStart:testEnd, ...]
        testLabelBatch = y_test[testStart:testEnd, ...]

        testLosses = distributeTestStep(testDataBatch, testLabelBatch)
        lossTest.update_state(testLosses)      

    lossTrainVal = lossTrain.result()
    lossTestVal = lossTest.result()
    accTrainVal = accTrain.result()
    accTestVal = accTest.result()

    if (epoch + 1) % 2 == 0:
      # test step 
      print("Epoch {}: Loss: {:.4f}, Accuracy: {:.2%}, Test Loss: {:.4f}, Test Accuracy: {:.2%}".format(
        epoch + 1, lossTrainVal, accTrainVal, lossTestVal, accTestVal
      )) 
    else:
      print("Epoch {}: Loss: {:.4f}, Accuracy: {:.2%}".format(
        epoch + 1, lossTrainVal, accTrainVal
      ))   

Epoch 1: Loss: 13.0848, Accuracy: 52.64%
Epoch 2: Loss: 6.3728, Accuracy: 71.62%, Test Loss: 5.5703, Test Accuracy: 75.34%
Epoch 3: Loss: 5.1184, Accuracy: 77.58%
Epoch 4: Loss: 4.4326, Accuracy: 80.11%, Test Loss: 4.2603, Test Accuracy: 80.77%
Epoch 5: Loss: 4.0045, Accuracy: 82.34%
Epoch 6: Loss: 3.7199, Accuracy: 83.68%, Test Loss: 3.7390, Test Accuracy: 83.69%
Epoch 7: Loss: 3.4972, Accuracy: 84.57%
Epoch 8: Loss: 3.3368, Accuracy: 85.29%, Test Loss: 3.4557, Test Accuracy: 84.66%
Epoch 9: Loss: 3.2042, Accuracy: 85.91%
Epoch 10: Loss: 3.0961, Accuracy: 86.45%, Test Loss: 3.2691, Test Accuracy: 85.30%
Epoch 11: Loss: 3.0007, Accuracy: 86.75%
Epoch 12: Loss: 2.9191, Accuracy: 87.18%, Test Loss: 3.1295, Test Accuracy: 86.18%
Epoch 13: Loss: 2.8562, Accuracy: 87.39%
Epoch 14: Loss: 2.8006, Accuracy: 87.62%, Test Loss: 3.0273, Test Accuracy: 86.49%
Epoch 15: Loss: 2.7529, Accuracy: 87.83%
