Entrenamiento distribuido con Keras --- 0:00 min
===

* Última modificación: Marzo 1, 2022 | YouTube

Importación de librerías
---

In [1]:
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import tensorflow_datasets as tfds

%load_ext tensorboard

Descarga del dataset
---

In [2]:
# Load MNIST through TensorFlow Datasets together with its metadata object.
datasets, info = tfds.load("mnist", as_supervised=True, with_info=True)

# Keep a separate handle for each split.
mnist_train = datasets["train"]
mnist_test = datasets["test"]

Definición de la estrategia de distribución
---

In [3]:
# Distribution strategy shared by the rest of the notebook.
strategy = tf.distribute.MirroredStrategy()

print(f"Number of devices: {strategy.num_replicas_in_sync}")





INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


Number of devices: 1


Tubería de entrada
---

In [4]:
# Size of the shuffle buffer.
BUFFER_SIZE = 10000

# The global batch size grows with the number of replicas kept in sync.
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE_PER_REPLICA

# Number of examples in each split, as reported by the dataset metadata.
num_train_examples, num_test_examples = (
    info.splits[split_name].num_examples for split_name in ("train", "test")
)

In [5]:
def scale(image, label):
    """Cast the image to float32 and divide it by 255; the label passes through unchanged.

    Returns:
        A ``(scaled_image, label)`` tuple.
    """
    scaled_image = tf.cast(image, tf.float32) / 255
    return scaled_image, label

In [6]:
# Training pipeline: scale -> cache -> shuffle -> batch.
train_dataset = (
    mnist_train.map(scale)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [7]:
# Evaluation pipeline: scale and batch only (no cache, no shuffle).
eval_dataset = (
    mnist_test.map(scale)
    .batch(BATCH_SIZE)
)

Creación del modelo
---

In [8]:
# The model is both created and compiled inside the strategy scope.
with strategy.scope():
    model = tf.keras.Sequential()
    model.add(
        tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1))
    )
    model.add(tf.keras.layers.MaxPooling2D())
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64, activation="relu"))
    # The last layer has no activation: the loss below is configured with from_logits=True.
    model.add(tf.keras.layers.Dense(10))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

Definición de los callbacks
---

In [9]:
# Directory where the training checkpoints are written.
checkpoint_dir = "/tmp/distributed_training/training_checkpoints"

# Checkpoint file name template; the "{epoch}" placeholder is kept literally here
# and is consumed later by the ModelCheckpoint callback's `filepath` argument.
checkpoint_prefix = os.path.join(
    checkpoint_dir,
    "ckpt_{epoch}",
)

In [10]:
#
# Step schedule used to lower the learning rate as training advances.
#
def decay(epoch):
    """Return the learning rate for the given epoch index.

    Epochs below 3 use 1e-3, epochs 3 through 6 use 1e-4, and every
    later epoch uses 1e-5.
    """
    if epoch < 3:
        return 1e-3
    if epoch < 7:
        return 1e-4
    return 1e-5

In [11]:
#
# Callback that prints the learning rate at the end of each epoch.
#
class PrintLR(tf.keras.callbacks.Callback):
    """Print the optimizer's current learning rate whenever an epoch finishes."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the learning rate of the model this callback is attached to.

        Args:
            epoch: Epoch index received from Keras; printed as ``epoch + 1``.
            logs: Metrics dictionary supplied by Keras (unused).
        """
        # Read the optimizer from `self.model` (set by Keras on the callback)
        # instead of the notebook-global `model`, so the callback reports the
        # model actually being trained and works without that global.
        learning_rate = self.model.optimizer.learning_rate.numpy()
        print(
            "\nLearning rate for epoch {} is {}".format(epoch + 1, learning_rate)
        )

In [12]:
#
# Callbacks passed to model.fit, in the order in which they are registered.
#
callbacks = []

# Write TensorBoard logs.
callbacks.append(tf.keras.callbacks.TensorBoard(log_dir="/tmp/logs"))

# Save the model weights (weights only) using the checkpoint name template.
callbacks.append(
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True,
    )
)

# Apply the `decay` learning-rate schedule.
callbacks.append(tf.keras.callbacks.LearningRateScheduler(decay))

# Print the learning rate at the end of every epoch.
callbacks.append(PrintLR())

Entrenamiento y evaluación
---

In [13]:
# Number of passes over the training dataset.
EPOCHS = 12

# Training. The call is left as the last expression so the cell still
# displays the returned History object.
model.fit(x=train_dataset, epochs=EPOCHS, callbacks=callbacks)

Epoch 1/12
Learning rate for epoch 1 is 0.0010000000474974513
Epoch 2/12
Learning rate for epoch 2 is 0.0010000000474974513
Epoch 3/12
Learning rate for epoch 3 is 0.0010000000474974513
Epoch 4/12
Learning rate for epoch 4 is 9.999999747378752e-05
Epoch 5/12
Learning rate for epoch 5 is 9.999999747378752e-05
Epoch 6/12
Learning rate for epoch 6 is 9.999999747378752e-05
Epoch 7/12
Learning rate for epoch 7 is 9.999999747378752e-05
Epoch 8/12
Learning rate for epoch 8 is 9.999999747378752e-06
Epoch 9/12
Learning rate for epoch 9 is 9.999999747378752e-06
Epoch 10/12
Learning rate for epoch 10 is 9.999999747378752e-06
Epoch 11/12
Learning rate for epoch 11 is 9.999999747378752e-06
Epoch 12/12
Learning rate for epoch 12 is 9.999999747378752e-06


<keras.callbacks.History at 0x7f5079fb5040>

In [14]:
#
# List the contents of the checkpoint directory
#
!ls {checkpoint_dir}

checkpoint		     ckpt_4.data-00000-of-00001
ckpt_1.data-00000-of-00001   ckpt_4.index
ckpt_1.index		     ckpt_5.data-00000-of-00001
ckpt_10.data-00000-of-00001  ckpt_5.index
ckpt_10.index		     ckpt_6.data-00000-of-00001
ckpt_11.data-00000-of-00001  ckpt_6.index
ckpt_11.index		     ckpt_7.data-00000-of-00001
ckpt_12.data-00000-of-00001  ckpt_7.index
ckpt_12.index		     ckpt_8.data-00000-of-00001
ckpt_2.data-00000-of-00001   ckpt_8.index
ckpt_2.index		     ckpt_9.data-00000-of-00001
ckpt_3.data-00000-of-00001   ckpt_9.index
ckpt_3.index


In [15]:
#
# Evaluation
#

# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with an explicit message instead of passing None on
# to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {}".format(checkpoint_dir)
    )

# Restore the weights of the most recent checkpoint before evaluating.
model.load_weights(latest_checkpoint)

eval_loss, eval_acc = model.evaluate(eval_dataset)

print("Eval loss: {}, Eval accuracy: {}".format(eval_loss, eval_acc))

Eval loss: 0.04153656214475632, Eval accuracy: 0.9857000112533569


Para lanzar TensorBoard use:

```python
%tensorboard --logdir /tmp/logs
```

![tensorboard_distributed_training_with_keras](assets/tensorboard_distributed_training_with_keras.png)

In [16]:
# Open TensorBoard on /tmp/logs, the log_dir given to the TensorBoard callback.
%tensorboard --logdir /tmp/logs/

Exportación del modelo
---

In [18]:
# Directory that receives the exported model.
path = "/tmp/saved_model/"

# Export the whole model with save_format="tf".
model.save(filepath=path, save_format="tf")

INFO:tensorflow:Assets written to: /tmp/saved_model/assets


INFO:tensorflow:Assets written to: /tmp/saved_model/assets


In [19]:
#
# Load the exported model back from the directory (no strategy scope is used here)
#
unreplicated_model = tf.keras.models.load_model(path)

# Compile the loaded model again with a fresh optimizer, loss and metric.
unreplicated_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

eval_loss, eval_acc = unreplicated_model.evaluate(eval_dataset)

print(f"Eval loss: {eval_loss}, Eval Accuracy: {eval_acc}")

Eval loss: 0.04153656214475632, Eval Accuracy: 0.9857000112533569
