Entrenamiento personalizado con tf.distribute.Strategy --- 0:00 min
===

* Última modificación: Marzo 1, 2022 | YouTube

Importación de librerías
---

In [2]:
import os

# Set before the TensorFlow import below.
# NOTE(review): "2" is presumably meant to hide TensorFlow's native INFO and
# WARNING log lines — confirm against the TensorFlow logging documentation.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf

Descarga de datos
---

In [3]:
import numpy as np

# Fetch the Fashion-MNIST train/test splits through the Keras dataset helper.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.fashion_mnist.load_data()
)

# Append a trailing axis to each image array and divide by a float32 255.
train_images = train_images[..., np.newaxis] / np.float32(255)
test_images = test_images[..., np.newaxis] / np.float32(255)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


NameError: name 'np' is not defined

Creación de la estrategia
---

In [4]:
# Distribution strategy object shared by the cells that follow.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


In [5]:
# Report how many replicas the strategy keeps in sync.
replica_count = strategy.num_replicas_in_sync
print("Number of devices: {}".format(replica_count))

Number of devices: 1


Tubería de entrada
---

In [7]:
# Number of passes over the training data.
EPOCHS = 10

# One per-replica batch for every replica the strategy keeps in sync.
BATCH_SIZE_PER_REPLICA = 64
GLOBAL_BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE_PER_REPLICA

# Shuffle buffer as large as the whole training set.
BUFFER_SIZE = len(train_images)

In [8]:
# Training pipeline: slice the arrays, shuffle, then group into global batches.
train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(GLOBAL_BATCH_SIZE)

# Test pipeline: same slicing and batching, without shuffling.
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels))
test_dataset = test_dataset.batch(GLOBAL_BATCH_SIZE)

# Hand both pipelines to the strategy so it can distribute them.
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

Creación del modelo
---

In [10]:
def create_model():
    """Build a fresh Keras Sequential classifier.

    The stack is two Conv2D (ReLU) + MaxPooling2D stages (32 then 64 filters,
    kernel size 3), a Flatten layer, a 64-unit ReLU Dense layer and a final
    10-unit Dense layer with no activation.

    Returns:
        The newly constructed ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    stack = [
        layers.Conv2D(32, 3, activation="relu"),
        layers.MaxPooling2D(),
        layers.Conv2D(64, 3, activation="relu"),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(64, activation="relu"),
        layers.Dense(10),
    ]
    return tf.keras.Sequential(stack)

In [9]:
#
# Directory where the checkpoints are stored
#
checkpoint_dir = "/tmp/training_checkpoints"

#
# Path prefix shared by the checkpoint files
#
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

Definición de la función de pérdida
----

In [11]:
with strategy.scope():
    # Reduction.NONE: the loss object does no reduction of its own; the
    # averaging is delegated to compute_loss below.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.NONE,
    )

    def compute_loss(labels, predictions):
        """Return the unreduced loss averaged by ``tf.nn.compute_average_loss``.

        Args:
            labels: Targets passed straight to ``loss_object``.
            predictions: Model outputs (``loss_object`` is built with
                ``from_logits=True``).

        Returns:
            The result of ``tf.nn.compute_average_loss`` called with
            ``global_batch_size=GLOBAL_BATCH_SIZE``.
        """
        example_losses = loss_object(labels, predictions)
        averaged = tf.nn.compute_average_loss(
            example_losses, global_batch_size=GLOBAL_BATCH_SIZE
        )
        return averaged

Métricas
---

In [12]:
with strategy.scope():
    # Running mean of the loss values fed in by test_step.
    test_loss = tf.keras.metrics.Mean(name="test_loss")

    # Accuracy trackers, one per split; both are reset after every epoch.
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name="train_accuracy"
    )
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name="test_accuracy"
    )

Ciclo de entrenamiento
---

In [13]:
# Model, optimizer and checkpoint are all created inside the strategy scope.
with strategy.scope():

    model = create_model()

    optimizer = tf.keras.optimizers.Adam()

    # Tracks both objects so checkpoint.save() in the training loop stores them together.
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

In [14]:
def train_step(inputs):
    """Run one optimisation step on a single batch.

    Args:
        inputs: An ``(images, labels)`` pair.

    Returns:
        The loss computed by ``compute_loss`` for this batch.
    """
    batch_images, batch_labels = inputs

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as grad_tape:
        logits = model(batch_images, training=True)
        step_loss = compute_loss(batch_labels, logits)

    weights = model.trainable_variables
    grads = grad_tape.gradient(step_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_accuracy.update_state(batch_labels, logits)
    return step_loss


def test_step(inputs):
    """Evaluate one batch and fold it into the test metrics.

    Args:
        inputs: An ``(images, labels)`` pair.

    Returns:
        None. ``test_loss`` and ``test_accuracy`` are updated in place.
    """
    batch_images, batch_labels = inputs

    logits = model(batch_images, training=False)
    # loss_object is called directly here, not through compute_loss.
    batch_loss = loss_object(batch_labels, logits)

    test_loss.update_state(batch_loss)
    test_accuracy.update_state(batch_labels, logits)

In [15]:
@tf.function
def distributed_train_step(dataset_inputs):
    """Run ``train_step`` through the strategy and SUM-reduce its losses.

    Args:
        dataset_inputs: One element of the distributed training dataset.

    Returns:
        The per-replica losses combined with ``ReduceOp.SUM``.
    """
    replica_losses = strategy.run(train_step, args=(dataset_inputs,))
    summed_loss = strategy.reduce(
        tf.distribute.ReduceOp.SUM, replica_losses, axis=None
    )
    return summed_loss


@tf.function
def distributed_test_step(dataset_inputs):
    """Run ``test_step`` through the strategy.

    Args:
        dataset_inputs: One element of the distributed test dataset.

    Returns:
        Whatever ``strategy.run`` returns for ``test_step``.
    """
    step_result = strategy.run(test_step, args=(dataset_inputs,))
    return step_result


# The report template is identical for every epoch, so build it once.
template = "Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, " "Test Accuracy: {}"

for epoch in range(EPOCHS):
    #
    # Training: accumulate the reduced loss of every batch, then average it.
    #
    total_loss = 0.0
    num_batches = 0
    for x in train_dist_dataset:
        total_loss += distributed_train_step(x)
        num_batches += 1
    train_loss = total_loss / num_batches

    #
    # Evaluation: test_step updates test_loss / test_accuracy in place.
    #
    for x in test_dist_dataset:
        distributed_test_step(x)

    # Save a checkpoint on every other epoch (epoch index 0, 2, 4, ...).
    if epoch % 2 == 0:
        checkpoint.save(checkpoint_prefix)

    print(
        template.format(
            epoch + 1,
            train_loss,
            train_accuracy.result() * 100,
            test_loss.result(),
            test_accuracy.result() * 100,
        )
    )

    # Clear the metrics so the next epoch starts from zero.
    # reset_state() replaces the deprecated reset_states() spelling, which
    # newer Keras releases no longer provide.
    test_loss.reset_state()
    train_accuracy.reset_state()
    test_accuracy.reset_state()

INFO:tensorflow:Error reported to Coordinator: in user code:

    File "/tmp/ipykernel_2148/3476027335.py", line 6, in train_step  *
        predictions = model(images, training=True)
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
        raise e.with_traceback(filtered_tb) from None

    TypeError: Exception encountered when calling layer "conv2d" (type Conv2D).
    
    Value passed to parameter 'input' has DataType uint8 not in list of allowed values: float16, bfloat16, float32, float64, int32
    
    Call arguments received:
      • inputs=tf.Tensor(shape=(64, 28, 28, 1), dtype=uint8)
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/training/coordinator.py", line 293, in stop_on_exception
    yield
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py", line 342, in run
    self.main_result = self.main_fn(*self.main_args, **self.

TypeError: in user code:

    File "/tmp/ipykernel_2148/4171982643.py", line 3, in distributed_train_step  *
        per_replica_losses = strategy.run(train_step, args=(dataset_inputs,))
    File "/tmp/ipykernel_2148/3476027335.py", line 6, in train_step  *
        predictions = model(images, training=True)
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
        raise e.with_traceback(filtered_tb) from None

    TypeError: Exception encountered when calling layer "conv2d" (type Conv2D).
    
    Value passed to parameter 'input' has DataType uint8 not in list of allowed values: float16, bfloat16, float32, float64, int32
    
    Call arguments received:
      • inputs=tf.Tensor(shape=(64, 28, 28, 1), dtype=uint8)


Restauración del modelo desde el último checkpoint y prueba
---

In [None]:
# Fresh metric, model and optimizer, independent of the training objects.
eval_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="eval_accuracy")
new_model = create_model()
new_optimizer = tf.keras.optimizers.Adam()

# Rebuild the plain (non-distributed) test pipeline.
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels))
test_dataset = test_dataset.batch(GLOBAL_BATCH_SIZE)

In [None]:
@tf.function
def eval_step(images, labels):
    """Score one batch with ``new_model`` and feed ``eval_accuracy``.

    Args:
        images: Batch passed to ``new_model`` with ``training=False``.
        labels: Targets handed to ``eval_accuracy`` with the model output.

    Returns:
        None.
    """
    logits = new_model(images, training=False)
    eval_accuracy(labels, logits)

Mecanismos alternativos para iterar sobre un dataset
---

**Iteradores**

In [None]:
template = "Epoch {}, Loss: {}, Accuracy: {}"

# Bind the loop variable as `epoch`: the report below uses it, and the
# original `_` left it pointing at a stale value from an earlier cell.
for epoch in range(EPOCHS):
    total_loss = 0.0
    num_batches = 0
    train_iter = iter(train_dist_dataset)

    # Consume only a fixed number of batches (10) from the iterator per epoch.
    for _ in range(10):
        total_loss += distributed_train_step(next(train_iter))
        num_batches += 1
    average_train_loss = total_loss / num_batches

    print(template.format(epoch + 1, average_train_loss, train_accuracy.result() * 100))

    # reset_state() replaces the deprecated reset_states() spelling, which
    # newer Keras releases no longer provide.
    train_accuracy.reset_state()

**Iteración dentro de tf.function**

In [None]:
@tf.function
def distributed_train_epoch(dataset):
    """Run ``train_step`` over every element of ``dataset`` and return the mean loss.

    Args:
        dataset: Iterable whose elements are handed to ``train_step`` through
            ``strategy.run``. The caller in this notebook passes
            ``train_dist_dataset``.

    Returns:
        The accumulated SUM-reduced loss divided by the number of batches
        iterated (the count is cast to float32 before dividing).
    """
    total_loss = 0.0
    num_batches = 0
    for x in dataset:
        # Dispatch train_step through the strategy, then SUM-reduce its result.
        per_replica_losses = strategy.run(train_step, args=(x,))
        total_loss += strategy.reduce(
            tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None
        )
        num_batches += 1
    return total_loss / tf.cast(num_batches, dtype=tf.float32)


template = "Epoch {}, Loss: {}, Accuracy: {}"

for epoch in range(EPOCHS):
    # The whole pass over the dataset happens inside distributed_train_epoch.
    train_loss = distributed_train_epoch(train_dist_dataset)

    print(template.format(epoch + 1, train_loss, train_accuracy.result() * 100))

    # reset_state() replaces the deprecated reset_states() spelling, which
    # newer Keras releases no longer provide.
    train_accuracy.reset_state()