In [None]:
pip install horovod[tensorflow] tensorflow


In [None]:
import tensorflow as tf
import horovod.tensorflow as hvd
import horovod.tensorflow.keras as hvd_keras


In [None]:
# Start Horovod first so the rank/size queries below are valid in this process.
hvd.init()

# One GPU per process: turn on on-demand memory growth for every physical GPU,
# then expose only the device indexed by this process's local rank.
# With no GPUs present the loop and the guard below are both no-ops.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

print(f"Horovod initialized with rank {hvd.rank()} out of {hvd.size()} processes.")


In [None]:
# Load the CIFAR-10 train and test splits.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Scale pixel values to [0, 1]. Casting to float32 first keeps the image
# arrays from being promoted to float64 by the division.
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# One-hot encode the labels over the 10 classes.
y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

# Build this worker's training pipeline.
# Shard FIRST, on individual examples, and only then shuffle and batch:
#   * sharding after an unseeded shuffle lets every worker pick batches from
#     its own random ordering, so the shards are not a disjoint partition of
#     the training set;
#   * sharding early also means each worker only shuffles and batches the
#     examples it will actually train on.
# NOTE(review): shard sizes can differ by one example when the dataset size
# is not divisible by hvd.size(); confirm per-worker step counts still match.
batch_size = 128
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shard(num_shards=hvd.size(), index=hvd.rank())
    .shuffle(10000)
    .batch(batch_size)
)

# The evaluation set is not sharded: every worker sees the full test split.
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)


In [None]:
def create_model():
    """Build the (uncompiled) CNN classifier used in this notebook.

    The network takes 32x32x3 inputs, applies two Conv2D + MaxPooling2D
    stages, flattens, and finishes with a 64-unit ReLU layer and a 10-way
    softmax output.

    Returns:
        A `tf.keras.Sequential` model; compiling it is left to the caller.
    """
    layers = tf.keras.layers

    convolutional_stages = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
    ]
    classification_head = [
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ]
    return tf.keras.Sequential(convolutional_stages + classification_head)


model = create_model()


In [None]:
# Scale the Adam learning rate (0.001) by the number of Horovod workers, then
# wrap the optimizer in Horovod's DistributedOptimizer before handing it to
# Keras.
opt = hvd.DistributedOptimizer(
    tf.keras.optimizers.Adam(0.001 * hvd.size())
)

# Compile with categorical cross-entropy (labels are one-hot) and track accuracy.
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)


In [None]:
# Horovod's Keras callbacks live in `horovod.tensorflow.keras` (imported as
# `hvd_keras`); the plain `horovod.tensorflow` module bound to `hvd` has no
# `callbacks` attribute, so `hvd.callbacks...` raises AttributeError.
callbacks = [
    # Broadcast the initial variable state from rank 0 to every other worker.
    hvd_keras.callbacks.BroadcastGlobalVariablesCallback(0),
    # Average the epoch-end metrics across workers.
    hvd_keras.callbacks.MetricAverageCallback(),
]

# Only the root rank writes checkpoints; one file per epoch via `{epoch}`.
if hvd.rank() == 0:
    callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))


In [None]:
# Fit for 10 epochs, validating on the test dataset after each one.
# Only rank 0 shows Keras progress output; every other rank runs silently.
fit_verbosity = 1 if hvd.rank() == 0 else 0

history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=10,
    callbacks=callbacks,
    verbose=fit_verbosity,
)


In [None]:
# Final evaluation runs on the root rank only, which reports the accuracy.
if hvd.rank() == 0:
    evaluation = model.evaluate(test_dataset)
    # With a single compiled metric, evaluate() yields [loss, accuracy].
    test_loss, test_acc = evaluation
    print(f"Test Accuracy: {test_acc:.4f}")


In [None]:
horovodrun -np 4 -H localhost:4 python train.py


In [None]:
horovodrun -np 8 -H localhost:4,node2:4 python train.py
