# How to do distributed training on Azure ML service with tf.estimator & tf.data datasets using Horovod

This notebook demonstrates how to perform distributed training using tf.estimator and tf.data with Horovod in Azure Machine Learning.

Let's import the required Azure ML packages and define the needed constants.

In [None]:
import azureml
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow
from azureml.widgets import RunDetails
from azureml.core.runconfig import MpiConfiguration


# Identifiers passed to Workspace(...) in the workspace-initialization cell.
# They are left blank here -- presumably they must be filled in with your own
# Azure subscription, resource group and workspace before running (TODO confirm).
SUBSCRIPTION_ID = ""
RESOURCE_GROUP = ""
WORKSPACE_NAME = ""

# CLUSTER_NAME is the compute target name looked up in the compute cell;
# PROJECT_FOLDER is used as the estimator's source_directory.
CLUSTER_NAME = "gpucluster"
PROJECT_FOLDER = "./"

# Show which Azure ML SDK version this kernel is using.
print("SDK version:", azureml.core.VERSION)

## Initialize Azure ML workspace

In [None]:
# Build the workspace handle from the identifiers set in the configuration cell.
ws = Workspace(
    subscription_id=SUBSCRIPTION_ID,
    resource_group=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

# Persist the workspace details via write_config() -- presumably to a local
# config file so later sessions can reload them (verify against the SDK docs).
ws.write_config()

## Initialize Azure ML compute

In [None]:
# Reuse the compute target called CLUSTER_NAME when the workspace already has
# one; otherwise provision a new AmlCompute cluster under that same name.
try:
    gpu_cluster = ComputeTarget(workspace=ws, name=CLUSTER_NAME)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # max_nodes=2 matches the node_count=2 requested by the estimator cell.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=2,
    )
    # Create the cluster under CLUSTER_NAME -- it must be the name used for the
    # lookup above so that a re-run of this cell finds and reuses the cluster.
    gpu_cluster = ComputeTarget.create(ws, CLUSTER_NAME, compute_config)

# Wait for the compute target operation to complete, showing its output.
gpu_cluster.wait_for_completion(show_output=True)

## Initialize TensorFlow estimator

In [None]:
# Describe the training job: run train.py from PROJECT_FOLDER on the GPU
# cluster, spread over two nodes with an MPI distributed-training configuration.
estimator = TensorFlow(
    # What to run
    source_directory=PROJECT_FOLDER,
    entry_script='train.py',
    # Where to run it
    compute_target=gpu_cluster,
    use_gpu=True,
    # How to distribute it
    node_count=2,
    distributed_training=MpiConfiguration(),
)

## Create experiment and submit run for execution

In [None]:
# Create (or attach to) the experiment in the workspace.
experiment = Experiment(workspace=ws, name="tf_estimator_horovod")

# Submit the estimator as a run of that experiment.
run = experiment.submit(estimator)

# Display the run-details widget for the submitted run.
RunDetails(run).show()