In [None]:
# Import the Azure ML SDK and print its version so it can be compared with the
# azureml-* package versions pinned in conda_dependencies.yml later on.
import azureml.core
print("SDK version:", azureml.core.VERSION)

In [None]:
from azureml.core.workspace import Workspace

# Load the workspace via `Workspace.from_config()` and echo its identifying
# details, one per line, as a quick sanity check before using it.
ws = Workspace.from_config()
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute

# Look up an existing compute cluster by name. Nothing is created here, so the
# cluster must already exist in the workspace.
# NOTE(review): presumably `ws.compute_targets` is keyed by target name, so an
# unknown name fails on the lookup line below -- confirm against the SDK docs.
cluster_name = "nc24-cluster" # replace with your own gpu cluster
compute_target = ws.compute_targets[cluster_name]
# Bare last expression: show the resolved compute target as the cell output.
compute_target

## TensorFlow 2.x - distributed training with Horovod

In [None]:
from azureml.core import Experiment

# Experiment named 'tf2-horovod' in workspace `ws`; the training run is
# submitted to it further down via `exp.submit(...)`.
exp = Experiment(workspace=ws, name='tf2-horovod')

In [None]:
from azureml.core import Environment

# Get a curated environment purely as a reference: its definition is saved to
# disk so it can be inspected when writing the custom environment.
reference_env = Environment.get(ws, name='AzureML-TensorFlow-2.1-GPU')

# overwrite=True keeps this cell re-runnable. Without it, save_to_directory
# refuses to write into a directory that already exists, so a second
# "Restart & Run All" would fail here. The saved copy is reference-only, so
# replacing it on each run is safe.
reference_env.save_to_directory(path='./AzureML-TensorFlow-2.1-GPU', overwrite=True)

In [None]:
%%writefile conda_dependencies.yml
# Conda specification for the TensorFlow 2.2 + Horovod training environment.
channels:
- conda-forge
dependencies:
- python=3.6.2
# List pip explicitly: conda warns when an environment file contains a `pip:`
# section but does not name pip itself among the conda dependencies.
- pip
- pip:
  # Azure ML SDK packages, all pinned to the same release.
  - azureml-core==1.10.0
  - azureml-defaults==1.10.0
  - azureml-telemetry==1.10.0
  - azureml-train-restclients-hyperdrive==1.10.0
  - azureml-train-core==1.10.0
  # Training framework and distributed-training library.
  - tensorflow-gpu==2.2.0
  - horovod==0.19.5

In [None]:
# Build the training environment from the conda specification file and run it
# inside Docker on an OpenMPI 3.1.2 / CUDA 10.1 / cuDNN 7 base image.
tf_env = Environment.from_conda_specification(
    name="tensorflow-gpu-2.2-horovod-0.19.5",
    file_path="./conda_dependencies.yml",
)
tf_env.docker.base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04"
tf_env.docker.enabled = True

# Bare last expression: display the resulting environment definition.
tf_env

**TO DO:**

Once the TensorFlow 2.2 curated environment is updated to use `horovod==0.19.5` instead of `horovod==0.19.1`, replace the environment configuration code above with the following:

```python
from azureml.core import Environment
tf_env = Environment.get(ws, name='AzureML-TensorFlow-2.2-GPU')
```

In [None]:
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import MpiConfiguration

# MPI launch settings: how many worker processes to start on each node.
distr_config = MpiConfiguration()
distr_config.process_count_per_node = 4  # replace with the # of GPUs in the SKU of your compute cluster

# Job definition: run tf2_keras_mnist.py from the current directory on the
# selected compute target, inside the custom TensorFlow/Horovod environment.
src = ScriptRunConfig(
    source_directory='.',
    script='tf2_keras_mnist.py',
    compute_target=compute_target,
    environment=tf_env,
    distributed_job_config=distr_config,
)

# Number of cluster nodes the job is spread across.
src.run_config.node_count = 2

**TO DO:**

Once `azureml-sdk` version 1.15.0 is released (ETA October 5, 2020), replace the code above with the following:

```python
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import MpiConfiguration

distr_config = MpiConfiguration(process_count_per_node=4, node_count=2)

src = ScriptRunConfig(source_directory='.',
                      script='tf2_keras_mnist.py',
                      compute_target=compute_target,
                      environment=tf_env,
                      distributed_job_config=distr_config)
```

In [None]:
# Submit the configured job to the experiment; `run` is the handle used to
# follow it in the next cell.
run = exp.submit(src)

In [None]:
# Wait for the submitted run to finish; show_output=True asks the SDK to show
# the run's output in this cell while waiting.
run.wait_for_completion(show_output=True)