In [1]:
from azureml.core import Experiment
from azureml.core import Workspace, Run
from azureml.core import Environment
from azureml.core import Dataset, Datastore

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import ScriptRunConfig

In [2]:
from azureml.core.compute import ComputeInstance

In [3]:
# Obtain the Azure ML workspace handle. No arguments are passed, so the
# connection details come from configuration outside this notebook.
# Every lookup below (compute target, dataset, environments, experiment) uses it.
workspace = Workspace.from_config()

In [4]:
# Look up the compute target named 'gpu-v100-low' in the workspace; this handle
# is later passed as `compute_target` to the training ScriptRunConfig.
# NOTE(review): ComputeTargetException is imported at the top but never handled
# here, so a missing or misspelled target name presumably surfaces as an
# unhandled exception — confirm that is the intended behaviour.
instance = ComputeTarget(workspace=workspace, name='gpu-v100-low')

In [5]:
# Fetch the dataset registered under the name 'recursionbio'; it is handed to
# the training script via `dataset.as_mount()` further down.
dataset = Dataset.get_by_name(workspace, name='recursionbio')

In [6]:
# Display the dataset's registration metadata (source, name, version, workspace).
dataset

{
  "source": [
    "('codefilestore', '/Users/kjaanson/recursionpharma/input/recbio/**')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "ee33756c-ab30-45ec-adf9-5b93629bbbd8",
    "name": "recursionbio",
    "version": 2,
    "workspace": "Workspace.create(name='bioworks-azureml', subscription_id='73f2271c-beee-4233-b5dd-202b520bf0db', resource_group='mslearn-azureml')"
  }
}

In [7]:
# List the environments available to this workspace; the result is keyed by
# environment name (the keys are displayed in the next cell).
env_list = Environment.list(workspace)

In [8]:
# Show the available environment names; one of them is used below as the base
# for the training environment.
env_list.keys()

dict_keys(['AzureML-VowpalWabbit-8.8.0', 'AzureML-PyTorch-1.3-CPU', 'AzureML-Minimal', 'AzureML-Tutorial', 'AzureML-PyTorch-1.5-GPU', 'AzureML-TensorFlow-2.2-CPU', 'AzureML-PyTorch-1.6-CPU', 'AzureML-PyTorch-1.5-CPU', 'AzureML-TensorFlow-2.2-GPU', 'AzureML-PyTorch-1.6-GPU', 'AzureML-TensorFlow-2.3-CPU', 'AzureML-Triton', 'AzureML-TensorFlow-2.3-GPU', 'AzureML-DeepSpeed-0.3-GPU', 'AzureML-Pytorch1.7-Cuda11-OpenMpi4.1.0-py36', 'AzureML-Scikit-learn0.24-Cuda11-OpenMpi4.1.0-py36', 'AzureML-TensorFlow2.4-Cuda11-OpenMpi4.1.0-py36', 'AzureML-TensorFlow-1.15-Inference-CPU', 'AzureML-XGBoost-0.9-Inference-CPU', 'AzureML-PyTorch-1.6-Inference-CPU', 'AzureML-Minimal-Inference-CPU'])

In [9]:
# Take the 'AzureML-TensorFlow-2.3-GPU' environment as the base and keep only a
# renamed clone of it, so that `tf_env` always refers to the copy that the
# following cells modify.
tf_env = (
    Environment
    .get(workspace=workspace, name='AzureML-TensorFlow-2.3-GPU')
    .clone(new_name='recbio-tf-2.3-efficientnet')
)

In [10]:
# Register the additional conda packages on the cloned environment, in order.
for conda_pkg in ('scikit-learn', 'scipy', 'matplotlib'):
    tf_env.python.conda_dependencies.add_conda_package(conda_pkg)

In [11]:
# Register the additional pip packages on the cloned environment, in order.
# Note that horovod is pinned to an exact version while retry is left unpinned.
for pip_pkg in ('horovod==0.19.5', 'retry'):
    tf_env.python.conda_dependencies.add_pip_package(pip_pkg)

In [12]:
# Display the resulting environment definition for inspection.
tf_env

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04:20210405.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "recbio-tf-2.3-efficientnet",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge"

In [13]:
# Command-line arguments for train_cnn.py: the dataset is passed as a mount,
# followed by the epoch count, batch size and training fraction.
train_args = [
    '--data-path', dataset.as_mount(),
    '--epochs', 10,
    '--batch', 16,
    '--train-frac', 1.0,
]

# Run configuration: the script in ./scripts, executed on the compute target
# looked up earlier, inside the customised TensorFlow environment.
train_scr = ScriptRunConfig(
    source_directory='./scripts',
    script='train_cnn.py',
    arguments=train_args,
    compute_target=instance,
    environment=tf_env,
)

In [14]:
# Echo the run configuration object (the output is only its default object repr).
train_scr

<azureml.core.script_run_config.ScriptRunConfig at 0x7fdec44063c8>

In [15]:
# Bind to the 'recbio-effnet-model' experiment in the workspace, then submit
# the training configuration to it and keep the returned run handle.
experiment = Experiment(workspace=workspace, name='recbio-effnet-model')
run = experiment.submit(train_scr)

In [None]:
# Wait for the submitted run to complete, streaming its logs into the cell
# output (show_output=True). Expect a long, verbose output for a training job.
run.wait_for_completion(show_output=True)

RunId: recbio-effnet-model_1619361426_00500871
Web View: https://ml.azure.com/experiments/recbio-effnet-model/runs/recbio-effnet-model_1619361426_00500871?wsid=/subscriptions/73f2271c-beee-4233-b5dd-202b520bf0db/resourcegroups/mslearn-azureml/workspaces/bioworks-azureml

Streaming azureml-logs/55_azureml-execution-tvmps_2278f91f4df062613fa3f1cd2bacba07a59476e121f2bada2fa81817643ac301_p.txt

2021-04-25T14:41:22Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/bioworks-azureml/azureml/recbio-effnet-model_1619361426_00500871/mounts/workspaceblobstore
2021-04-25T14:41:22Z Failed to start nvidia-fabricmanager due to exit status 5 with output Failed to start nvidia-fabricmanager.service: Unit nvidia-fabricmanager.service not found.
. Please ignore this if the GPUs don't utilize NVIDIA® NVLink® switches.
2021-04-25T14:41:22Z Starting output-watcher...
2021-04-25T14:41:22Z IsDedicatedCompute == False, starting polling for Low-Pri Preemption
2021-04-25T14: