In [64]:
import os
import azureml.core
from azureml.core import (
    Workspace,
    Experiment,
    Dataset,
    Datastore,
    ComputeTarget,
    Environment,
    ScriptRunConfig,
)
from azureml.data import OutputFileDatasetConfig
from azureml.core.compute import AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline

# check core SDK version number
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.48.0


In [65]:
workspace = Workspace.from_config()

In [66]:
exp = Experiment(workspace=workspace, name="nnunet-devops-test")

In [67]:
output = OutputFileDatasetConfig(name="nnunet_outputs", destination=None, source="/output/") # source directory should be same with RESULTS_DIR in docker

In [77]:
use_gpu = False
max_number_of_instances = 1

# choose a name for your cluster
cluster_name = "gpu-cluster" if use_gpu else "cpu-cluster"

found = False
# Check if this compute target already exists in the workspace.
cts = workspace.compute_targets
if cluster_name in cts and cts[cluster_name].type == "AmlCompute":
    found = True
    print("Found existing compute target.")
    compute_target = cts[cluster_name]
if not found:
    print("Creating a new compute target...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size= "STANDARD_NC6" if use_gpu else "STANDARD_D4_V2",
        # vm_priority = 'lowpriority', # optional
        max_nodes=max_number_of_instances
    )

    # Create the cluster.
    compute_target = ComputeTarget.create(workspace, cluster_name, compute_config)

    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min_node_count is provided, it will use the scale settings for the cluster.
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=10
    )
# For a more detailed view of current AmlCompute status, use get_status().print(compute_target.get_status().serialize())
print(compute_target)

Found existing compute target.
AmlCompute(workspace=Workspace.create(name='azure-ml-test-workspace', subscription_id='e5fb5c5f-cb5c-42c8-86b6-cf4fd5aac11b', resource_group='azureml-test'), name=cpu-cluster, id=/subscriptions/e5fb5c5f-cb5c-42c8-86b6-cf4fd5aac11b/resourceGroups/azureml-test/providers/Microsoft.MachineLearningServices/workspaces/azure-ml-test-workspace/computes/cpu-cluster, type=AmlCompute, provisioning_state=Deleting, location=eastus2, tags={})


In [76]:
# figure out how to mount script folder
script_folder = "./nnunet_src"

prep_step = PythonScriptStep(
    name="prepare step",
    script_name="prepare.py",
    arguments=[], # no arguments for now
    source_directory=script_folder,
    compute_target=compute_target,
    allow_reuse=True,
)

In [70]:
print(compute_target)

AmlCompute(workspace=Workspace.create(name='azure-ml-test-workspace', subscription_id='e5fb5c5f-cb5c-42c8-86b6-cf4fd5aac11b', resource_group='azureml-test'), name=cpu-cluster, id=/subscriptions/e5fb5c5f-cb5c-42c8-86b6-cf4fd5aac11b/resourceGroups/azureml-test/providers/Microsoft.MachineLearningServices/workspaces/azure-ml-test-workspace/computes/cpu-cluster, type=AmlCompute, provisioning_state=Deleting, location=eastus2, tags={})


In [71]:
docker_env = Environment("nnunet-docker")
docker_env.docker.base_image = "auzremltestregistry.azurecr.io/nnunet" # example : "fastdotai/fastai2:latest"
docker_env.python.user_managed_dependencies = True

train_cfg = ScriptRunConfig(
    source_directory=script_folder,
    script="train.py",
    arguments=[output],
    compute_target=compute_target,
    environment=docker_env,
)

# create the training step :
train_step = PythonScriptStep(
    name="training step",
    arguments=[output],# no asrguments for now, since azure datastore wont work well with colab
    source_directory=train_cfg.source_directory,
    script_name=train_cfg.script,
    runconfig=train_cfg.run_config,
)

In [72]:
pipeline = Pipeline(workspace, steps=[prep_step, train_step])
run = exp.submit(pipeline)

Created step prepare step [94178a93][e555cbd7-bc34-4a1b-ab0d-c9583287ab67], (This step is eligible to reuse a previous run's output)Created step training step [08dca8fa][4231f348-7cf0-4e06-b6b0-825b9b15d8d5], (This step is eligible to reuse a previous run's output)

Submitted PipelineRun ebc12c5e-be45-448b-99ff-83c15c27f84d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ebc12c5e-be45-448b-99ff-83c15c27f84d?wsid=/subscriptions/e5fb5c5f-cb5c-42c8-86b6-cf4fd5aac11b/resourcegroups/azureml-test/workspaces/azure-ml-test-workspace&tid=610ba57f-df8b-4dc2-8ddf-2532d017cd71
