In [163]:
import os
import azureml.core
from azureml.core.runconfig import JarLibrary
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
from azureml.core import Workspace, Environment, Experiment, Datastore, Dataset, ScriptRunConfig
from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.steps import DatabricksStep, PythonScriptStep
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
# from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.40.0


In [164]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

distributeddeeplearningqmx
deep-learning-challenge
westus2
3df1840f-dd4b-4f54-a831-e20536439b3a


In [166]:
db_compute_name = "ADBCluster" # Databricks compute name

databricks_compute = DatabricksCompute(workspace=ws, name=db_compute_name)
print('Compute target {} already exists'.format(db_compute_name))


Compute target ADBCluster already exists


In [305]:
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.core.pipeline_output_dataset import PipelineOutputAbstractDataset

def_blob_store = Datastore(ws, "generalpurposeaccount")
print('Datastore {} will be used'.format(def_blob_store.name))

step_1_output = PipelineData("output", datastore=def_blob_store)
# ds_step_1_output = PipelineOutputAbstractDataset(step_1_output) # .as_dataset()
ds_step_1_output = step_1_output.as_dataset()


Datastore generalpurposeaccount will be used


In [294]:
from azureml.core import Dataset

dataset = Dataset.get_by_name(ws, "titanic_from_parquet")
dataset.version

1

In [296]:
cluster_name = "cpu-cluster-4"
compute_target = ComputeTarget(workspace=ws, name=cluster_name)



In [328]:
python_script_name = "adb_run_delta.py"
source_directory = "./scripts"

feature_dataset_name = "feature_titanic"

dbNbStep = DatabricksStep(
    name="ADBFeatureEng",
    outputs=[ds_step_1_output],
    compute_target=databricks_compute,
    existing_cluster_id="0319-164126-ptv2xehc",
    python_script_params=["--feature_set_1", "titanic_from_parquet",
                          "--feature_set_2", "titanic_from_parquet",
                          "--feature_set_3", "titanic_from_parquet",
                          '--output_datastore_name', def_blob_store.name,
                          "--output_feature_set_name", feature_dataset_name],
    permit_cluster_restart=True,
    python_script_name=python_script_name,
    source_directory=source_directory,
    run_name='ADB_Feature_Eng',
    allow_reuse=True
)

In [329]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# tf_env = Environment.get(ws, name='AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu')
# tf_env_c = tf_env.clone("deltalake")

tf_env_c = Environment('deltalake')

conda_dep = CondaDependencies()

conda_dep.add_pip_package("sklearn")
conda_dep.add_pip_package("deltalake")
conda_dep.remove_pip_package('azureml-defaults')
conda_dep.add_pip_package('azureml-core')
conda_dep.add_pip_package('pandas')

# Adds dependencies to PythonSection of myenv
tf_env_c.python.conda_dependencies=conda_dep

tf_env_c = tf_env_c.register(workspace=ws)

rcfg = RunConfiguration()
rcfg.environment = tf_env_c

In [330]:
# ds_step_1_output = step_1_output.as_dataset()
# mnt_ds_step_1_output = ds_step_1_output.as_mount()

aml_step = PythonScriptStep(script_name='aml_run_delta.py',
                                       name="AML Train",
                                       source_directory=source_directory,
                                       inputs=[ds_step_1_output.as_named_input('titanic_ds')],
                                       compute_target=compute_target,
                                       arguments=['--data_folder', ds_step_1_output,
                                                  '--featureset_name', feature_dataset_name,
                                                  '--model_name', 'titanic_model'],
                                       allow_reuse=False,
                                       runconfig=rcfg)


In [320]:
steps = [aml_step]
pipeline = Pipeline(workspace=ws, steps=steps)
pipeline_run = Experiment(ws, 'DB_FeatureStore').submit(pipeline)
# pipeline_run.wait_for_completion()


Created step AML Train [36dfaf6e][dc617fb1-85d0-4c21-ba8f-a3958df0a0e6], (This step will run and generate new outputs)
Created step ADBFeatureEng [bbfb6f77][ee93fe60-1311-430b-9ca9-18a285366b92], (This step will run and generate new outputs)
Submitted PipelineRun d94b4bd3-8a8c-46ce-b746-9e3a5da1eb11
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/d94b4bd3-8a8c-46ce-b746-9e3a5da1eb11?wsid=/subscriptions/3df1840f-dd4b-4f54-a831-e20536439b3a/resourcegroups/deep-learning-challenge/workspaces/distributeddeeplearningqmx&tid=72f988bf-86f1-41af-91ab-2d7cd011db47


In [175]:
pipeline_run

Experiment,Id,Type,Status,Details Page,Docs Page
DB_FeatureStore,ce654450-796a-4696-b69c-5c8d79b80d04,azureml.PipelineRun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [331]:
from azureml.data import OutputFileDatasetConfig

step1_output_data = OutputFileDatasetConfig(name="processed_data", destination=(def_blob_store, "mypath/{run-id}/{output-name}")).register_on_complete(name='processed_data', 
                                                         description = 'files from step1').as_upload()

In [348]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

prepped_data_path = OutputFileDatasetConfig(name="output_path").register_on_complete(name='processed_data', description = 'files from step1')

aml_step = PythonScriptStep(script_name='aml_run_delta_automl_prep.py',
                                       name="AML Register Data",
                                       source_directory=source_directory,
                                       inputs=[ds_step_1_output],
                                       outputs=[prepped_data_path],
                                       compute_target=compute_target,
                                       arguments=["--output_path", prepped_data_path,
                                                  '--data_folder', ds_step_1_output],
                                       allow_reuse=False,
                                       runconfig=rcfg)

In [350]:
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep

automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'AUC_weighted'
}
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=prepped_data_path.read_delimited_files(),
                             label_column_name="Survived",   
                             path = source_directory,
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             **automl_settings
                            )

In [351]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

metrics_data = PipelineData(name='metrics_data',
                           datastore=ds,
                           pipeline_output_name=metrics_output_name,
                           training_output=TrainingOutput(type='Metrics'))
model_data = PipelineData(name='model_data',
                           datastore=ds,
                           pipeline_output_name=best_model_output_name,
                           training_output=TrainingOutput(type='Model'))


In [352]:
automl_train_step = AutoMLStep(name='AutoML_Classification',
                        automl_config=automl_config,
                        passthru_automl_config=False,
                        enable_default_model_output=False,
                        enable_default_metrics_output=False,
                        allow_reuse=True)

In [354]:
steps = [aml_step]
pipeline = Pipeline(workspace=ws, steps=steps)
pipeline_run = Experiment(ws, 'DB_FeatureStore_AutoML').submit(pipeline)
# pipeline_run.wait_for_completion()

pipeline_run

Created step AML Train [97ed61d9][ffe429aa-efe6-43cd-9bbe-72e9761bdbb9], (This step will run and generate new outputs)
Created step ADBFeatureEng [6ce64d95][3745b17c-ac07-43c3-ad08-ae8269bb45cf], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun 8aeb160e-3e38-4a8f-be15-b25e08c8baa6
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8aeb160e-3e38-4a8f-be15-b25e08c8baa6?wsid=/subscriptions/3df1840f-dd4b-4f54-a831-e20536439b3a/resourcegroups/deep-learning-challenge/workspaces/distributeddeeplearningqmx&tid=72f988bf-86f1-41af-91ab-2d7cd011db47


Experiment,Id,Type,Status,Details Page,Docs Page
DB_FeatureStore_AutoML,8aeb160e-3e38-4a8f-be15-b25e08c8baa6,azureml.PipelineRun,Preparing,Link to Azure Machine Learning studio,Link to Documentation
