In [476]:
import os
import azureml.core
from azureml.core.runconfig import JarLibrary
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
from azureml.core import Workspace, Environment, Experiment, Datastore, Dataset, ScriptRunConfig
from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.steps import DatabricksStep, PythonScriptStep
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
# from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep

# Report which azureml-core release this kernel is running against.
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

SDK version: 1.40.0


In [477]:
# Resolve the workspace through Workspace.from_config().
ws = Workspace.from_config()

# Echo the identifying workspace properties, one per line.
for workspace_field in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_field)

distributeddeeplearningqmx
deep-learning-challenge
westus2
3df1840f-dd4b-4f54-a831-e20536439b3a


In [479]:
# Name of the Databricks compute to look up in the workspace.
db_compute_name = "ADBCluster"

# Fetch the compute target by name; the confirmation below is printed only
# when this line did not raise.
databricks_compute = DatabricksCompute(name=db_compute_name, workspace=ws)
exists_message = 'Compute target {} already exists'.format(db_compute_name)
print(exists_message)


Compute target ADBCluster already exists


In [480]:
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.core.pipeline_output_dataset import PipelineOutputAbstractDataset

# Datastore used for the intermediate data exchanged between pipeline steps.
def_blob_store = Datastore(workspace=ws, name="generalpurposeaccount")
print('Datastore {} will be used'.format(def_blob_store.name))

# Output slot of the first step, plus its dataset view: the Databricks step lists the
# dataset view as an output and the AML step consumes it as an input.
step_1_output = PipelineData(name="output", datastore=def_blob_store)
ds_step_1_output = step_1_output.as_dataset()


Datastore generalpurposeaccount will be used


In [481]:
from azureml.core import Dataset

# Fetch the registered dataset by name, then display the version that was resolved.
dataset = Dataset.get_by_name(workspace=ws, name="titanic_from_parquet")
dataset.version

1

In [296]:
# AML compute cluster handed to the training step as its compute target.
cluster_name = "cpu-cluster-4"
compute_target = ComputeTarget(ws, cluster_name)



In [484]:
# Folder passed as `source_directory` to both pipeline steps.
source_directory = "./scripts"

# Entry-point scripts: one for the Databricks step, one for the AML step.
databricks_script_name = "adb_run.py"
aml_script_name = "aml_run.py"

# Feature set name forwarded to both scripts on their command lines.
feature_dataset_name = "feature_titanic"

In [490]:

# Id of the existing Databricks cluster the step runs on.
# (Alternative id kept for reference: "0319-164126-ptv2xehc")
adb_cluster_id = "0511-174324-3ryh30vo"

# Command-line arguments forwarded to the Databricks script.
adb_script_args = [
    "--feature_set_1", "titanic_1",
    "--feature_set_2", "titanic_2",
    "--feature_set_3", "titanic_3",
    '--output_datastore_name', def_blob_store.name,
    "--output_feature_set_name", feature_dataset_name,
]

# Feature-engineering step: runs the Databricks script on the existing cluster and
# declares the step-1 dataset as its output. Reuse of earlier results is disabled.
dbNbStep = DatabricksStep(
    name="ADBFeatureEng",
    run_name='ADB_Feature_Eng',
    source_directory=source_directory,
    python_script_name=databricks_script_name,
    python_script_params=adb_script_args,
    compute_target=databricks_compute,
    existing_cluster_id=adb_cluster_id,
    permit_cluster_restart=True,
    outputs=[ds_step_1_output],
    allow_reuse=False,
)

In [491]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Build and register the environment used by the AML training step, then wrap it in a
# RunConfiguration (`rcfg`) for the step to consume.
#
# An earlier variant that cloned the curated
# 'AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu' environment was commented out in favour
# of defining the environment from scratch here.
tf_env_c = Environment('deltalake')

conda_dep = CondaDependencies()

# Remove 'azureml-defaults' from the fresh dependency set; 'azureml-core' is added
# explicitly below instead.
conda_dep.remove_pip_package('azureml-defaults')

# 'scikit-learn' is the real PyPI distribution. The former 'sklearn' entry is a
# deprecated placeholder package whose installation now fails, which would break the
# environment build.
for pip_package in ("scikit-learn", "deltalake", "azureml-core", "pandas"):
    conda_dep.add_pip_package(pip_package)

# Attach the dependencies to the environment's Python section.
tf_env_c.python.conda_dependencies = conda_dep

# Register the environment in the workspace and keep the registered instance.
tf_env_c = tf_env_c.register(workspace=ws)

# Run configuration that carries the environment into the pipeline step.
rcfg = RunConfiguration()
rcfg.environment = tf_env_c

In [492]:
# Arguments forwarded to the training script; the step-1 dataset is passed as the
# value of --data_folder.
aml_script_args = [
    '--data_folder', ds_step_1_output,
    '--featureset_name', feature_dataset_name,
    '--model_name', 'titanic_model',
]

# Training step: consumes the step-1 dataset as an input and runs the AML script on
# the compute target with the prepared run configuration. Reuse is disabled.
aml_step = PythonScriptStep(
    name="AML Train",
    script_name=aml_script_name,
    source_directory=source_directory,
    arguments=aml_script_args,
    inputs=[ds_step_1_output],
    compute_target=compute_target,
    runconfig=rcfg,
    allow_reuse=False,
)


In [493]:
# Only the training step is listed explicitly; the submission log below shows the
# Databricks step being created as well, since it produces the training step's input.
steps = [aml_step]
pipeline = Pipeline(workspace=ws, steps=steps)

# Submit the pipeline under the 'DB_FeatureStore' experiment.
experiment = Experiment(ws, 'DB_FeatureStore')
pipeline_run = experiment.submit(pipeline)
# pipeline_run.wait_for_completion()


Created step AML Train [b2f2ec12][b1c339e0-7551-46c5-ba9e-fb4e8c0065a7], (This step will run and generate new outputs)Created step ADBFeatureEng [b0d56823][4e9d3e14-f965-4432-b74a-f2d31bfc8a2f], (This step will run and generate new outputs)

Submitted PipelineRun 3e26e040-4dfe-4bf7-aa88-048e1aa5961b
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3e26e040-4dfe-4bf7-aa88-048e1aa5961b?wsid=/subscriptions/3df1840f-dd4b-4f54-a831-e20536439b3a/resourcegroups/deep-learning-challenge/workspaces/distributeddeeplearningqmx&tid=72f988bf-86f1-41af-91ab-2d7cd011db47


In [436]:
# Display the submitted run's summary (experiment, id, type, status and links).
pipeline_run

Experiment,Id,Type,Status,Details Page,Docs Page
DB_FeatureStore,fbdec19e-8057-4f7e-b8d8-f4d9ee2e21b9,azureml.PipelineRun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [469]:
from azureml.core import Model

# Look up the registered model by name; the displayed repr includes the version
# and tags that were resolved.
model = Model(workspace=ws, name='titanic_model')
model

Model(workspace=Workspace.create(name='distributeddeeplearningqmx', subscription_id='3df1840f-dd4b-4f54-a831-e20536439b3a', resource_group='deep-learning-challenge'), name=titanic_model, id=titanic_model:11, version=11, tags={'run_id': '4c52ccc3-761c-444e-a029-dfcb96a645c6'}, properties={})

In [470]:
# Datasets attached to the model, keyed by label.
model_datasets = model.datasets

# Take the first dataset stored under the 'featurized data' label.
featurized_datasets = model_datasets['featurized data']
input_dataset = featurized_datasets[0]

In [471]:
# Show the 'dtypes' tag stored on the dataset (a stringified column -> dtype mapping).
input_dataset.tags['dtypes']

"{'PassengerId': 'int64', 'Survived': 'int64', 'Pclass': 'int64', 'Sex': 'int64', 'Age': 'float64', 'SibSp': 'int64', 'Parch': 'int64', 'Fare': 'float64', 'id': 'object'}"

In [None]:
# Load the model's input dataset into a pandas DataFrame (per the method name).
pdf = input_dataset.to_pandas_dataframe()

In [None]:
# NOTE(review): bare method reference with no call or arguments — looks like leftover
# exploration; complete the call or remove this cell.
Dataset.get_by_name