In [None]:
import os
import azureml.core
from azureml.core.runconfig import JarLibrary
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
from azureml.core import Workspace, Experiment
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import DatabricksStep
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference

# Report which Azure ML core SDK version this kernel has loaded
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

In [None]:
# Azure ML workspace coordinates. Each value can be overridden with the
# environment variable named in its os.getenv call; the literals are only the
# fallback defaults used when the variable is not set.
subscription_id = os.getenv("SUBSCRIPTION_ID", '8b2f4e94-e7b0-42e5-b775-dd2d5968c4e6')
resource_group  = os.getenv("RESOURCE_GROUP", 'HelenMachineLearning')
workspace_name  = os.getenv("WORKSPACE_NAME", 'HelenMachineLearning')

In [None]:
# Fetch the Azure ML workspace identified by the name, subscription and
# resource group configured in this notebook.
workspace_lookup = dict(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
)
ws = Workspace.get(**workspace_lookup)

In [None]:
# Databricks connection settings. Set the environment variables named below
# (or edit the non-secret defaults) before running.

db_compute_name=os.getenv("DATABRICKS_COMPUTE_NAME", "databricks") # Databricks compute name
db_resource_group=os.getenv("DATABRICKS_RESOURCE_GROUP", "Databricks2019") # Databricks resource group
db_workspace_name=os.getenv("DATABRICKS_WORKSPACE_NAME", "Databricks2019") # Databricks workspace name
# The access token is a secret: it is read from the environment only and has
# no hardcoded fallback, so it never ends up saved in the notebook.
# NOTE(review): a token literal was previously committed in this cell; treat
# it as compromised and revoke it in Databricks.
db_access_token=os.getenv("DATABRICKS_ACCESS_TOKEN") # Databricks access token

try:
    # Reuse the compute target if one with this name is already attached.
    databricks_compute = DatabricksCompute(workspace=ws, name=db_compute_name)
    print('Compute target {} already exists'.format(db_compute_name))
except ComputeTargetException:
    # Attaching needs the token; fail early with an actionable message
    # instead of sending an empty credential to the service.
    if not db_access_token:
        raise ValueError(
            'Compute target {} not found and DATABRICKS_ACCESS_TOKEN is not set; '
            'set it to a Databricks personal access token to attach the compute.'.format(db_compute_name))

    print('Compute not found, will use below parameters to attach new one')
    print('db_compute_name {}'.format(db_compute_name))
    print('db_resource_group {}'.format(db_resource_group))
    print('db_workspace_name {}'.format(db_workspace_name))
    # Never echo the secret itself: cell output is stored with the notebook.
    print('db_access_token {}'.format('<redacted>'))

    config = DatabricksCompute.attach_configuration(
        resource_group = db_resource_group,
        workspace_name = db_workspace_name,
        access_token= db_access_token)
    databricks_compute=ComputeTarget.attach(ws, db_compute_name, config)
    databricks_compute.wait_for_completion(True)

# One-time setup: install and configure the Databricks CLI

In [5]:
!pip install databricks-cli



twisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.
msftkube 1.0.1598662 has requirement applicationinsights==0.11.5, but you'll have applicationinsights 0.11.9 which is incompatible.
msftkube 1.0.1598662 has requirement urllib3==1.22, but you'll have urllib3 1.23 which is incompatible.
azureml-sdk 0.1.0.1060109 has requirement azureml-core==0.1.0.1060109, but you'll have azureml-core 0.1.0.0 which is incompatible.
azureml-sdk 0.1.0.1060109 has requirement azureml-train==0.1.0.1060109, but you'll have azureml-train 0.1.0.0 which is incompatible.
azureml-requirements 0.1.0.888002 has requirement azureml-core==0.1.0.888002, but you'll have azureml-core 0.1.0.0 which is incompatible.
azureml-requirements 0.1.0.888002 has requirement azureml-train==0.1.0.888002, but you'll have azureml-train 0.1.0.0 which is incompatible.
azureml-contrib-widgets 0.1.0.1060109 has requirement azureml-core==0.1.0.1060109, but you'll have azureml-core 0.1.0.0 which is incompatible.
azureml-cont

# Configure CLI

> Run `dbfs configure --token` at the command line to set up authentication. You'll need to specify the Databricks workspace URL and a personal access token. See the Databricks CLI documentation for details: https://docs.databricks.com/dev-tools/cli/index.html

# Upload files from local computer to DBFS

In [None]:
# Local folder to upload. Built with os.path.join so the separator matches the
# host OS; the previous literal ".\code" relied on the invalid escape sequence
# "\c" and only worked on Windows.
local_path = os.path.join(".", "code")
# Destination folder in DBFS for the uploaded files.
dbfs_path = "dbfs:/data/UploadTest"

In [None]:
# Run this if you need to delete existing DBFS folder
# !dbfs rm -r {dbfs_path}

In [8]:
# Recursively copy the local folder to the DBFS destination using the dbfs CLI.
# IPython substitutes {local_path} and {dbfs_path} with the values of the
# notebook variables of the same names before running the command.
!dbfs cp -r {local_path} {dbfs_path}

.\code\hello.py -> dbfs:/data/UploadTest/hello.py


# Use the file that was uploaded to DBFS in DatabricksStep

In [None]:
# Full DBFS location of the entry script: the upload folder plus the script name
entry_script_suffix = "/hello.py"
python_script_path = "".join((dbfs_path, entry_script_suffix))
print(python_script_path)

In [12]:
# Pipeline step that runs the uploaded DBFS script on the attached Databricks
# compute target.
dbPythonInDbfsStep = DatabricksStep(
    name="DBPythonInDBFS",
    run_name='DB_Python_demo',
    compute_target=databricks_compute,
    python_script_path=python_script_path,
    # Must be a list, not a set literal: a set has no guaranteed order, so the
    # script could receive its arguments in a different order on each run.
    python_script_params=['arg1', 'arg2'],
    num_workers=2,
    allow_reuse=False)

In [None]:
# Wrap the Databricks step in a pipeline, submit it under the
# 'DB_Python_demo' experiment, and wait for the run to complete.
steps = [dbPythonInDbfsStep]
pipeline = Pipeline(steps=steps, workspace=ws)
experiment = Experiment(ws, 'DB_Python_demo')
pipeline_run = experiment.submit(pipeline)
pipeline_run.wait_for_completion()

Created step DBPythonInDBFS [d46aac4b][ed63aea8-fe76-4e4b-a556-95a4e17e1506], (This step will run and generate new outputs)
Submitted PipelineRun c1b4d9f2-0e26-4e1a-8972-07721acc76d2
Link to Azure Machine Learning studio: https://ml.azure.com/experiments/DB_Python_demo/runs/c1b4d9f2-0e26-4e1a-8972-07721acc76d2?wsid=/subscriptions/8b2f4e94-e7b0-42e5-b775-dd2d5968c4e6/resourcegroups/HelenMachineLearning/workspaces/HelenMachineLearning
PipelineRunId: c1b4d9f2-0e26-4e1a-8972-07721acc76d2
Link to Portal: https://ml.azure.com/experiments/DB_Python_demo/runs/c1b4d9f2-0e26-4e1a-8972-07721acc76d2?wsid=/subscriptions/8b2f4e94-e7b0-42e5-b775-dd2d5968c4e6/resourcegroups/HelenMachineLearning/workspaces/HelenMachineLearning
PipelineRun Status: Running


StepRunId: ad632b88-ef78-4ba6-92ac-11aee0b27d41
Link to Portal: https://ml.azure.com/experiments/DB_Python_demo/runs/ad632b88-ef78-4ba6-92ac-11aee0b27d41?wsid=/subscriptions/8b2f4e94-e7b0-42e5-b775-dd2d5968c4e6/resourcegroups/HelenMachineLearning/wor