# Get workspace

In [1]:
from azureml.core import Workspace
ws = Workspace.from_config()
ws

Workspace.create(name='huytung-ws', subscription_id='1f5e877f-48a6-4973-aa3b-ceead3340ce7', resource_group='huytung-rsgroup')

In [2]:
ws.datasets 

{'diabetes file dataset': DatasetRegistration(id='97062e92-e997-4a81-a1e0-81e3adb5af20', name='diabetes file dataset', version=1, description='diabetes files', tags={'format': 'CSV'}), 'diabetes dataset': DatasetRegistration(id='d42f957d-2101-477d-906e-92a5c63fab7e', name='diabetes dataset', version=2, description='diabetes data', tags={'format': 'CSV'})}

# Get default datastore and upload data to datastore

In [3]:
from azureml.core import Dataset
default_ds = ws.get_default_datastore()
default_ds.upload(src_dir='../../data',
                        target_path='diabetes-datas',
                        overwrite=True,
                        show_progress=True)

Uploading an estimated of 2 files
Uploading ../../data\diabetes2.csv
Uploaded ../../data\diabetes2.csv, 1 files out of an estimated total of 2
Uploading ../../data\diabetes.csv
Uploaded ../../data\diabetes.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_2b54bb3e234643b19d662314f257713d

# Create and register Datasets

In [4]:
tablular_dataset = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-datas/*.csv'))
tablular_dataset = tablular_dataset.register(workspace=ws, 
                                name='diabetes dataset',
                                description='diabetes data',
                                tags = {'format':'CSV'},
                                create_new_version=True)

# Create environment for pipeline

In [11]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
# Create a Python environment for the experiment
diabetes_env = Environment("diabetes-pipeline-env")
diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
diabetes_env.docker.enabled = True # Use a docker container

# Create a set of package dependencies
diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','ipykernel','matplotlib','pandas','pip'],
                                             pip_packages=['azureml-defaults','azureml-dataprep[pandas]','pyarrow'])

# Add the dependencies to the environment
diabetes_env.python.conda_dependencies = diabetes_packages

# Register the environment 
diabetes_env.register(workspace=ws)
env = Environment.get(ws, 'diabetes-pipeline-env')


'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [12]:
from azureml.core import ComputeTarget
pipeline_compute = ComputeTarget(workspace=ws, name='huytungcomputename')
pipeline_compute

Name,Workspace,State,Location,VmSize,Application URI,Docs
huytungcomputename,huytung-ws,Running,eastus2,STANDARD_DS11_V2,Jupyter JupyterLab RStudio,Doc


In [13]:
from azureml.core.runconfig import RunConfiguration
# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()
# Use the compute you created above. 
pipeline_run_config.target = pipeline_compute
# Assign the environment to the run configuration
pipeline_run_config.environment = env

# Create Pipeline

In [14]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
# diabetes_ds = ws.datasets.get("diabetes dataset")
diabetes_ds = Dataset.get_by_name(workspace=ws, name='diabetes dataset')

# Create a PipelineData (temporary Data Reference) for the model folder
prepped_data_folder = PipelineData("prepped_data_folder", datastore=ws.get_default_datastore())

# Step 1, Run the data prep script
prep_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = './',
                                script_name = "prep_diabetes.py",
                                arguments = ['--input-data', diabetes_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data_folder],
                                outputs=[prepped_data_folder],
                                compute_target = pipeline_compute,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script
train_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = './',
                                script_name = "train_diabetes.py",
                                arguments = ['--training-folder', prepped_data_folder],
                                inputs=[prepped_data_folder],
                                compute_target = pipeline_compute,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [15]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Construct the pipeline
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'mslearn-diabetes-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print(pipeline_run.get_portal_url)
print("Pipeline submitted for execution.")
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [deadee62][a458ae20-dc88-4b5b-b37e-cd48499436f4], (This step will run and generate new outputs)
Created step Train and Register Model [a7bd56d7][2ba72e0b-740b-42ed-bf38-38ad85863e5a], (This step will run and generate new outputs)
Submitted PipelineRun 8852d7f7-c093-4d73-9457-b4cb04dafc70
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8852d7f7-c093-4d73-9457-b4cb04dafc70?wsid=/subscriptions/1f5e877f-48a6-4973-aa3b-ceead3340ce7/resourcegroups/huytung-rsgroup/workspaces/huytung-ws&tid=1fd983f3-dc44-42f8-ad66-7972e9d94659
<bound method HasRunPortal.get_portal_url of Run(Experiment: mslearn-diabetes-pipeline,
Id: 8852d7f7-c093-4d73-9457-b4cb04dafc70,
Type: azureml.PipelineRun,
Status: Preparing)>
Pipeline submitted for execution.
PipelineRunId: 8852d7f7-c093-4d73-9457-b4cb04dafc70
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8852d7f7-c093-4d73-9457-b4cb04dafc70?wsid=/subscriptions/1f5e877f-48a6-4973-aa3b-ceea

'Finished'

# Get results

In [16]:
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name in metrics:
        print('\t',metric_name, ":", metrics[metric_name])

Train and Register Model :
	 Accuracy : 0.8984444444444445
	 AUC : 0.8834267090544071
	 ROC : aml://artifactId/ExperimentRun/dcid.9bb67af3-5a2f-4add-a44e-f0adb7d60462/ROC_1619456317.png
Prepare Data :
	 raw_rows : 15000
	 processed_rows : 15000


In [17]:
from azureml.core import Model

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')

diabetes_model version: 2
	 Training context : Pipeline
	 AUC : 0.8834267090544071
	 Accuracy : 0.8984444444444445


diabetes_model version: 1
	 Training context : Script
	 AUC : 0.8484437036668493
	 Accuracy : 0.774




In [18]:
# Publish the pipeline from the run
published_pipeline = pipeline_run.publish_pipeline(
    name="diabetes-training-pipeline", description="Trains diabetes model", version="1.0")

published_pipeline

Name,Id,Status,Endpoint
diabetes-training-pipeline,e4dfe6e8-e005-4b2f-90f8-c5b247df16c0,Active,REST Endpoint


In [19]:
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

https://eastus2.api.azureml.ms/pipelines/v1.0/subscriptions/1f5e877f-48a6-4973-aa3b-ceead3340ce7/resourceGroups/huytung-rsgroup/providers/Microsoft.MachineLearningServices/workspaces/huytung-ws/PipelineRuns/PipelineSubmit/e4dfe6e8-e005-4b2f-90f8-c5b247df16c0


In [20]:
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print("Authentication header ready.")

Authentication header ready.


In [22]:
import requests

experiment_name = 'mslearn-diabetes-pipeline'

rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": experiment_name})
run_id = response.json()
run_id

{'Description': None,
 'Status': {'StatusCode': 0,
  'StatusDetail': None,
  'CreationTime': '2021-04-26T17:06:19.4286714Z',
  'EndTime': None},
 'GraphId': 'a15ae2a5-c7c1-438c-875c-c774dc914a0d',
 'IsSubmitted': False,
 'HasErrors': False,
 'UploadState': 0,
 'ParameterAssignments': {},
 'DataPathAssignments': {},
 'DataSetDefinitionValueAssignments': {},
 'RunHistoryExperimentName': 'mslearn-diabetes-pipeline',
 'PipelineId': 'e4dfe6e8-e005-4b2f-90f8-c5b247df16c0',
 'RunSource': 'Unavailable',
 'RunType': 0,
 'TotalRunSteps': 2,
 'ScheduleId': None,
 'RunUrl': 'https://ml.azure.com/experiments/mslearn-diabetes-pipeline/runs/ff9e9ee4-1eec-426d-abdb-81eab3b71deb?tid=1fd983f3-dc44-42f8-ad66-7972e9d94659&wsid=/subscriptions/1f5e877f-48a6-4973-aa3b-ceead3340ce7/resourcegroups/huytung-rsgroup/workspaces/huytung-ws',
 'tags': {},
 'StepTags': {},
 'Properties': {},
 'StepProperties': {},
 'CreatedBy': {'UserObjectId': 'beceb2c0-e78f-4733-929f-a3a5cd7b1f46',
  'UserTenantId': '1fd983f3-dc44-