# Automated ML

Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import os
import logging
import csv

import azureml.core
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core import Environment
from azureml.core import ScriptRunConfig

from azureml.core import Datastore
from azureml.core.dataset import Dataset

from azureml.train.automl import AutoMLConfig

from azureml.widgets import RunDetails

In [2]:
# Show the version of the azureml.core package that this kernel imported.
print(azureml.core.VERSION)

1.33.0


Create an instance of Workspace and get an Experiment instance.

In [3]:
# Load the workspace via Workspace.from_config().
ws = Workspace.from_config()

# Names used for the experiment and for the AutoML project folder.
experiment_name = 'udacity-capstone-project'
project_folder = "automl-heartfailure"

# Reuse the first listed experiment with this name, or construct one when
# the listing comes back empty.
experiments = Experiment.list(ws, experiment_name=experiment_name)
experiment = experiments[0] if experiments else Experiment(workspace=ws, name=experiment_name)

# Print one workspace property per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Start an interactive run on the experiment and keep a handle to it.
run = experiment.start_logging()

Workspace name: quick-starts-ws-157284
Azure region: southcentralus
Subscription id: a24a24d5-8d87-4c8a-99b6-91ed2d2df51f
Resource group: aml-quickstarts-157284


## Dataset

### Overview
Cardiovascular diseases (CVDs) are the number 1 cause of death globally. 
CVDs commonly cause heart failure. 
Early detection of heart failure is one way of addressing the problem. 
Here we use a machine learning approach to build a classification model based on a Heart Failure Prediction dataset. 
This dataset is available on Kaggle. 
The dataset consists of 12 clinical features related to cardiovascular disease, such as hypertension (high blood pressure), diabetes, anaemia and so on.

The goal is to build a binary classification model that predicts heart failure.


In [4]:
import pandas as pd

def clean_data(data):
    """Prepare the heart-failure records for training.

    Rows containing missing values are dropped, the continuous columns are
    min-max scaled to the range [0, 1], and each categorical column is
    replaced by one-hot indicator columns whose prefix is the first three
    characters of the original column name (e.g. ``anaemia`` becomes
    ``ana_0`` / ``ana_1``).

    Args:
        data: Object exposing ``to_pandas_dataframe()`` which returns a
            DataFrame containing the continuous and categorical columns
            listed below plus the ``DEATH_EVENT`` label column.

    Returns:
        tuple: ``(x_df, y_df)`` where ``x_df`` is the feature DataFrame
        (without ``DEATH_EVENT``) and ``y_df`` is the ``DEATH_EVENT`` Series.
    """
    normalized_column_names = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium','time']
    category_column_names = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']

    x_df = data.to_pandas_dataframe().dropna()

    # Min-max scale each continuous column with vectorised arithmetic.
    for column_name in normalized_column_names:
        column = x_df[column_name]
        m0 = column.min()
        m1 = column.max()
        value_range = m1 - m0

        if value_range == 0:
            # A constant column would otherwise evaluate 0/0 and fill the
            # feature with NaN; map it to the lower bound of the range instead.
            x_df[column_name] = 0.0
        else:
            x_df[column_name] = (column - m0) / value_range

    # Replace each categorical column by its indicator columns, appended at
    # the end of the frame in the order the categories are listed.
    for column_name in category_column_names:
        indicators = pd.get_dummies(x_df[column_name], prefix=column_name[:3])
        x_df = x_df.drop(columns=column_name).join(indicators)

    # Split the label off the feature frame.
    y_df = x_df.pop('DEATH_EVENT')

    return x_df, y_df

In [5]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Name of the registered dataset to look up; description_text is defined here
# but not used by the cells visible in this notebook.
key = "HeartFailurePrediction"
description_text = "Heart Failure Prediction DataSet for Udacity Capstone Project"

# Check whether the workspace already has a dataset registered under `key`.
found = key in ws.datasets.keys()

if found:
    dataset_tmp = ws.datasets[key]
else:
    # Nothing is registered here; the message only tells the user what to do.
    print("Register heart_failure_clinical_records_dataset.csv into Workspace")


In [6]:
# Get the workspace's default datastore.
datastore = ws.get_default_datastore()

# Try to reuse a previously uploaded copy of the prepared data.
try:
    ds_prepared = TabularDatasetFactory.from_delimited_files(datastore.path("data/heartfailure_prepared.csv"))
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt and SystemExit still stop the notebook.
    print("heartfailure_prepared.csv is not available")
    ds_prepared = None

if ds_prepared is None:
    # Use the clean_data function to clean your data.
    x, y = clean_data(dataset_tmp)

    # Put the label back so features and label are stored in a single CSV.
    x["DEATH_EVENT"] = y

    local_data_dir = os.path.join(".", "data")
    os.makedirs(local_data_dir, exist_ok=True)
    x.to_csv(os.path.join(local_data_dir, "heartfailure_prepared.csv"), index=False)

    # Upload the local data folder to the datastore under "data".
    datastore.upload(local_data_dir, target_path="data")

    ds_prepared = TabularDatasetFactory.from_delimited_files(datastore.path("data/heartfailure_prepared.csv"))

heartfailure_prepared.csv is not available
Uploading an estimated of 1 files
Uploading ./data/heartfailure_prepared.csv
Uploaded ./data/heartfailure_prepared.csv, 1 files out of an estimated total of 1
Uploaded 1 files


### Create compute target

In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Settings of the compute cluster.
cpu_cluster_name = "cpu-cluster-01vx"
vm_size = "Standard_DS3_v2"
min_nodes, max_nodes = 0, 6


# Look the cluster up by name; provision a new one only when the lookup
# raises ComputeTargetException.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print("Found existing cluster. use it")

compute_target.wait_for_completion(show_output=True)

# Print the detailed status of the cluster.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

InProgress......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Resizing', 'allocationStateTransitionTime': '2021-09-08T01:44:43.578000+00:00', 'errors': None, 'creationTime': '2021-09-08T01:44:43.209248+00:00', 'modifiedTime': '2021-09-08T01:45:08.783482+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 6, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS3_V2'}


## AutoML Configuration

Our task is to build a binary classification model. 
The model's performance is measured by accuracy (`primary_metric`).
To reduce overfitting, 5-fold cross-validation is used (`n_cross_validations`). 
So that the best model can be saved in ONNX format, `enable_onnx_compatible_models` is set to `True`. 
To speed up training, iterations are run concurrently (`max_concurrent_iterations`).

In [8]:
# AutoML settings: time budget, concurrency (one less than the cluster's
# maximum node count), metric, cross-validation folds and ONNX compatibility.
automl_settings = dict(
    experiment_timeout_minutes=30,
    max_concurrent_iterations=max_nodes - 1,
    primary_metric='accuracy',
    n_cross_validations=5,
    enable_onnx_compatible_models=True,
)

# AutoML configuration: classification on the prepared dataset with
# DEATH_EVENT as the label, run on the compute cluster.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=ds_prepared,
    label_column_name="DEATH_EVENT",
    path=project_folder,
    featurization='auto',
    enable_early_stopping=True,
    debug_log="automl_errors.log",
    **automl_settings
)

In [9]:
# Submit the AutoML configuration to the experiment and keep the run handle
# used by the cells below.
automl_run = experiment.submit(automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-capstone-project,AutoML_3416e583-4040-4d54-a228-95c0f2332f2b,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

In the cell below, use the `RunDetails` widget to show the different experiments.

In [10]:
# Render the RunDetails widget for the submitted AutoML run.
RunDetails(automl_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [11]:
# Wait for the AutoML run to complete, showing its output in this cell.
automl_run.wait_for_completion(show_output=True)

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-capstone-project,AutoML_3416e583-4040-4d54-a228-95c0f2332f2b,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

******************************************************************

{'runId': 'AutoML_3416e583-4040-4d54-a228-95c0f2332f2b',
 'target': 'cpu-cluster-01vx',
 'status': 'Completed',
 'startTimeUtc': '2021-09-08T01:45:34.70118Z',
 'endTimeUtc': '2021-09-08T02:09:39.710986Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'cpu-cluster-01vx',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"cac1ae32-30a6-4bbd-b312-bfc62f28d88c\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.33.0", "azureml-train": "1.33.0", "azureml-train-restclients-hyperdrive": "1.33.0", "azureml-train-core": "1.33.0", "azureml-train-automl": "1.33.0", "azureml-train-automl-runtime": "1.33.0", "azureml-train-automl-client": "1.33

## Best Model

In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [12]:
# Retrieve the best child run and its fitted model from the AutoML run.
best_run, best_model = automl_run.get_output()

In [19]:
# Print every (name, estimator) step of the fitted model.
print(best_model.steps)

[('datatransformer', DataTransformer(
    task='classification',
    is_onnx_compatible=True,
    enable_feature_sweeping=False,
    enable_dnn=False,
    force_text_dnn=False,
    feature_sweeping_timeout=86400,
    featurization_config=None,
    is_cross_validation=True,
    feature_sweeping_config={}
)), ('prefittedsoftvotingclassifier', PreFittedSoftVotingClassifier(
    estimators=[('39', Pipeline(
        memory=None,
        steps=[('standardscalerwrapper', StandardScalerWrapper(
            copy=True,
            with_mean=True,
            with_std=True
        )), ('randomforestclassifier', RandomForestClassifier(
            bootstrap=False,
            ccp_alpha=0.0,
            class_weight=None,
            criterion='gini',
            max_depth=None,
            max_features=0.2,
            max_leaf_nodes=None,
            max_samples=None,
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            min_samples_leaf=0.01,
            min_sam

In [23]:
# Print only the estimator object of the last step.
print(best_model.steps[-1][-1])

PreFittedSoftVotingClassifier(
    estimators=[('39', Pipeline(
        memory=None,
        steps=[('standardscalerwrapper', StandardScalerWrapper(
            copy=True,
            with_mean=True,
            with_std=True
        )), ('randomforestclassifier', RandomForestClassifier(
            bootstrap=False,
            ccp_alpha=0.0,
            class_weight=None,
            criterion='gini',
            max_depth=None,
            max_features=0.2,
            max_leaf_nodes=None,
            max_samples=None,
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            min_samples_leaf=0.01,
            min_samples_split=0.2442105263157895,
            min_weight_fraction_leaf=0.0,
            n_estimators=400,
            n_jobs=1,
            oob_score=False,
            random_state=None,
            verbose=0,
            warm_start=False
        ))],
        verbose=False
    )), ('44', Pipeline(
        memory=None,
        steps=[('sparseno

In [27]:
# Count the estimators held by the last step.
print(len(best_model.steps[-1][-1].estimators))

7


In [26]:
# Print the `weights` attribute of the last step.
print(best_model.steps[-1][-1].weights)

[0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285]


In [12]:
# Fetch the best run again, this time requesting the ONNX version of the model.
# NOTE: this rebinds `best_model`; the inspection cells above that read
# `best_model.steps` used the object returned by the earlier get_output() call.
best_run, best_model = automl_run.get_output(return_onnx_model=True)

# Display the model as the cell output.
best_model  #best_model.steps

ir_version: 7
producer_name: "AutoML"
producer_version: "1.33.0"
domain: "ai.onnx"
model_version: 0
doc_string: "{\"AutoMLSDKVer\": \"1.33.0\", \"ExperimentName\": \"udacity-capstone-project\", \"RunId\": \"AutoML_480c26e0-e1fe-4752-8861-63cc6692af37_58\", \"PipeId\": \"__AutoML_Ensemble__\"}"
graph {
  node {
    input: "age"
    output: "variable_c0_t0"
    name: "Imputer"
    op_type: "Imputer"
    attribute {
      name: "imputed_value_floats"
      floats: 0.37879806756973267
      type: FLOATS
    }
    attribute {
      name: "replaced_value_float"
      f: nan
      type: FLOAT
    }
    domain: "ai.onnx.ml"
  }
  node {
    input: "creatinine_phosphokinase"
    output: "variable_c1_t0"
    name: "Imputer1"
    op_type: "Imputer"
    attribute {
      name: "imputed_value_floats"
      floats: 0.07129873335361481
      type: FLOATS
    }
    attribute {
      name: "replaced_value_float"
      f: nan
      type: FLOAT
    }
    domain: "ai.onnx.ml"
  }
  node {
    input: "ejec

In [13]:
# Save the best model
# `best_model` is written to ./automl_model.onnx; it is expected to be the
# object returned by get_output(return_onnx_model=True) in the previous cell.
from azureml.automl.runtime.onnx_convert import OnnxConverter

OnnxConverter.save_onnx_model(best_model, file_path="./automl_model.onnx")

## Model Deployment

Remember you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

In the cell below, register the model, create an inference config and deploy the model as a web service.

In [28]:
# Register the model to deploy
# The model is registered from the AutoML run under the name stored in `key`.
model = automl_run.register_model(
    model_name=key, 
    description="Binary classification model for Heart Failure prediction" 
)

# Print the identifier of the registered model.
print(model.id)

HeartFailurePrediction:1


### Local Testing

In [29]:
# Get the environment of the best run; the cells below pass it to both the
# local and the ACI inference configurations.
local_env = best_run.get_environment()

In [37]:
# Make sure the target folder exists, then write the environment definition
# into it, replacing any previous copy.
env_directory = "env_automl"
os.makedirs(env_directory, exist_ok=True)
local_env.save_to_directory(env_directory, overwrite=True)

In [30]:
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import LocalWebservice

# Scoring script and environment for the local deployment.
dummy_inference_config = InferenceConfig(
    entry_script="./score.py",
    source_directory="./source_dir",
    environment=local_env,
)

# Local web service configuration bound to port 6789.
deployment_config = LocalWebservice.deploy_configuration(port=6789)

# Deploy the registered model locally under the lower-cased dataset key.
local_service = Model.deploy(
    workspace=ws,
    name=key.lower(),
    models=[model],
    inference_config=dummy_inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)

local_service.wait_for_deployment(show_output=True)

Downloading model HeartFailurePrediction:1 to /tmp/azureml_djd2k2w2/HeartFailurePrediction/1
Generating Docker build context.
Package creation Succeeded
Logging into Docker registry viennaglobal.azurecr.io
Logging into Docker registry viennaglobal.azurecr.io
Building Docker image from Dockerfile...
Step 1/5 : FROM viennaglobal.azurecr.io/azureml/azureml_1f628e9a2f9356112aaaf0c294b1f6e5
 ---> 4d864807f148
Step 2/5 : COPY azureml-app /var/azureml-app
 ---> 2fffb5d57503
Step 3/5 : RUN mkdir -p '/var/azureml-app' && echo eyJhY2NvdW50Q29udGV4dCI6eyJzdWJzY3JpcHRpb25JZCI6ImEyNGEyNGQ1LThkODctNGM4YS05OWI2LTkxZWQyZDJkZjUxZiIsInJlc291cmNlR3JvdXBOYW1lIjoiYW1sLXF1aWNrc3RhcnRzLTE1NzI4NCIsImFjY291bnROYW1lIjoicXVpY2stc3RhcnRzLXdzLTE1NzI4NCIsIndvcmtzcGFjZUlkIjoiYzBkODRiZjQtYjJkZC00MTZkLTlkNmQtODViZjg4ZmZkZjk2In0sIm1vZGVscyI6e30sIm1vZGVsc0luZm8iOnt9fQ== | base64 --decode > /var/azureml-app/model_config_map.json
 ---> Running in cbe2d67d2311
 ---> 5981b984d3b7
Step 4/5 : RUN mv '/var/azureml-app/tmpz79py

In [31]:
import requests
import json

local_uri = local_service.scoring_uri

# Issue a GET against the local port; the response is not inspected.
requests.get("http://localhost:6789")

# One sample patient record.
# NOTE(review): the values are sent unscaled and as strings, whereas
# clean_data() min-max scales and one-hot encodes the training data.
# Confirm that score.py applies the matching preprocessing.
sample_patient = {
    "age": "65",
    "anaemia": "0",
    "creatinine_phosphokinase": "146",
    "diabetes": "0",
    "ejection_fraction": "20",
    "high_blood_pressure": "0",
    "platelets": "162000",
    "serum_creatinine": "1.3",
    "serum_sodium": "129",
    "sex": "1",
    "smoking": "1",
    "time": "7",
}
data = {"data": [sample_patient]}
headers = {"Content-Type": "application/json"}

# POST the JSON payload to the scoring URI and print the decoded reply.
input_data = json.dumps(data)
response = requests.post(local_uri, data=input_data, headers=headers)
print(response.json())

{"result": [1]}


### Deployed environment

In [32]:
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice, Webservice

# Scoring script and environment for the ACI deployment.
inference_config = InferenceConfig(
    entry_script='./score.py',
    source_directory='./source_dir',
    environment=local_env,
)

# ACI sizing, with authentication and Application Insights switched on.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    auth_enabled=True,
    enable_app_insights=True,
)

# Deploy the registered model as a web service named after the lower-cased key.
service = Model.deploy(
    workspace=ws,
    name=key.lower(),
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
)

service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-09-08 02:58:17+00:00 Creating Container Registry if not exists.
2021-09-08 02:58:17+00:00 Registering the environment.
2021-09-08 02:58:17+00:00 Use the existing image.
2021-09-08 02:58:17+00:00 Generating deployment configuration.
2021-09-08 02:58:18+00:00 Submitting deployment to compute.
2021-09-08 02:58:21+00:00 Checking the status of deployment heartfailureprediction..
2021-09-08 03:03:13+00:00 Checking the status of inference endpoint heartfailureprediction.
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [33]:
# Keep the first of the service's authentication keys for the request below.
primary_key = service.get_keys()[0]

# Check that scoring_uri is available
print(service.scoring_uri)

http://10a4a7b8-ab86-4a35-8a93-5b213b09c875.southcentralus.azurecontainer.io/score


In the cell below, send a request to the web service you deployed to test it.

In [34]:
import requests

uri = service.scoring_uri

# The service was deployed with auth_enabled=True, so the key is sent as a
# bearer token alongside the JSON content type.
headers = {
    "Content-Type": "application/json",
    "Authorization": f'Bearer {primary_key}',
}

# Two sample patient records to score in one request.
first_patient = {
    "age": "65",
    "anaemia": "0",
    "creatinine_phosphokinase": "146",
    "diabetes": "0",
    "ejection_fraction": "20",
    "high_blood_pressure": "0",
    "platelets": "162000",
    "serum_creatinine": "1.3",
    "serum_sodium": "129",
    "sex": "1",
    "smoking": "1",
    "time": "7",
}
second_patient = {
    "age": "65",
    "anaemia": "1",
    "creatinine_phosphokinase": "52",
    "diabetes": "0",
    "ejection_fraction": "23",
    "high_blood_pressure": "0",
    "platelets": "272000",
    "serum_creatinine": "1.3",
    "serum_sodium": "133",
    "sex": "0",
    "smoking": "0",
    "time": "17",
}
data = {"data": [first_patient, second_patient]}

# POST the JSON payload to the deployed endpoint and print the decoded reply.
input_data = json.dumps(data)
response = requests.post(uri, data=input_data, headers=headers)

print(response.json())

{"result": [1, 1]}


In the cell below, print the logs of the web service and delete the service

In [35]:
# Retrieve the logs of the deployed web service and show them as the cell output.
service.get_logs()

'2021-09-08T03:02:57,142881900+00:00 - iot-server/run \n2021-09-08T03:02:57,145281000+00:00 - gunicorn/run \nDynamic Python package installation is disabled.\nStarting HTTP server\n2021-09-08T03:02:57,150668000+00:00 - rsyslog/run \n2021-09-08T03:02:57,173968000+00:00 - nginx/run \nrsyslogd: /azureml-envs/azureml_77e992fdaf2b83b0c81ee1c90ded8d17/lib/libuuid.so.1: no version information available (required by rsyslogd)\nEdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...\n2021-09-08T03:02:57,643812600+00:00 - iot-server/finish 1 0\n2021-09-08T03:02:57,645823300+00:00 - Exit code 1 is normal. Not restarting iot-server.\nStarting gunicorn 20.1.0\nListening at: http://127.0.0.1:31311 (64)\nUsing worker: sync\nworker timeout is set to 300\nBooting worker with pid: 92\nSPARK_HOME not set. Skipping PySpark Initialization.\nGenerating new fontManager, this may take some time...\nInitializing logger\n2021-09-08 03:03:03,580 | root | INFO | Starting up app insights client\

## Clean up resources and services

In [22]:
# Delete the deployed ACI web service.
service.delete()

# Also remove the local web service created during local testing; the
# original cleanup left it in place.
local_service.delete()

# Delete the registered model and the compute cluster.
model.delete()

compute_target.delete()

# Mark the interactive run opened with experiment.start_logging() as
# completed; nothing else in the notebook closes it.
run.complete()