# Automated ML

The required dependencies are imported here.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources
import xgboost

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.environment import Environment 
from azureml.core.model import InferenceConfig 
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import Model
from azureml.pipeline.steps import AutoMLStep
from azureml.widgets import RunDetails
import joblib


# Report the installed Azure ML core SDK version (azureml.core.VERSION) so the
# environment used for this run is recorded in the notebook output
print("SDK version:", azureml.core.VERSION)

SDK version: 1.22.0


## Create an Azure ML experiment

An experiment named 'titanic-survival' is created, and the path of a project folder (`./capstone-project`) for the training scripts is defined. The script runs will be recorded under this experiment in Azure.

## Dataset

The dataset used for this project is the "Titanic - Machine Learning from Disaster" dataset from [Kaggle](https://www.kaggle.com/c/titanic). The aim is to predict which passengers survived the Titanic shipwreck.

Dataset features:

| Variable | Definition | Key |
|----------|------------|-----|
|survival|Survival|0 = No, 1 = Yes|
|pclass|Ticket class|1 = 1st, 2 = 2nd, 3 = 3rd|
|sex|Sex| |	
|Age|Age in years| |	
|sibsp|# of siblings / spouses aboard the Titanic| |
|parch|# of parents / children aboard the Titanic| |	
|ticket|Ticket number| |	
|cabin|Cabin number| |	
|fare|Passenger fare| |	
|embarked|Port of Embarkation|C = Cherbourg, Q = Queenstown, S = Southampton|

In [2]:
# Names shared by the rest of the notebook
experiment_name = 'titanic-survival'
project_folder = './capstone-project'
key = "titanic-survival-data"
description_text = "Kaggle dataset for Titanic disaster."

# Load the workspace from the local configuration and print its identifying details
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')
experiment = Experiment(ws, experiment_name)

# Build and register the tabular dataset only when the workspace does not
# already hold one under `key`; otherwise reuse the registered dataset
if key not in ws.datasets.keys():
    dataset_url = 'https://raw.githubusercontent.com/j0h4nnesk/Capstone_project_Titanic_Survival/main/train.csv'
    dataset = Dataset.Tabular.from_delimited_files(dataset_url)
    dataset = dataset.register(
        workspace=ws,
        name=key,
        description=description_text,
    )
else:
    dataset = ws.datasets[key]

# NOTE(review): this interactive run is started but never used or completed in
# the visible cells — confirm whether it is still needed.
run = experiment.start_logging()

capstone-project
capstone-project
francecentral
2d4b3a3e-de2a-45bb-9ac0-29caf8f98da4


In [3]:
# Name of the compute cluster to reuse or create
cpu_cluster_name = "compute-cluster"

# Look the cluster up in the workspace first; provision it only if the lookup fails
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print('Creating a new compute cluster...')
    # The new cluster is configured with between 1 and 6 Standard_DS3_v2 nodes
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_DS3_v2',
        min_nodes=1,
        max_nodes=6,
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster is ready, echoing provisioning progress
compute_target.wait_for_completion(show_output=True)

# Show the detailed status of the cluster
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Creating a new compute cluster...
Creating
Succeeded.....................
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 1, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-03-09T05:12:24.809000+00:00', 'errors': None, 'creationTime': '2021-03-09T05:10:25.996387+00:00', 'modifiedTime': '2021-03-09T05:10:41.503302+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 1, 'maxNodeCount': 6, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS3_V2'}


## AutoML Configuration

Here is an overview of AutoML settings and configurations:

##### AutoML settings
*n_cross_validations=2:* How many cross validations to perform when user validation data is not specified.

*primary_metric='accuracy':* The metric that Automated Machine Learning will optimize for model selection.

*enable_early_stopping=True:* Whether to enable early termination if the score is not improving in the short term.

*max_concurrent_iterations=4:* Represents the maximum number of iterations that would be executed in parallel.

*experiment_timeout_minutes=60:* Exit criterion that defines how long, in minutes, the experiment should continue to run. To help avoid experiment timeout failures, 60 minutes was used as the timeout value.

*verbosity=logging.INFO:* The verbosity level for writing to the log file.

##### AutoML config

*compute_target=compute_target:* The compute target to run the Automated Machine Learning experiment on.

*task='classification':* The type of task to run.

*blocked_models=['XGBoostClassifier']:* The XGBoostClassifier algorithm is blocked to prevent an AttributeError (`/anaconda/envs/azureml_py36/lib/libxgboost.so: undefined symbol: XGBoosterUnserializeFromBuffer`).

*training_data=dataset:* The training data to be used within the experiment. It should contain both training features and a label column.

*label_column_name='Survived':* The name of the label column, the target column based on which the prediction is done.

*path = project_folder:* The full path to the Azure Machine Learning project folder.

*featurization='auto':* Indicator for whether the featurization step should be done automatically or not. Accepted values are 'auto', 'off', or a `FeaturizationConfig` object.

*debug_log = "automl_errors.log":* The log file to write debug information to. 

*enable_onnx_compatible_models=False:* Whether to enable or disable enforcing the ONNX-compatible models. ([ONNX](https://docs.microsoft.com/en-us/azure/machine-learning/concept-onnx) can help optimize the inference of your machine learning model. Inference, or model scoring, is the phase where the deployed model is used for prediction, most commonly on production data.)

In [4]:

# Run-level AutoML settings; unpacked into AutoMLConfig below as keyword arguments
automl_settings = dict(
    n_cross_validations=2,
    primary_metric='accuracy',
    enable_early_stopping=True,
    max_concurrent_iterations=4,
    experiment_timeout_minutes=60,
    verbosity=logging.INFO,
)

# Task, data, compute and logging configuration for the AutoML experiment
automl_config = AutoMLConfig(
    task='classification',
    compute_target=compute_target,
    training_data=dataset,
    label_column_name='Survived',
    blocked_models=['XGBoostClassifier'],
    featurization='auto',
    enable_onnx_compatible_models=False,
    path=project_folder,
    debug_log="automl_errors.log",
    **automl_settings
)

In [5]:
# Submit the AutoML configuration to the experiment; show_output=True echoes the
# run's progress into the cell output
remote_run = experiment.submit(automl_config, show_output = True)
# As the cell's last expression, the value returned by wait_for_completion()
# is displayed in the cell output
remote_run.wait_for_completion()

Running on remote.
No run_configuration provided, running on compute-cluster with default configuration
Running on remote compute: compute-cluster
Parent Run ID: AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the run complete. Otherwise cancel the current run and use a script to cu

{'runId': 'AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-03-09T05:12:51.394879Z',
 'endTimeUtc': '2021-03-09T06:59:18.15718Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'compute-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"e5b70af0-9733-45c6-b513-070b9f2dca96\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"isArchive\\\\\\": false, \\\\\\"path\\\\\\": {\\\\\\"target\\\\\\": 4, \\\\\\"resourceDetails\\\\\\": [{\\\\\\"path\\\\\\": \\\\\\"https://raw.githubusercontent.com/j0h4nnesk/Capstone_project_Titanic_Survival/main/train.csv\\\\\\"}]}}, \\\\\\"localData\\\\\\": {}, \\\\\\"isEnabled\\\\\\": true, \\\

## Run Details

Here the `RunDetails` widget is used to show the child runs of the experiment.

In [6]:
# Render the widget that tracks the AutoML parent run
run_widget = RunDetails(remote_run)
run_widget.show()

# Print a separator line followed by the summary of every child run
for child_run in remote_run.get_children():
    print('**************************************************', child_run, sep='\n')
_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

**************************************************
Run(Experiment: titanic-survival,
Id: AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913_57,
Type: azureml.scriptrun,
Status: Completed)
**************************************************
Run(Experiment: titanic-survival,
Id: AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913_56,
Type: azureml.scriptrun,
Status: Completed)
**************************************************
Run(Experiment: titanic-survival,
Id: AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913_52,
Type: azureml.scriptrun,
Status: Completed)
**************************************************
Run(Experiment: titanic-survival,
Id: AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913_55,
Type: azureml.scriptrun,
Status: Canceled)
**************************************************
Run(Experiment: titanic-survival,
Id: AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913_51,
Type: azureml.scriptrun,
Status: Completed)
**************************************************
Run(Experiment: titanic-survival,
Id: AutoML_

## Best Model

The best model is fetched from the AutoML experiment, and its metrics, details and properties are displayed.



In [7]:
# Retrieve the best child run together with its fitted pipeline
best_run, fitted_model = remote_run.get_output()

# Each report pairs a printed label with the Run accessor that produces it:
#   get_metrics()    -> the metrics of the run
#   get_details()    -> a dictionary with the details for the run
#   get_properties() -> the latest properties of the run, fetched from the service
best_run_reports = (
    ("Best run metrics :", best_run.get_metrics),
    ("Best run details :", best_run.get_details),
    ("Best run properties :", best_run.get_properties),
)

for report_label, fetch_report in best_run_reports:
    print(report_label, fetch_report())
    print('**************************************************')

Package:azureml-automl-runtime, training version:1.23.0, current version:1.22.0
Package:azureml-core, training version:1.23.0, current version:1.22.0
Package:azureml-dataprep, training version:2.10.1, current version:2.9.1
Package:azureml-dataprep-native, training version:30.0.0, current version:29.0.0
Package:azureml-dataprep-rslex, training version:1.8.0, current version:1.7.0
Package:azureml-dataset-runtime, training version:1.23.0, current version:1.22.0
Package:azureml-defaults, training version:1.23.0, current version:1.22.0
Package:azureml-interpret, training version:1.23.0, current version:1.22.0
Package:azureml-mlflow, training version:1.23.0, current version:1.22.0
Package:azureml-pipeline-core, training version:1.23.0, current version:1.22.0
Package:azureml-telemetry, training version:1.23.0, current version:1.22.0
Package:azureml-train-automl-client, training version:1.23.0, current version:1.22.0
Package:azureml-train-automl-runtime, training version:1.23.0, current versio

Best run metrics : {'norm_macro_recall': 0.6081211850424402, 'balanced_accuracy': 0.8040605925212201, 'precision_score_macro': 0.8163313179913287, 'AUC_macro': 0.8575795397361312, 'precision_score_weighted': 0.8226500440232183, 'AUC_micro': 0.869405980033039, 'average_precision_score_weighted': 0.8541176504791648, 'recall_score_weighted': 0.8215397793117347, 'accuracy': 0.8215397793117347, 'matthews_correlation': 0.6201458540816633, 'recall_score_micro': 0.8215397793117347, 'recall_score_macro': 0.8040605925212201, 'f1_score_weighted': 0.8196814588041497, 'log_loss': 0.48073773272672893, 'f1_score_macro': 0.8075850728873257, 'average_precision_score_macro': 0.8494177540532644, 'weighted_accuracy': 0.8372542428022359, 'precision_score_micro': 0.8215397793117347, 'average_precision_score_micro': 0.8470509616921699, 'f1_score_micro': 0.8215397793117347, 'AUC_weighted': 0.8575795397361313, 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913_56

In [8]:
# Print the fitted pipeline returned by remote_run.get_output() to inspect its steps
print(fitted_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                  l1_ratio=0.7959183673469387,
                                                                                                  learning_rate='invscaling',
                                                                                                  loss='modified_huber',
                  

In [9]:
# List the files stored with the best run. The listing is printed explicitly:
# a bare expression is only displayed when it is the last statement of a cell,
# so calling get_file_names() on its own line here would discard the result.
print(best_run.get_file_names())

# Download the yaml file that includes the environment dependencies
best_run.download_file('outputs/conda_env_v_1_0_0.yml', 'env.yml')

# Download the model file
best_run.download_file('outputs/model.pkl', 'Automl_model.pkl')

In [10]:
# Display the best child run using the notebook's rich representation of the Run object
best_run


Experiment,Id,Type,Status,Details Page,Docs Page
titanic-survival,AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913_56,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [11]:
# Register the best run's './outputs/' path in the workspace under the name "best_run_automl.pkl".
# NOTE(review): the returned Model object is discarded here, and the deployment
# cell registers a model under the same name again via remote_run.register_model()
# — confirm whether both registrations are intended.
best_run.register_model(model_name = "best_run_automl.pkl", model_path = './outputs/')

# Show which run the model was registered from
print(best_run)

Run(Experiment: titanic-survival,
Id: AutoML_be16ab39-a919-4f10-84ed-30af3e4fa913_56,
Type: azureml.scriptrun,
Status: Completed)


## Model Deployment

As the best model from the AutoML run has better accuracy than the one coming from the HyperDrive run, the AutoML model is deployed. The model is registered, an inference config is created, and the model is deployed as a web service.


In [13]:
# Reference: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice.aci.aciwebservice?view=azure-ml-py

# Deployment constants
service_name = 'titanic-service'
entry_script='inference/scoring.py'

# Register the model through the AutoML parent run and show the resulting model id
model = remote_run.register_model(model_name = 'best_run_automl.pkl')
print(remote_run.model_id)

# Take the environment and the generated scoring script from the best run
environment = best_run.get_environment()
best_run.download_file('outputs/scoring_file_v_1_0_0.py', entry_script)

inference_config = InferenceConfig(
    entry_script=entry_script,
    environment=environment,
)

# ACI sizing plus key-based authentication and Application Insights
aci_settings = dict(
    cpu_cores=1,
    memory_gb=1,
    auth_enabled=True,
    enable_app_insights=True,
)
aci_config = AciWebservice.deploy_configuration(**aci_settings)

# Deploy the registered model as a web service and wait for the deployment to finish
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aci_config,
)
service.wait_for_deployment(show_output = True)

best_run_automl.pkl
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.....................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"


A request is sent to the web service to test it.

In [14]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    """Switch the default HTTPS context to an unverified one on the client side.

    Nothing is changed when `allowed` is falsy, when the PYTHONHTTPSVERIFY
    environment variable is set to a non-empty value, or when the `ssl` module
    has no `_create_unverified_context` attribute.
    """
    if not allowed:
        return
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        return
    unverified_context_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_context_factory:
        # Bypass server certificate verification for subsequent HTTPS requests
        ssl._create_default_https_context = unverified_context_factory

# This call is needed if the scoring service uses a self-signed certificate.
allowSelfSignedHttps(True)

# Sample passenger record sent to the scoring endpoint
data = {
    "data":
    [
        {
            'PassengerId': "892",
            'Pclass': "3",
            'Name': "Kelly, Mr. James",
            'Sex': "male",
            'Age': "34.5",
            'SibSp': "0",
            'Parch': "0",
            'Ticket': "330911",
            'Fare': "7.892",
            'Cabin': "",
            'Embarked': "Q",
        },
    ],
}

body = json.dumps(data).encode('utf-8')

# Read the endpoint and the authentication key from the service deployed in the
# previous cell instead of hard-coding them: both change with every deployment,
# and a key committed to the notebook is leaked to everyone who can read it.
url = service.scoring_uri
api_key, _ = service.get_keys()  # (primary, secondary); the primary key is used
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}

req = urllib.request.Request(url, body, headers)

try:
    # The context manager closes the HTTP response once the body has been read
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())

    # The error body is usually JSON; fall back to the raw text when it is not
    error_text = error.read().decode("utf8", 'ignore')
    try:
        print(json.loads(error_text))
    except ValueError:
        print(error_text)

b'"{\\"result\\": [0]}"'


In [15]:
# Print the logs retrieved from the deployed web service
print(service.get_logs())

2021-03-09T07:09:21,815232800+00:00 - iot-server/run 
2021-03-09T07:09:21,816601700+00:00 - gunicorn/run 
2021-03-09T07:09:21,822278700+00:00 - rsyslog/run 
2021-03-09T07:09:21,858716200+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [16]:
# Delete the deployed web service now that it has been tested.
# NOTE(review): the compute cluster is provisioned with min_nodes=1 and is not
# removed anywhere in this notebook — confirm whether it should be deleted here too.
service.delete()