<center><h1>Automated ML</h1></center>

### Import Dependencies. 

In [6]:
from azureml.core import Workspace, Experiment, Dataset
from azureml.train.automl import AutoMLConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails

from azureml.interpret import ExplanationClient
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.webservice import AciWebservice
from azureml.core import Webservice

from azureml.automl.runtime.onnx_convert import OnnxConverter

import pandas as pd

import time
import logging
import requests
import json
import sys

## Dataset

This dataset was collected to develop prediction models that estimate the likelihood that a hotel reservation will be canceled.

I performed the cleaning operations in a separate notebook: hotel-boking-demand-dataset-cleanup.ipynb. In that notebook I removed the null values and the unnecessary columns.

After that I added the clean data set through the Azure Machine Learning interface to be able to consume it within my workspace.

In [7]:
# Connect to the Azure ML workspace using the saved workspace configuration.
ws = Workspace.from_config()

# Every run submitted from this notebook is grouped under this experiment name.
experiment_name = 'HotelBookingDemand'
experiment = Experiment(workspace=ws, name=experiment_name)

In [8]:
# Name of the compute cluster that executes the AutoML iterations.
cpu_cluster_name = "notebook134555"

# Reuse the cluster when it already exists in the workspace; the lookup raises
# ComputeTargetException otherwise, in which case a new cluster is provisioned.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_V2",
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the cluster (new or existing) before continuing.
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.

Running


In [9]:
# Load the registered, already-cleaned dataset from the workspace.
dataset = Dataset.get_by_name(workspace=ws, name='hotel_bookings_clean')

# 80/20 split with a fixed seed so the split is repeatable.
training_data, validation_data = dataset.random_split(
    percentage=0.8,
    seed=1,
)

## AutoML Configuration

Here I chose accuracy as the evaluation metric, set an experiment timeout of 30 minutes, enabled the early stopping policy, limited each iteration to 15 minutes, and allowed a maximum of 5 concurrent iterations.

In [10]:
# AutoML settings that are unpacked into AutoMLConfig below.
automl_settings = {
    # The timestamp suffix makes the name different on every execution.
    "name": "AutoML_HotelBookingDemand_Experiment_{0}".format(time.time()),
    # Stop the whole experiment after 30 minutes.
    "experiment_timeout_minutes": 30,
    "enable_early_stopping": True,
    # Abort any single iteration that runs longer than 15 minutes.
    "iteration_timeout_minutes": 15,
    # AutoMLConfig's keyword is `primary_metric`; the previous key
    # `primary_metric_name` is not a recognised setting, so the metric was
    # never actually passed to AutoML.
    "primary_metric": 'accuracy',
    # NOTE(review): the cluster-creation cell provisions max_nodes=4, which is
    # fewer than the 5 concurrent iterations requested here - confirm the
    # intended value against the size of the cluster actually in use.
    "max_concurrent_iterations": 5,
}

# Classification task predicting the `is_canceled` column, trained on the
# remote cluster with ONNX-compatible models enabled so the best model can be
# exported to ONNX later in the notebook.
automl_config = AutoMLConfig(
    task='classification',
    training_data=training_data,
    validation_data=validation_data,
    label_column_name='is_canceled',
    enable_onnx_compatible_models=True,
    compute_target=cpu_cluster,
    **automl_settings
)

In [11]:
# Submit the AutoML configuration to the experiment; progress is followed
# through the RunDetails widget in a later cell rather than streamed here.
remote_run = experiment.submit(automl_config, show_output=False)



Running on remote.


## Run Details

The best overall model was the VotingEnsemble, an ensemble machine learning model that combines the predictions of the previously trained models. Setting the ensemble aside, the best individual model was MaxAbsScaler + LightGBM, with an accuracy of 87% and a training duration of 01:26, while the worst model was StandardScalerWrapper + LightGBM, with 62% accuracy and a duration of 01:25. In other words, with a nearly identical training time, MaxAbsScaler + LightGBM achieved an accuracy 25 percentage points higher using the same algorithm but a different data preprocessing technique (MaxAbsScaler instead of StandardScalerWrapper).

In [12]:
# Show the interactive monitoring widget for the AutoML run.
RunDetails(remote_run).show()
# Block until the remote run has finished, without streaming its output.
remote_run.wait_for_completion(show_output = False)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_85aac5b0-fc10-43db-b9bc-a1f2f2e5d33c',
 'target': 'notebook134555',
 'status': 'Completed',
 'startTimeUtc': '2021-01-13T14:15:22.358048Z',
 'endTimeUtc': '2021-01-13T14:35:31.214729Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'notebook134555',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"59a0e9c8-2fb0-4dba-bc2d-f6c4027a1802\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/01-13-2021_020236_UTC/hotel_bookings_clean.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-134555\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"cdbe0b43-92a0-4715-

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

## Best Model

In [13]:
# Metadata attached to the registered model; both names are reused by the
# second registration in the deployment section.
description = 'Best AutoML Model'
tags = None

# Retrieve the best child run according to accuracy, together with its fitted
# pipeline.
best_run, fitted_model = remote_run.get_output(metric="accuracy")
print(best_run)

# Register the model from the AutoML run in the workspace and show its id.
model = remote_run.register_model(description=description, tags=tags)
print(remote_run.model_id)



Run(Experiment: HotelBookingDemand,
Id: AutoML_85aac5b0-fc10-43db-b9bc-a1f2f2e5d33c_38,
Type: azureml.scriptrun,
Status: Completed)
AutoML85aac5b0f38


In [19]:
# Show the registered model's name, id and version.
print(model)

Model(workspace=Workspace.create(name='quick-starts-ws-134555', subscription_id='cdbe0b43-92a0-4715-838a-f2648cc7ad21', resource_group='aml-quickstarts-134555'), name=AutoML85aac5b0f38, id=AutoML85aac5b0f38:1, version=1, tags={}, properties={})


In [14]:
# Dump the fitted pipeline (featurization step + final estimator).
print(f'fitted_model:\n {fitted_model}\n\n')

fitted_model:
 Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               n_estimators=100,
                                                                                               n_jobs=1,
                                                                                               nthread=None,
                                                  

In [15]:
# The first pipeline step records how each raw column was featurized.
featurizer = fitted_model.named_steps['datatransformer']
df_f = featurizer.get_featurization_summary()

# Render the summary as a table (one row per raw feature).
pd.DataFrame(df_f)

Unnamed: 0,RawFeatureName,TypeDetected,Dropped,EngineeredFeatureCount,Transformations
0,hotel,Categorical,No,1,[ModeCatImputer-StringCast-LabelEncoder]
1,stays_in_weekend_nights,Categorical,No,16,[StringCast-CharGramCountVectorizer]
2,stays_in_week_nights,Categorical,No,31,[StringCast-CharGramCountVectorizer]
3,adults,Categorical,No,13,[StringCast-CharGramCountVectorizer]
4,children,Categorical,No,5,[StringCast-CharGramCountVectorizer]
5,babies,Categorical,No,4,[StringCast-CharGramCountVectorizer]
6,meal,Categorical,No,5,[StringCast-CharGramCountVectorizer]
7,country,Categorical,No,172,[StringCast-CharGramCountVectorizer]
8,market_segment,Categorical,No,8,[StringCast-CharGramCountVectorizer]
9,distribution_channel,Categorical,No,5,[StringCast-CharGramCountVectorizer]


In [16]:
# Rich display of the best run (links to the studio details page).
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
HotelBookingDemand,AutoML_85aac5b0-fc10-43db-b9bc-a1f2f2e5d33c_38,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [32]:
# Download the model explanation stored with the best run (top_k=5).
client = ExplanationClient.from_run(best_run)
global_explanation = client.download_model_explanation(top_k=5)
# NOTE(review): the two values below are not referenced again in the visible
# cells - confirm whether they are still needed.
local_importance_values = global_explanation.local_importance_values
expected_values = global_explanation.expected_values

In [31]:
# NOTE(review): this repeats the same download_model_explanation(top_k=5) call
# made in the previous cell; `global_explanation` could probably be reused.
global_explanation_topk = client.download_model_explanation(top_k=5)
# Ranked global importance values and the matching engineered feature names.
global_importance_values = global_explanation_topk.get_ranked_global_values()
global_importance_names = global_explanation_topk.get_ranked_global_names()

In [33]:
# Show the ranked importance scores next to the feature names they belong to.
print(f'global importance values: {global_importance_values}')
print(f'global importance names: {global_importance_names}')

global importance values: [1.1899616700014413, 0.7297750812134504, 0.6359551754229321, 0.361313226932195, 0.3561995833358805]
global importance names: ['deposit_type_CharGramCountVectorizer_Non Refund', 'country_CharGramCountVectorizer_PRT', 'market_segment_CharGramCountVectorizer_Online TA', 'lead_time_MeanImputer', 'total_of_special_requests_CharGramCountVectorizer_0']


### ONNX Model

In [27]:
# Destination of the exported ONNX file, relative to the notebook directory.
onnx_fl_path = "./best_model.onnx"

# Fetch the best run again, this time asking for the ONNX version of the model.
onnx_best_run, onnx_model = remote_run.get_output(return_onnx_model=True)

# Write the ONNX model to disk.
OnnxConverter.save_onnx_model(onnx_model, onnx_fl_path)



## Model Deployment

In [28]:
# Print every metric logged by the best run, one "name value" pair per line.
best_run_metrics = best_run.get_metrics()
for metric_name, metric in best_run_metrics.items():
    print(metric_name, metric)

# Register the best run's pickled model under a readable name and store its
# accuracy as a model property.
mymodel = best_run.register_model(
    model_name='hoteldemand_automl',
    model_path='outputs/model.pkl',
    description=description,
    tags=tags,
    properties={'accuracy': best_run_metrics['accuracy']},
)

norm_macro_recall 0.710919851655667
precision_score_macro 0.8722395088459242
precision_score_weighted 0.8734535139218588
average_precision_score_weighted 0.9503900665287214
matthews_correlation 0.72750595380261
accuracy 0.8738075313807532
AUC_weighted 0.946336007963732
recall_score_macro 0.8554599258278335
AUC_macro 0.946336007963732
log_loss 0.2842906888065265
average_precision_score_macro 0.9447937880454834
f1_score_macro 0.862391040546894
f1_score_micro 0.8738075313807532
precision_score_micro 0.8738075313807532
AUC_micro 0.9508879133768666
weighted_accuracy 0.8899741070262062
recall_score_micro 0.8738075313807532
f1_score_weighted 0.8723547646796405
average_precision_score_micro 0.953093448777592
balanced_accuracy 0.8554599258278335
recall_score_weighted 0.8738075313807532
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_85aac5b0-fc10-43db-b9bc-a1f2f2e5d33c_38/accuracy_table
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_85aac5b0-fc10-43db-b9bc-a1f2f2e5d33c_38

In [29]:
# Display the model registered from the best run.
mymodel

Model(workspace=Workspace.create(name='quick-starts-ws-134555', subscription_id='cdbe0b43-92a0-4715-838a-f2648cc7ad21', resource_group='aml-quickstarts-134555'), name=hoteldemand_automl, id=hoteldemand_automl:1, version=1, tags={}, properties={'accuracy': '0.8738075313807532'})

In [34]:
# Name of the ACI web service that will host the model.
service_name = "myautomlbestrunv3"

# Build the inference environment from the conda specification file and
# register it in the workspace.
myenv = Environment.from_conda_specification(name="env", file_path="myenv.yml")
myenv.register(workspace=ws)

# Scoring script + environment = inference configuration.
inference_config = InferenceConfig(
    entry_script="score.py",
    environment=myenv,
)

# ACI sizing, with key authentication and Application Insights logging enabled.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=2,
    memory_gb=4,
    auth_enabled=True,
    enable_app_insights=True,
)

# Deploy the registered model; overwrite=True replaces a service that already
# uses this name.
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[mymodel],
    inference_config=inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)

# Block until the deployment has finished, streaming its progress.
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running......................................................................................................................................................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [35]:
# List every web service deployed in the workspace.
print(ws.webservices)

# Look the service up again by name and dump its container logs.
service = Webservice(workspace=ws, name=service_name)
print(service.get_logs())

{'myautomlbestrunv3': AciWebservice(workspace=Workspace.create(name='quick-starts-ws-134555', subscription_id='cdbe0b43-92a0-4715-838a-f2648cc7ad21', resource_group='aml-quickstarts-134555'), name=myautomlbestrunv3, image_id=None, compute_type=None, state=ACI, scoring_uri=None, tags=http://d65bed7a-8cdc-4f0f-b09a-007c9b88fcda.southcentralus.azurecontainer.io/score, properties={}, created_by={'hasInferenceSchema': 'True', 'hasHttps': 'False'})}
2021-01-13T15:14:20,049599583+00:00 - rsyslog/run 
2021-01-13T15:14:20,049680384+00:00 - iot-server/run 
2021-01-13T15:14:20,049908785+00:00 - gunicorn/run 
2021-01-13T15:14:20,053373605+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-en

In [36]:
# Health state and endpoints of the deployed service.
print(service.state)
print(service.scoring_uri)
print(service.swagger_uri)

# Authentication keys of the service; `primary` is used by the scoring cell.
primary, secondary = service.get_keys()
# Never echo the full key: notebook outputs are saved and shared with the
# file. Show only the first four characters to confirm a key was retrieved.
print(primary[:4] + "*" * (len(primary) - 4))

Healthy
http://d65bed7a-8cdc-4f0f-b09a-007c9b88fcda.southcentralus.azurecontainer.io/score
http://d65bed7a-8cdc-4f0f-b09a-007c9b88fcda.southcentralus.azurecontainer.io/swagger.json
jido****************************


In [41]:
# Endpoint and authentication key of the deployed service.
scoring_uri = service.scoring_uri
key = primary

# One sample booking to score.
data = {
    "data": [
     {
          "hotel": "Resort Hotel",
          "lead_time": 85,
          "stays_in_weekend_nights": 0,
          "stays_in_week_nights": 3,
          "adults": 2,
          "children": 0,
          "babies": 0,
          "meal": "BB",
          "country": "PRT",
          "market_segment": "Online TA",
          "distribution_channel": "TA/TO",
          "is_repeated_guest": 0,
          "booking_changes": 0,
          "deposit_type": "No Deposit",
          "agent": 240,
          "company": 0,
          "days_in_waiting_list": 0,
          # Was misspelled "Trasient", which is not a category of the
          # hotel-booking dataset's customer_type column.
          "customer_type": "Transient",
          "adr": 82,
          "required_car_parking_spaces": 1,
          "total_of_special_requests": 1,
          "room": 1,
          "net_cancelled": 0
     }
   ]
}

# Serialise the payload and keep a copy on disk for reuse outside the notebook.
input_data = json.dumps(data)
with open("data.json", "w") as _f:
    _f.write(input_data)

# The service was deployed with key authentication enabled.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {key}",
}

# Bound the wait, and fail loudly on an HTTP error instead of trying to parse
# an error page as a prediction.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=60)
resp.raise_for_status()

# The response body is itself a JSON-encoded string, hence the second decode.
result = json.loads(resp.json())

print("Will the hotel booking be canceled? {}".format("Yes" if int(result['result'][0]) == 1 else "No"))

Will the hotel booking be canceled? No


In [38]:
# Dump the service's container logs again, after the scoring request.
print(service.get_logs()) #logs

2021-01-13T15:14:20,049599583+00:00 - rsyslog/run 
2021-01-13T15:14:20,049680384+00:00 - iot-server/run 
2021-01-13T15:14:20,049908785+00:00 - gunicorn/run 
2021-01-13T15:14:20,053373605+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [42]:
# Clean up: remove the deployed web service now that the demo is finished.
service.delete()