# Automated ML

In [1]:
import json
import logging
import os

import joblib
import pandas as pd
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.run import Run
from azureml.exceptions import ComputeTargetException
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails


## Dataset

### Overview

The "Airbnb for Boston with fraud detection" dataset was downloaded from Kaggle using the following link:

https://www.kaggle.com/datasets/hawkingcr/airbnb-for-boston-with-fraud-detection/download?datasetVersionNumber=1

The downloaded file was saved as "output.csv" in the "data" directory. The dataset aims to classify whether an Airbnb listing is a fraud or not.

A notebook file named "data_process.ipynb" was created to perform some pre-processing on the data. First, a correlation analysis was conducted against the target column "fraud" to identify and remove some non-significant features. Next, the data was split into "train.csv" and "test.csv" sets, and the balance of the training data was examined. Because the training target was imbalanced, an upsampling technique was applied to the training set to address this imbalance.

### Create Training Dataset

In [2]:
from scripts.data_set import getTrainingDataset

# Load the workspace from the local Azure ML configuration.
ws = Workspace.from_config()

# Every run submitted from this notebook is grouped under this experiment name.
experiment_name = 'udacity-aml-capstone-automl'
experiment = Experiment(workspace=ws, name=experiment_name)

# Obtain the training dataset from the project helper and preview its first rows.
train_ds = getTrainingDataset(ws)
train_preview = train_ds.to_pandas_dataframe().head()
train_preview


Validating arguments.
Arguments validated.
Uploading file to airbnb_boston
Uploading an estimated of 1 files
Uploading ./tmp_dir/train.csv
Uploaded ./tmp_dir/train.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Creating new dataset
datastore train data path:  airbnb_boston/train.csv
register training dataset


Unnamed: 0,host_response_rate,host_identity_verified,host_total_listings_count,is_location_exact,property_type,accommodates,price,minimum_nights,number_of_reviews,review_scores_rating,instant_bookable,cancellation_policy,reviews_per_month,fraud
0,95,1,3,1,8,2,6500,2,8,93.0,0,1,0.63,1
1,100,1,1,1,0,8,50000,1,88,98.0,0,1,4.2,1
2,100,1,1,1,8,2,9000,1,192,95.0,0,1,5.58,1
3,90,1,1,1,0,2,11500,1,54,88.0,1,2,3.58,1
4,92,0,8,1,2,6,27500,2,29,91.0,1,2,0.72,1


### Create or Attach an AmlCompute cluster

In [3]:

cluster_name = "my-cluster"

# Attach to the cluster when the lookup succeeds; otherwise provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    provisioning = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, provisioning)
else:
    print("Found existing cluster, use it.")

# Block until the cluster reports that provisioning has finished.
compute_target.wait_for_completion(show_output=True)


Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration

iteration_timeout_minutes: Time limit in minutes for each iteration. Increase this value for larger datasets that need more time per iteration. Here it is set to 10 minutes because the dataset is relatively small.

experiment_timeout_minutes: Maximum amount of time, in minutes, that all iterations combined can take before the experiment terminates. Here it is set to 30 minutes because the dataset is relatively small.

enable_early_stopping: Flag to enable early termination if the score is not improving in the short term.

primary_metric: Since the data is very imbalanced, "AUC_weighted" is chosen here.

featurization: By using auto, the experiment can preprocess the input data (handling missing data, converting text to numeric, etc.)

verbosity: Controls the level of logging.

n_cross_validations: Number of cross-validation folds to perform, since no separate validation data is specified.

In [4]:

# Tunable AutoML settings, kept separate from the fixed task definition below.
automl_settings = dict(
    iteration_timeout_minutes=10,      # time limit for a single iteration
    experiment_timeout_minutes=30,     # time limit for all iterations combined
    max_concurrent_iterations=4,
    enable_early_stopping=True,        # stop early when the score stops improving
    primary_metric='AUC_weighted',
    featurization='auto',
    verbosity=logging.INFO,
    n_cross_validations=5,             # no separate validation set is supplied
)

# Classification task on the "fraud" label, trained on the remote compute target.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=train_ds,
    label_column_name="fraud",
    enable_onnx_compatible_models=True,
    **automl_settings,
)

In [5]:
# Submit the AutoML configuration to the experiment; the returned run is used by all later cells.
auto_run = experiment.submit(config=automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-aml-capstone-automl,AutoML_d9a7c24f-49f6-4d75-9668-6bf0c6b356b4,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details


In [6]:
# Display the notebook widget for the submitted AutoML run.
run_details_widget = RunDetails(auto_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model



In [7]:
# Retrieve the best AutoML model.

# Block until the remote run has finished, streaming its progress into the cell output.
auto_run.wait_for_completion(show_output=True)
assert auto_run.get_status() == "Completed", (
    "AutoML run did not complete successfully: " + str(auto_run.get_status())
)

# get_output() returns the best child run together with its fitted model.
best_auto_run, best_model = auto_run.get_output()

# Save the best model.
# makedirs(exist_ok=True) creates the directory only when it is missing, without a
# separate existence check; `os` is imported in the notebook's top import cell.
output_dir = "./outputs"
os.makedirs(output_dir, exist_ok=True)
best_model_file = output_dir + "/best_model.pkl"
joblib.dump(best_model, best_model_file)



Experiment,Id,Type,Status,Details Page,Docs Page
udacity-aml-capstone-automl,AutoML_d9a7c24f-49f6-4d75-9668-6bf0c6b356b4,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation




********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

********************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

********************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more about high cardinality feat

Package:azureml-automl-runtime, training version:1.52.0.post1, current version:1.51.0.post1
Package:azureml-core, training version:1.52.0, current version:1.51.0
Package:azureml-dataprep, training version:4.11.4, current version:4.10.8
Package:azureml-dataprep-rslex, training version:2.18.4, current version:2.17.12
Package:azureml-dataset-runtime, training version:1.52.0, current version:1.51.0
Package:azureml-defaults, training version:1.52.0, current version:1.51.0
Package:azureml-interpret, training version:1.52.0, current version:1.51.0
Package:azureml-mlflow, training version:1.52.0, current version:1.51.0
Package:azureml-pipeline-core, training version:1.52.0, current version:1.51.0
Package:azureml-responsibleai, training version:1.52.0, current version:1.51.0
Package:azureml-telemetry, training version:1.52.0, current version:1.51.0
Package:azureml-train-automl-client, training version:1.52.0, current version:1.51.0.post1
Package:azureml-train-automl-runtime, training version:1.

['./outputs/best_model.pkl']

In [33]:
# Print the text representation of the best model retrieved from the AutoML run.
print(best_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=False, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=True, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                     

In [12]:
# List the files recorded on the best run and keep a copy of its conda environment.

print(best_auto_run.get_file_names())

best_auto_run_env = best_auto_run.get_environment()
print("best run_env: ", best_auto_run_env)

# Download the run's conda specification next to the saved model.
env_file = f"{output_dir}/automl_env.yml"
best_auto_run.download_file(
    name="outputs/conda_env_v_1_0_0.yml",
    output_file_path=env_file,
)


['accuracy_table', 'automl_driver.py', 'confusion_matrix', 'explanation/8ad05459/classes.interpret.json', 'explanation/8ad05459/eval_data_viz.interpret.json', 'explanation/8ad05459/expected_values.interpret.json', 'explanation/8ad05459/features.interpret.json', 'explanation/8ad05459/global_names/0.interpret.json', 'explanation/8ad05459/global_rank/0.interpret.json', 'explanation/8ad05459/global_values/0.interpret.json', 'explanation/8ad05459/local_importance_values.interpret.json', 'explanation/8ad05459/per_class_names/0.interpret.json', 'explanation/8ad05459/per_class_rank/0.interpret.json', 'explanation/8ad05459/per_class_values/0.interpret.json', 'explanation/8ad05459/rich_metadata.interpret.json', 'explanation/8ad05459/true_ys_viz.interpret.json', 'explanation/8ad05459/visualization_dict.interpret.json', 'explanation/8ad05459/ys_pred_proba_viz.interpret.json', 'explanation/8ad05459/ys_pred_viz.interpret.json', 'explanation/a41d6431/classes.interpret.json', 'explanation/a41d6431/exp

## Predict with best model

In [17]:
from sklearn.metrics import confusion_matrix

# Load the test split and discard rows that have no "fraud" label.
test_file = "./data/test.csv"
df_test = pd.read_csv(test_file).dropna(subset=['fraud'])

# Separate the target column from the feature columns.
y_test = df_test['fraud']
X_test = df_test.drop(columns=['fraud'])

# Score the test features with the locally retrieved best model.
ypred = best_model.predict(X_test)




In [18]:
# Confusion matrix of the local model's predictions, rendered as a shaded table.
cm = confusion_matrix(y_test, ypred)
cm_frame = pd.DataFrame(cm)
cm_frame.style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,647,51
1,81,118


### Retrieve and save ONNX Model

In [19]:
from azureml.automl.runtime.onnx_convert import OnnxConverter

# Ask the AutoML run for its best run again, this time with the ONNX form of the model.
bestrun, onnx_mdl = auto_run.get_output(return_onnx_model=True)

# Write the ONNX model into the output directory.
onnx_fl_path = f"{output_dir}/best_model.onnx"
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)


### Predict with ONNX model


In [20]:
import sys
# json is not re-imported here; it is already imported in the notebook's first cell.
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants
from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

# Show the name of the ONNX resource file that get_onnx_res downloads from the run.
print(constants.MODEL_RESOURCE_PATH_ONNX)

def get_onnx_res(run):
    """Download the run's ONNX resource file and return its parsed JSON content.

    The file named by ``constants.MODEL_RESOURCE_PATH_ONNX`` is saved as
    ``onnx_resource.json`` inside the notebook-level ``output_dir``.
    """
    res_path = output_dir + "/onnx_resource.json"
    run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)
    with open(res_path) as resource_file:
        return json.load(resource_file)


# The inference helper is only run on interpreters older than
# OnnxConvertConstants.OnnxIncompatiblePythonVersion.
if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:
    # The helper takes the serialized model bytes plus the resource JSON from the run.
    mdl_bytes = onnx_mdl.SerializeToString()
    onnx_result = get_onnx_res(bestrun)

    onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_result)
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(X_test)
else:
    # This branch is reached when the interpreter is too NEW, so the message must ask
    # for an older Python rather than a newer one. Note that pred_onnx stays undefined
    # here, so the following ONNX confusion-matrix cell cannot run.
    print(
        "The ONNX inference helper requires a Python version lower than "
        f"{OnnxConvertConstants.OnnxIncompatiblePythonVersion}; "
        f"this kernel runs Python {sys.version.split()[0]}."
    )
    

outputs/model_onnx.json


2023-08-05 13:37:18.243719530 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {1} does not match actual shape of {897} for output output_label


In [21]:
# Confusion matrix of the ONNX model's predictions, rendered as a shaded table.
cm = confusion_matrix(y_test, pred_onnx)
onnx_cm_frame = pd.DataFrame(cm)
onnx_cm_frame.style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,647,51
1,78,121


## Model Registration


In [22]:

# Register the AutoML run's model under the name recorded in the best run's properties.
model_name = best_auto_run.properties["model_name"]
description = "AutoML Model to predict Airbnb fraud listing"
tags = None

registered_model = auto_run.register_model(
    model_name=model_name,
    description=description,
    tags=tags,
)

# Show the id assigned to the registered model, followed by the model object itself.
print(auto_run.model_id)
print("registered model: ", registered_model)

AutoMLd9a7c24f414
registered model:  Model(workspace=Workspace.create(name='quick-starts-ws-239768', subscription_id='510b94ba-e453-4417-988b-fbdc37b55ca7', resource_group='aml-quickstarts-239768'), name=AutoMLd9a7c24f414, id=AutoMLd9a7c24f414:1, version=1, tags={}, properties={})


## Deploy Webservice

In [34]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

# Download the scoring script stored with the best run; it becomes the service entry script.
script_file_name = f"{output_dir}/score.py"
best_auto_run.download_file(
    name="outputs/scoring_file_v_1_0_0.py",
    output_file_path=script_file_name,
)

# The service runs the scoring script inside the environment of the best run.
inference_config = InferenceConfig(
    entry_script=script_file_name,
    environment=best_auto_run.get_environment(),
)

# Azure Container Instance sizing and metadata for the deployment.
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=2,
    memory_gb=2,
    tags={"type": "automl_classification"},
    description="Automl Classification Service",
)

# The service is named after the lower-cased model name.
aci_service_name = model_name.lower()
print(aci_service_name)

aci_service = Model.deploy(
    workspace=ws,
    name=aci_service_name,
    models=[registered_model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
aci_service.wait_for_deployment(show_output=True)

print(aci_service.scoring_uri)
print(aci_service.state)

automld9a7c24f414
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2023-08-05 16:48:48+00:00 Creating Container Registry if not exists.
2023-08-05 16:48:49+00:00 Use the existing image.
2023-08-05 16:48:50+00:00 Submitting deployment to compute.
2023-08-05 16:48:54+00:00 Checking the status of deployment automld9a7c24f414..
2023-08-05 16:50:56+00:00 Checking the status of inference endpoint automld9a7c24f414.
Succeeded
ACI service creation operation finished, operation "Succeeded"
http://938bcec3-9ea5-46e6-8d92-062ce25c5108.westus2.azurecontainer.io/score
Healthy


### Print the logs of the web service

In [35]:
# Print the logs rather than leaving them as the cell's bare last expression: a bare
# string is shown as its repr, with newlines escaped, which makes the log unreadable.
print(aci_service.get_logs())



### Send a request to the web service

In [38]:
from numpy import array
import requests

# Serialize the test features as JSON records and wrap them in the request payload.
X_test_json = X_test.to_json(orient="records")
data = f'{{"data": {X_test_json}, "method": "predict"}}'
headers = {"Content-Type": "application/json"}

print("web service uri: ", aci_service.scoring_uri)

# Ground-truth labels as an array, for comparison with the service's predictions.
actual = array(y_test)

# sample test data
X_test.head()

web service uri:  http://938bcec3-9ea5-46e6-8d92-062ce25c5108.westus2.azurecontainer.io/score


Unnamed: 0,host_response_rate,host_identity_verified,host_total_listings_count,is_location_exact,property_type,accommodates,price,minimum_nights,number_of_reviews,review_scores_rating,instant_bookable,cancellation_policy,reviews_per_month
0,100,0,1,1,0,2,5000,15,0,100.0,1,2,1.0
1,100,1,5,0,0,4,25600,2,5,96.0,1,3,3.06
2,98,1,5,1,8,16,20000,1,82,95.0,0,2,4.32
3,100,1,2,1,0,2,25000,7,2,80.0,0,2,0.08
4,93,1,17,1,0,1,3900,30,1,100.0,0,2,0.43


In [40]:
# Send the test payload to the deployed web service.
resp = requests.post(aci_service.scoring_uri, data, headers=headers)
# Fail on an HTTP error status here rather than on a confusing JSON error further down.
resp.raise_for_status()

# The body is decoded twice — presumably because the scoring script returns a
# JSON-encoded string; confirm against the downloaded score.py.
y_pred = json.loads(json.loads(resp.text))["result"]

# Evaluate the web service's predictions (y_pred). The earlier `ypred` variable holds
# the LOCAL model's predictions and would only reproduce the local confusion matrix.
cm = confusion_matrix(actual, y_pred)
pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,647,51
1,81,118


## Cleanup Resources

In [None]:
# Delete the deployed ACI web service.
aci_service.delete()

# Delete the compute cluster used for training.
compute_target.delete()
