# Automated ML

In [1]:
import json
import logging
import os

import joblib
import pandas as pd
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.run import Run
from azureml.exceptions import ComputeTargetException
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails


## Dataset

### Overview

"Airbnb for Boston with fraud detcetion" data was downloaded from Kaggle with the following link:

https://www.kaggle.com/datasets/hawkingcr/airbnb-for-boston-with-fraud-detection/download?datasetVersionNumber=1

The downloaded file was saved as "output.csv" in the "data" directory. The dataset aims to classify whether an Airbnb listing is a fraud or not.

A notebook file named "data_process.ipynb" was created to perform some pre-processing on the data. Firstly, a correlation analysis was conducted with the target column "fraud" to identify and remove some non-significant features. Next, the data was split into "train.csv" and "test.csv" sets, and the balance of the training data was examined. Due to the class imbalance in the training target, an upsampling technique was applied to address this imbalance.

### Create Training Dataset

In [None]:
from scripts.data_set import getTrainingDataset

# Load the workspace from its configuration and open the experiment that will
# hold the AutoML runs.
ws = Workspace.from_config()
experiment_name = 'udacity-aml-capstone-automl'
experiment = Experiment(workspace=ws, name=experiment_name)

# Build the training dataset with the project helper and preview its first rows.
train_ds = getTrainingDataset(ws)
train_preview = train_ds.to_pandas_dataframe().head()
train_preview


### Create or Attach an AmlCompute cluster

In [None]:

cluster_name = "my-cluster"

# Reuse the cluster when it already exists in the workspace; otherwise
# provision a new one with the configuration below.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print("Found existing cluster, use it.")

compute_target.wait_for_completion(show_output=True)


## AutoML Configuration

iteration_timeout_minutes: Time limit in minutes for each iteration. Increase this value for larger datasets that need more time for each iteration. Here it is set to 10 minutes for the relatively small dataset.

experiment_timeout_minutes: Maximum amount of time, in minutes, that all iterations combined can take before the experiment terminates. Here it is set to 30 minutes for the relatively small dataset.

enable_early_stopping: Flag to enable early termination if the score is not improving in the short term.

primary_metric: Since the data is very imbalanced, "AUC_weighted" is chosen here.

featurization: By using auto, the experiment can preprocess the input data (handling missing data, converting text to numeric, etc.)

verbosity: Controls the level of logging.

n_cross_validations: Number of cross-validation folds to perform, since validation data is not specified.

In [4]:

# Settings for the AutoML experiment; unpacked into AutoMLConfig below.
automl_settings = dict(
    iteration_timeout_minutes=10,
    experiment_timeout_minutes=30,
    max_concurrent_iterations=4,
    enable_early_stopping=True,
    primary_metric='AUC_weighted',
    featurization='auto',
    verbosity=logging.INFO,
    n_cross_validations=5,
)

# Classification on the "fraud" label, trained on the compute cluster, with
# ONNX-compatible models enabled so the best model can be exported later.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=train_ds,
    label_column_name="fraud",
    enable_onnx_compatible_models=True,
    **automl_settings
)

In [None]:
# Submit the AutoML configuration to the experiment; the returned run handle is
# used by the following cells.
auto_run = experiment.submit(config=automl_config)

## Run Details


In [None]:
# Show the run-details widget for the submitted AutoML run.
run_details_widget = RunDetails(auto_run)
run_details_widget.show()

## Best Model



In [None]:
# Retrieve best automl model.

# Block until the AutoML run finishes, then stop here if it did not succeed.
auto_run.wait_for_completion(show_output=True)
run_status = auto_run.get_status()
assert run_status == "Completed", "AutoML run ended with status: {}".format(run_status)

# get_output() returns the best child run together with its fitted model.
best_auto_run, best_model = auto_run.get_output()

# Save the best model

# makedirs(exist_ok=True) creates the directory when it is missing and does
# nothing when it already exists, so the cell is safe to re-run.
output_dir = "./outputs"
os.makedirs(output_dir, exist_ok=True)
best_model_file = os.path.join(output_dir, "best_model.pkl")
joblib.dump(best_model, best_model_file)



In [None]:
# Print run properties: output files of the best child run, the environments
# of the parent run and of the best child run, and the child run's properties.

print(best_auto_run.get_file_names())

run_env = auto_run.get_environment()
print(run_env)

best_auto_run_env = best_auto_run.get_environment()
print(best_auto_run_env)

print(best_auto_run.get_properties())


In [None]:
# Print best model properties.

# NOTE(review): best_model comes from auto_run.get_output() and is later used
# with .predict(). get_model_path / get_sas_urls look like methods of a
# registered Model rather than of a fitted estimator — TODO confirm these calls
# actually work on this object.
print(best_model.get_model_path())
print(best_model.get_sas_urls())
print(best_model.print_configuration())

## Predict with best model

In [None]:
from sklearn.metrics import confusion_matrix

# NOTE(review): src_dir is not defined in the cells visible here — confirm
# where it is set; it has to point at the directory that holds test.csv.
test_file = src_dir + "/test.csv";
df_test = pd.read_csv(test_file)
# Keep only the rows that have a value in the "fraud" label column.
df_test = df_test[pd.notnull(df_test['fraud'])]

# Split the test frame into the label column and the feature columns.
y_test = df_test['fraud']
X_test = df_test.drop(['fraud'], axis=1)

# Predictions of the locally retrieved best model on the test features.
ypred = best_model.predict(X_test)


In [None]:
# Confusion matrix of the test labels against the local model's predictions,
# rendered as a colour-graded table.
cm = confusion_matrix(y_test, ypred)
cm_frame = pd.DataFrame(cm)
cm_frame.style.background_gradient(cmap='Blues', low=0, high=0.9)

### Retrieve and save ONNX Model

In [None]:
from azureml.automl.runtime.onnx_convert import OnnxConverter

# Request the ONNX version of the best model from the AutoML run.
bestrun, onnx_mdl = auto_run.get_output(return_onnx_model=True)

# Write the ONNX model into the output directory.
onnx_fl_path = "{}/best_model.onnx".format(output_dir)
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)


### Predict with ONNX model


In [None]:
import sys
# json is already imported in the notebook's first cell.
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants
from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

print(constants.MODEL_RESOURCE_PATH_ONNX)

def get_onnx_res(run):
    """Download the ONNX resource file of ``run`` and return its parsed JSON.

    The file named by ``constants.MODEL_RESOURCE_PATH_ONNX`` is downloaded to
    ``onnx_resource.json`` inside ``output_dir`` and then loaded with ``json``.
    """
    res_path = output_dir + "/onnx_resource.json"
    run.download_file(
        name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path
    )
    with open(res_path) as f:
        result = json.load(f)
    return result


# The inference helper is only used on interpreters older than
# OnnxConvertConstants.OnnxIncompatiblePythonVersion.
if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:
    mdl_bytes = onnx_mdl.SerializeToString()
    onnx_result = get_onnx_res(bestrun)

    onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_result)
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(X_test)
else:
    # This branch runs when the interpreter is at or above the incompatible
    # version, i.e. too new, so the hint must ask for an OLDER Python.
    # pred_onnx stays undefined here; the ONNX confusion-matrix cell that
    # follows cannot run in that case.
    print(
        "The ONNX inference helper requires a Python version lower than",
        OnnxConvertConstants.OnnxIncompatiblePythonVersion,
    )
    

In [None]:
# Confusion matrix of the test labels against the ONNX model's predictions,
# rendered as a colour-graded table.
cm = confusion_matrix(y_test, pred_onnx)
cm_frame = pd.DataFrame(cm)
cm_frame.style.background_gradient(cmap='Blues', low=0, high=0.9)

## Model Registration


In [None]:

# Register the model under the name stored in the best child run's properties.
model_name = best_auto_run.properties["model_name"]
description = "AutoML Model to predict Airbnb fraud listing"
tags = None
registered_model = auto_run.register_model(
    model_name=model_name,
    description=description,
    tags=tags,
)

print(auto_run.model_id)
print("registered model: ", registered_model)

## Deploy Webservice

In [None]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

# Download the scoring script of the best child run into the output directory.
script_file_name = output_dir + "/score.py"
best_auto_run.download_file(
    name="outputs/scoring_file_v_1_0_0.py",
    output_file_path=script_file_name,
)

# The service reuses the environment of the best child run.
inference_config = InferenceConfig(
    environment=best_auto_run.get_environment(),
    entry_script=script_file_name,
)

aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=2,
    memory_gb=2,
    tags={"type": "automl_classification"},
    description="Automl Classification Service",
)

# The service is named after the lower-cased model name.
aci_service_name = model_name.lower()
print(aci_service_name)

aci_service = Model.deploy(
    workspace=ws,
    name=aci_service_name,
    models=[registered_model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
aci_service.wait_for_deployment(show_output=True)

print(aci_service.scoring_uri)
print(aci_service.state)

### Print the logs of the web service

In [None]:
# Fetch the logs of the deployed web service; as the cell's last expression the
# result is displayed as the cell output.
aci_service.get_logs()

### Send a request to the web service

In [None]:
from numpy import array
import requests

# Serialise the test features as a list of records and wrap them in the request
# body together with the "predict" method.
X_test_json = X_test.to_json(orient="records")
data = '{{"data": {}, "method": "predict"}}'.format(X_test_json)
headers = {"Content-Type": "application/json"}

resp = requests.post(aci_service.scoring_uri, data=data, headers=headers)

# The response text is decoded twice: the outer decode yields a JSON string,
# the inner decode yields the object that holds the "result" list.
resp_payload = json.loads(json.loads(resp.text))
y_pred = resp_payload["result"]

# Ground-truth labels as an array, for comparison with the service output.
actual = array(y_test)


In [None]:
# Confusion matrix of the test labels against the predictions returned by the
# web service (y_pred). The local model's predictions (ypred) were already
# evaluated earlier and must not be reused here.
cm = confusion_matrix(actual, y_pred)
pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

## Cleanup Resources

In [None]:
# Delete the deployed web service.
aci_service.delete()

# Delete the compute cluster used for the AutoML run.
compute_target.delete()
