# Hyperparameter Tuning using HyperDrive


In [None]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os
import warnings
warnings.filterwarnings('ignore')

## Dataset

### Overview

The "Airbnb for Boston with fraud detection" dataset was downloaded from Kaggle using the following link:

https://www.kaggle.com/datasets/hawkingcr/airbnb-for-boston-with-fraud-detection/download?datasetVersionNumber=1

The downloaded file was saved as "output.csv" in the "data" directory. The dataset aims to classify whether an Airbnb listing is a fraud or not.

A notebook file named "data_process.ipynb" was created to perform some pre-processing on the data. First, a correlation analysis against the target column "fraud" was conducted to identify and remove non-significant features. Next, the data was split into "train.csv" and "test.csv" sets, and the class balance of the training data was examined. Because the training target was imbalanced, an upsampling technique was applied to address the imbalance.

A script file named "data_set.py" was created in the scripts directory. It defines two functions, getTrainingDataset() and getTestDataset(), which register the training and test data as datasets in the workspace.

### Get Training and Test Datasets

In [None]:
from scripts.data_set import getTestDataset, getTrainingDataset

# Load the workspace from its saved configuration and open the experiment
# that the HyperDrive runs are submitted to.
experiment_name = 'udacity-aml-capstone-hyperparameter'
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name=experiment_name)

# Obtain the training and test datasets through the project's helpers
# (training first, then test).
train_ds, test_ds = getTrainingDataset(ws), getTestDataset(ws)

In [None]:
# Preview the first rows of the training dataset.
train_frame = train_ds.to_pandas_dataframe()
train_frame.head()

In [None]:
# Preview the first rows of the test dataset.
test_frame = test_ds.to_pandas_dataframe()
test_frame.head()

### Create or Attach an AmlCompute cluster

In [None]:
# Imported in this cell because the notebook's import cell does not provide
# it; without it the ``except`` clause below fails with NameError whenever the
# cluster does not exist yet.
from azureml.core.compute_target import ComputeTargetException

cluster_name = "my-cluster"

# Reuse the cluster when it already exists in the workspace; otherwise
# provision a new one with the configuration below.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing cluster, use it.")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2", max_nodes=4
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Wait for the cluster operation to finish, streaming progress output.
compute_target.wait_for_completion(show_output=True)

## Hyperdrive Configuration

A logistic regression model is used here. I picked four tuning parameters. Two are related to regularization: the inverse of the regularization strength (C) and the penalty term.
Two are related to optimization: the solver (optimization method) and the maximum number of iterations allowed for the solver to converge.

To tune the parameters, I picked the random sampling method, which handles both continuous and discrete parameters efficiently.

For the early termination policy, an aggressive BanditPolicy is chosen, with a small slack_factor of 0.1.


In [None]:
# Early termination: Bandit policy with a slack factor of 0.1, evaluated at
# an interval of 2.
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Search space; the keys look like command-line flags of the training script.
# NOTE(review): scikit-learn's lbfgs, newton-cg, newton-cholesky and sag
# solvers presumably reject penalty='l1', so some sampled combinations may
# fail unless logistic_regression.py guards against them -- confirm.
search_space = {
    '--C': uniform(0.5, 1.5),
    '--max_iter': choice(75, 100, 125),
    '--penalty': choice('l1', 'l2'),
    '--solver': choice(
        'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'
    ),
}
param_sampling = RandomParameterSampling(search_space)

# Environment for the training run, built from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='hyperdrive_env.yml',
)

# Run configuration: which script to run, where, and in which environment.
estimator = ScriptRunConfig(
    source_directory='./scripts',
    script='logistic_regression.py',
    compute_target=compute_target,
    environment=sklearn_env,
)

# HyperDrive configuration combining the run configuration, the sampler and
# the early termination policy; 'Accuracy' is maximized over at most 20 runs,
# 4 of them concurrently.
hyperdrive_run_config = HyperDriveConfig(
    run_config=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)


In [22]:
# Submit the HyperDrive configuration to the experiment and keep the returned
# run handle for the cells below.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

## Run Details


In [None]:
# Display the run-details widget for the HyperDrive run.
run_widget = RunDetails(hyperdrive_run)
run_widget.show()

## Best Model


In [None]:
# Get best run

# Wait for the whole sweep to finish before picking the best child run.
hyperdrive_run.wait_for_completion(show_output=True)

# Raise an explicit error instead of using ``assert``: assertions are stripped
# under ``python -O`` and give no context about the actual status.
final_status = hyperdrive_run.get_status()
if final_status != "Completed":
    raise RuntimeError(
        f"HyperDrive run ended with status {final_status!r}; expected 'Completed'"
    )

best_run = hyperdrive_run.get_best_run_by_primary_metric()
# No run is returned when no child run logged the primary metric; stop here
# with a clear message rather than failing later on a None object.
if best_run is None:
    raise RuntimeError(
        "No child run logged the primary metric; cannot select a best run"
    )


In [None]:
# Show the metrics recorded by the best run.
best_metrics = best_run.get_metrics()
print("best metrics: ", best_metrics)


In [None]:
# List the file names attached to the best run.
run_files = best_run.get_file_names()
print(run_files)
# Further inspection, kept disabled:
# print("run env: ",best_run.get_environment())
# print(best_run.get_details())


In [None]:
# joblib is used by the next cell to load the downloaded model.
import joblib

output_dir = "./outputs"
# Create the directory when it is missing; exist_ok=True replaces the separate
# existence check and avoids the gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

# download and save best model
best_model_name = os.path.join(output_dir, "best_hyperdrive_model.pkl")
best_run.download_file("outputs/model/model.pkl", best_model_name)


In [None]:
# Load the downloaded best model from its local pickle file.
best_model = joblib.load(best_model_name)

In [None]:

# Print the loaded model object.
print(best_model)


### Prepare test data

In [33]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Read the test split from the local data directory.
src_dir = "./data"
test_file = f"{src_dir}/test.csv"
df_test = pd.read_csv(test_file)

# Keep only rows that have a 'fraud' label, then separate the label column
# from the feature columns.
df_test = df_test.loc[df_test['fraud'].notna()]
y_test = df_test['fraud']
X_test = df_test.drop(columns=['fraud'])


### Predict with best model

In [None]:
# Predict on the test features and compare against the true labels.
ypred = best_model.predict(X_test)
cm = confusion_matrix(y_test, ypred)

# Render the confusion matrix as a table shaded with the 'Blues' colormap.
cm_frame = pd.DataFrame(cm)
cm_frame.style.background_gradient(cmap='Blues', low=0, high=0.9)

## Model Registration


In [None]:
# Register the best run's "outputs/model" folder as a model.
model_name = "airbnb-boston-hyperparameter"
description = "logistic_regression model to predict airbnb fraud listing"

registered_model = best_run.register_model(
    model_name=model_name,
    model_path="outputs/model",
    description=description,
)

print("registered model: ", registered_model)


## Deploy the model in ACI

First, we create "score.py" under the scripts directory:

%%writefile scripts/score.py
import json
import logging
import os
# import pickle
# import numpy as np
import pandas as pd
import joblib

import azureml.automl.core

def init():
    """Load the serialized model into the module-level ``model`` global.

    The model file is read from ``model/model.pkl`` under the directory named
    by the ``AZUREML_MODEL_DIR`` environment variable. A failure while loading
    is logged with its traceback and then re-raised.
    """
    global model

    model_root = os.getenv('AZUREML_MODEL_DIR')
    model_path = os.path.join(model_root, 'model/model.pkl')

    try:
        logging.info("Loading model from path.")
        model = joblib.load(model_path)
        logging.info("Loading successful.")
    except Exception:
        # Record the traceback, then let the error propagate to the caller.
        logging.exception("Exception on load model")
        raise

def run(data, method="predict"):
    """Score a request with the loaded model and return a JSON string.

    Args:
        data: Either a raw JSON request body (``str``/``bytes``) or input that
            the model accepts directly. When the body decodes to a JSON
            object, the rows are taken from its ``"data"`` field (or the
            object itself if that field is absent) and an optional
            ``"method"`` field selects the prediction method. Any other JSON
            value is used as the input itself.
        method: ``"predict"`` (default) or ``"predict_proba"``. Overridden by
            a ``"method"`` field in a JSON object body.

    Returns:
        A JSON string: ``{"result": [...]}`` on success, or
        ``{"error": "<message>"}`` if anything goes wrong (invalid JSON,
        unknown method, model failure).
    """
    try:
        # A raw request body arrives as text; decode it so the model is not
        # handed a string.
        if isinstance(data, (str, bytes, bytearray)):
            payload = json.loads(data)
            if isinstance(payload, dict):
                method = payload.get("method", method)
                data = payload["data"] if "data" in payload else payload
            else:
                data = payload

        # Row-oriented records (a list of dicts) are turned into a DataFrame
        # so column names travel with the values.
        if isinstance(data, list) and data and all(
            isinstance(row, dict) for row in data
        ):
            data = pd.DataFrame(data)

        if method == "predict_proba":
            result = model.predict_proba(data)
        elif method == "predict":
            result = model.predict(data)
        else:
            raise Exception(f"Invalid predict method argument received ({method})")

        if isinstance(result, pd.DataFrame):
            result = result.values
        return json.dumps({"result": result.tolist()})
    except Exception as exc:
        # Report failures in the response body instead of raising.
        return json.dumps({"error": str(exc)})


### Create myenv.yml

Create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. 


In [None]:
from azureml.core.runconfig import CondaDependencies

# Packages for the scoring image, grouped by the installer that provides them.
conda_packages = ('numpy', 'pandas')
pip_packages = ('scikit-learn', 'azureml-defaults')
# NOTE(review): scripts/score.py also imports azureml.automl.core; presumably
# none of the packages above provides it -- confirm, then either drop that
# import or add the package here.

cd = CondaDependencies.create()
for conda_package in conda_packages:
    cd.add_conda_package(conda_package)
for pip_package in pip_packages:
    cd.add_pip_package(pip_package)
# cd.add_pip_package("protobuf==3.20.1")

# Write the specification to myenv.yml and echo it for inspection.
cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')

print(cd.serialize_to_string())

### Deploy to ACI

Create the inference configuration and deployment configuration and deploy to ACI. 

In [None]:
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig
from azureml.core.model import Model

# Inference setup: the environment comes from myenv.yml and the entry script
# is scripts/score.py.
myenv = Environment.from_conda_specification(name="myenv", file_path="myenv.yml")
inference_config = InferenceConfig(entry_script="scripts/score.py", environment=myenv)

# Container size and metadata for the ACI deployment.
aciconfig = AciWebservice.deploy_configuration(cpu_cores=2,
                                               memory_gb=2,
                                               tags={'name':'logistic_regression'},
                                               description='log_reg classification')

# Service names may only contain lowercase letters, digits and dashes (no
# underscores), so the name uses a dash.
service = Model.deploy(workspace=ws,
                       name='hyperdrive-model',
                       models=[registered_model],
                       inference_config=inference_config,
                       deployment_config=aciconfig)

# Wait for the deployment to finish, streaming progress, then report the
# resulting service state.
service.wait_for_deployment(show_output=True)
print(service.state)

## Cleanup Resources

In [None]:
# Delete the compute cluster used by this notebook.
compute_target.delete()