# Hyperparameter Tuning using HyperDrive


In [None]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

## Dataset

### Overview

The "Airbnb for Boston with fraud detection" dataset was downloaded from Kaggle using the following link:

https://www.kaggle.com/datasets/hawkingcr/airbnb-for-boston-with-fraud-detection/download?datasetVersionNumber=1

The downloaded file was saved as "output.csv" in the "data" directory. The dataset aims to classify whether an Airbnb listing is a fraud or not.

A notebook file named "data_process.ipynb" was created to perform some pre-processing on the data. Firstly, a correlation analysis was conducted with the target column "fraud" to identify and remove some non-significant features. Next, the data was split into "train.csv" and "test.csv" sets, and the balance of the training data was examined. Due to the class imbalance in the training target, an upsampling technique was applied to address this imbalance.

In [None]:
# Connect to the Azure ML workspace, create the experiment, and preview the
# training and test datasets.
# Workspace and Experiment are imported here because no other cell in the
# notebook brings them into scope.
from azureml.core import Experiment, Workspace

from scripts.data_set import getTrainingDataset, getTestDataset

# Load the workspace from the local Azure ML configuration.
ws = Workspace.from_config()
experiment_name = 'udacity-aml-capstone-hyperparameter'

experiment = Experiment(ws, experiment_name)

# Fetch each dataset through the project helpers and print its first rows
# as a quick sanity check.
train_ds = getTrainingDataset(ws)
print(train_ds.to_pandas_dataframe().head())
test_ds = getTestDataset(ws)
print(test_ds.to_pandas_dataframe().head())

### Create or Attach an AmlCompute cluster

In [None]:
# Attach to the named compute cluster, creating it when it does not exist.
# These names are imported here because no other cell in the notebook brings
# them into scope.
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = "my-cluster"

try:
    # Reuse the cluster when one with this name is already in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing cluster, use it.")
except ComputeTargetException:
    # Lookup failed: provision a new cluster of up to 4 STANDARD_D2_V2 nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2", max_nodes=4
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Executed on both paths (existing or newly created cluster).
compute_target.wait_for_completion(show_output=True)

## Hyperdrive Configuration

A Logistic Regression model is employed here. There are two tuning parameters: the inverse of regularization strength (C) and the maximum number of iterations (max_iter). Random sampling was chosen over Grid sampling because it handles both continuous and discrete parameters, and over Bayesian sampling for efficiency.

For the early termination policy, an aggressive BanditPolicy is chosen with a small slack_factor of 0.1.

Since the data is imbalanced, "AUC_weighted" is chosen as the primary metric.


In [None]:
# Build the HyperDrive configuration: early-termination policy, search space,
# training environment, script run configuration, and the sweep settings.

# Early termination policy: Bandit policy with evaluation_interval=2 and
# slack_factor=0.1.
early_termination_policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Search space, sampled at random:
#   --C         drawn uniformly from [0.5, 1.5]
#   --max_iter  one of 75, 100, 125
param_sampling = RandomParameterSampling(
    {
        '--C': uniform(0.5, 1.5),
        '--max_iter': choice(75, 100, 125),
    }
)

# Make sure the training-script directory exists; exist_ok=True makes this a
# no-op when the directory is already there.
os.makedirs("./scripts", exist_ok=True)

# Environment for the training run, built from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# Script run configuration for the training script. The variable keeps the
# name `estimator`, but the object is a ScriptRunConfig.
estimator = ScriptRunConfig(
    source_directory='./scripts',
    script='logistic_regression.py',
    compute_target=compute_target,
    environment=sklearn_env,
)

# Combine the run configuration, hyperparameter sampler, and policy.
# The sweep maximizes AUC_weighted over at most 20 runs, 4 at a time.
hyperdrive_run_config = HyperDriveConfig(
    run_config=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='AUC_weighted',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)


In [None]:
# Submit the HyperDrive sweep to the experiment. The configuration object
# built in the previous cell is named `hyperdrive_run_config`.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

## Run Details


In [None]:
# Show the run-details widget for the submitted HyperDrive run.
run_widget = RunDetails(hyperdrive_run)
run_widget.show()

## Best Model


In [None]:
# Wait for the sweep to finish, verify it completed, then pick the best run.

hyperdrive_run.wait_for_completion(show_output=True)

# Stop here unless the sweep reports a "Completed" status.
final_status = hyperdrive_run.get_status()
assert final_status == "Completed"

best_run = hyperdrive_run.get_best_run_by_primary_metric()


In [None]:
# Print the best run's environment, file names, and properties, in that order.
# Each getter is called right before its result is printed.
best_run_getters = (
    best_run.get_environment,
    best_run.get_file_names,
    best_run.get_properties,
)
for getter in best_run_getters:
    print(getter())


In [None]:
# Collect the best run's metrics and download its trained model locally.
import joblib

# Create the local output directory; exist_ok=True makes this a no-op when
# the directory is already there.
output_dir = "./outputs"
os.makedirs(output_dir, exist_ok=True)

best_metrics = best_run.get_metrics()

# Download the model file from the run's outputs into the local directory.
best_model_name = os.path.join(output_dir, "best_model.pkl")
best_run.download_file("outputs/model.pkl", best_model_name)


In [None]:
# Load the downloaded best-model file from disk for local use.
best_model = joblib.load(best_model_name)

In [None]:
# Print the best run's metrics and the loaded model, one per line.
print(best_metrics, best_model, sep="\n")


### Prepare test data

In [None]:
# Load the held-out test set and split it into features and target.
import pandas as pd
from sklearn.metrics import confusion_matrix

src_dir = "./data"
test_file = f"{src_dir}/test.csv"

# Keep only the rows whose 'fraud' label is present.
df_test = pd.read_csv(test_file).dropna(subset=['fraud'])

# Features are every column except the target; the target is 'fraud'.
X_test = df_test.drop(columns=['fraud'])
y_test = df_test['fraud']


### Predict with best model

In [None]:
# Score the test features with the locally loaded model and build the
# confusion matrix against the true labels.
ypred = best_model.predict(X_test)
cm = confusion_matrix(y_test, ypred)

# As the cell's last expression, the styled frame becomes the cell output.
cm_frame = pd.DataFrame(cm)
cm_frame.style.background_gradient(cmap='Blues', low=0, high=0.9)

## Model Registration


In [None]:
# Register the best run's model in the workspace under `model_name`.
model_name = "airbnb-boston-hyperparameter"

# NOTE(review): an earlier cell downloads "outputs/model.pkl" from this run,
# while the path registered here is "outputs/model" - confirm this path
# exists in the run's outputs.
registered_model = best_run.register_model(
    model_name=model_name,
    model_path='outputs/model',
)

print("registered model: ", registered_model)


## Webservice Deployment

In [None]:
# Deploy the registered model as an ACI web service and report its endpoint.
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

# Inference reuses the best run's environment with the scoring entry script.
script_file_name = "./scripts/score.py"
inference_config = InferenceConfig(
    environment=best_run.get_environment(),
    entry_script=script_file_name,
)

# Container sizing and descriptive metadata for the ACI deployment.
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=2,
    memory_gb=2,
    tags={"type": "logistic regression"},
    description="Logistic Regression Classification",
)

# The service name is the lower-cased model name.
aci_service_name = model_name.lower()
print(aci_service_name)

aci_service = Model.deploy(
    workspace=ws,
    name=aci_service_name,
    models=[registered_model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
aci_service.wait_for_deployment(show_output=True)

# Print the scoring endpoint, then the service state.
print(aci_service.scoring_uri, aci_service.state, sep="\n")

### Send a request to the web service

In [None]:
# Send the test features to the deployed web service and evaluate the
# predictions it returns against the true labels.
from numpy import array
import requests
import json

# Build the request body as a real JSON document instead of concatenating
# strings. The records still come from pandas' own serializer, so the values
# are encoded the same way as before.
payload = {
    "data": json.loads(X_test.to_json(orient="records")),
    "method": "predict",
}
data = json.dumps(payload)
headers = {"Content-Type": "application/json"}

resp = requests.post(aci_service.scoring_uri, data, headers=headers)
# Fail with the HTTP error itself rather than a confusing JSON decode error.
resp.raise_for_status()

# The body is decoded twice: the response text is a JSON string whose content
# is itself JSON (presumably score.py returns json.dumps(...) - verify).
y_pred = json.loads(json.loads(resp.text))["result"]

actual = array(y_test)
print(len(y_pred), " ", len(actual))

# Evaluate the web-service predictions (y_pred), not the local model's
# predictions (ypred) computed in an earlier cell.
cm = confusion_matrix(actual, y_pred)

pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

### Print the logs of the web service

In [None]:
# Fetch the web service's logs; as the cell's last expression, the returned
# value is shown as the cell output.
aci_service.get_logs()

## Cleanup Resources

In [None]:
# Delete the web service first, then the compute cluster.
for resource in (aci_service, compute_target):
    resource.delete()