# Hyperparameter Tuning using HyperDrive


In [1]:
import os
import warnings

from azureml.core import Workspace, Experiment
from azureml.core import Environment, ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the SDK and
# scikit-learn — consider narrowing the filter if those matter.
warnings.filterwarnings('ignore')

## Dataset

### Overview

The "Airbnb for Boston with fraud detection" data was downloaded from Kaggle using the following link:

https://www.kaggle.com/datasets/hawkingcr/airbnb-for-boston-with-fraud-detection/download?datasetVersionNumber=1

The downloaded file was saved as "output.csv" in the "data" directory. The dataset aims to classify whether an Airbnb listing is a fraud or not.

A notebook file named "data_process.ipynb" was created to perform some pre-processing on the data. Firstly, a correlation analysis was conducted with the target column "fraud" to identify and remove some non-significant features. Next, the data was split into "train.csv" and "test.csv" sets, and the balance of the training data was examined. Due to the class imbalance in the training target, an upsampling technique was applied to address this imbalance.

A script file named "data_set.py" was created in the scripts directory. It defines two functions, getTrainingDataset() and getTestDataset(), which register the training and test data as datasets in the workspace.

### Get Training and Test Datasets

In [2]:
from scripts.data_set import getTrainingDataset, getTestDataset

# Connect to the workspace and open the experiment that will own the
# HyperDrive run.
experiment_name = 'udacity-aml-capstone-hyperparameter'
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name=experiment_name)

# Obtain the training and test datasets through the project helpers.
train_ds = getTrainingDataset(ws)
test_ds = getTestDataset(ws)

found existing traing dataset
Validating arguments.
Arguments validated.
Uploading file to airbnb_boston
Uploading an estimated of 2 files
Uploading ./tmp_dir/test.csv
Uploaded ./tmp_dir/test.csv, 1 files out of an estimated total of 2
Uploading ./tmp_dir/train.csv
Uploaded ./tmp_dir/train.csv, 2 files out of an estimated total of 2
Uploaded 2 files
Creating new dataset
datastore test data path:  airbnb_boston/test.csv


In [3]:
# Preview the first five rows of the training dataset.
train_ds.to_pandas_dataframe().head(5)

Unnamed: 0,host_response_rate,host_identity_verified,host_total_listings_count,is_location_exact,property_type,accommodates,price,minimum_nights,number_of_reviews,review_scores_rating,instant_bookable,cancellation_policy,reviews_per_month,fraud
0,95,1,3,1,8,2,6500,2,8,93.0,0,1,0.63,1
1,100,1,1,1,0,8,50000,1,88,98.0,0,1,4.2,1
2,100,1,1,1,8,2,9000,1,192,95.0,0,1,5.58,1
3,90,1,1,1,0,2,11500,1,54,88.0,1,2,3.58,1
4,92,0,8,1,2,6,27500,2,29,91.0,1,2,0.72,1


In [4]:
# Preview the first five rows of the test dataset.
test_ds.to_pandas_dataframe().head(5)

Unnamed: 0,host_response_rate,host_identity_verified,host_total_listings_count,is_location_exact,property_type,accommodates,price,minimum_nights,number_of_reviews,review_scores_rating,instant_bookable,cancellation_policy,reviews_per_month,fraud
0,100,0,1,1,0,2,5000,15,0,100.0,1,2,1.0,0
1,100,1,5,0,0,4,25600,2,5,96.0,1,3,3.06,0
2,98,1,5,1,8,16,20000,1,82,95.0,0,2,4.32,1
3,100,1,2,1,0,2,25000,7,2,80.0,0,2,0.08,0
4,93,1,17,1,0,1,3900,30,1,100.0,0,2,0.43,0


### Create or Attach an AmlCompute cluster

In [5]:
cluster_name = "my-cluster"

# Reuse the named cluster when the workspace already has it; otherwise
# provision a new one. The except clause must name an importable
# ComputeTargetException, or a missing cluster surfaces as a NameError
# instead of falling through to the provisioning branch.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing cluster, use it.")
except ComputeTargetException:
    print("Cluster not found, creating a new one.")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2", max_nodes=4
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Block until the cluster (existing or newly created) reports a final
# provisioning state, echoing progress into the cell output.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Hyperdrive Configuration

A logistic regression model is employed here. I picked four hyperparameters to tune. Two are related to regularization: the inverse of regularization strength and the penalty term. 
Two are related to optimization: the solver (optimization method) and the maximum number of iterations allowed for the solver to converge.

To tune the parameters, I picked the random sampling method, which handles both continuous and discrete parameters efficiently.

For the early termination policy, an aggressive BanditPolicy is chosen with a small slack_factor of 0.1.


In [6]:
# Early termination: a BanditPolicy applied every 2 evaluation intervals
# with a slack factor of 0.1.
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Search space; one value per key is drawn at random for each child run.
# NOTE(review): scikit-learn accepts penalty='l1' only with the 'liblinear'
# and 'saga' solvers, and 'newton-cholesky' needs scikit-learn >= 1.2 —
# confirm how logistic_regression.py treats the other sampled combinations.
param_sampling = RandomParameterSampling(
    {
        '--C': uniform(0.5, 1.5),
        '--max_iter': choice(75, 100, 125),
        '--penalty': choice('l1', 'l2'),
        '--solver': choice(
            'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'
        ),
    }
)

# Training environment built from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='hyperdrive_env.yml',
)

# Despite its name, `estimator` is a ScriptRunConfig describing how a single
# training run executes: which script, on which compute, in which environment.
estimator = ScriptRunConfig(
    source_directory='./scripts',
    script='logistic_regression.py',
    compute_target=compute_target,
    environment=sklearn_env,
)

# Combine the run configuration, the sampler and the termination policy into
# the sweep definition: maximize 'Accuracy' over at most 20 runs, 4 at a time.
hyperdrive_run_config = HyperDriveConfig(
    run_config=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)


In [7]:
# Launch the hyperparameter sweep under the experiment.
hyperdrive_run = experiment.submit(hyperdrive_run_config)

## Run Details


In [8]:
# Render the Azure ML widget that tracks the HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model


In [9]:
# Get best run

# Block until the sweep has finished, streaming progress into the cell output.
hyperdrive_run.wait_for_completion(show_output=True)

# Fail here, with the actual status in the message, if the sweep did not
# complete — the later cells are meaningless otherwise.
run_status = hyperdrive_run.get_status()
assert run_status == "Completed", "HyperDrive run ended with status %r" % run_status

# get_best_run_by_primary_metric() returns None when no child run logged the
# primary metric; catch that here rather than as an AttributeError later on.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
assert best_run is not None, "No child run reported the primary metric"


RunId: HD_a0b33230-5378-46db-b0c9-24e17caf10a5
Web View: https://ml.azure.com/runs/HD_a0b33230-5378-46db-b0c9-24e17caf10a5?wsid=/subscriptions/1b944a9b-fdae-4f97-aeb1-b7eea0beac53/resourcegroups/aml-quickstarts-240304/workspaces/quick-starts-ws-240304&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Execution Summary
RunId: HD_a0b33230-5378-46db-b0c9-24e17caf10a5
Web View: https://ml.azure.com/runs/HD_a0b33230-5378-46db-b0c9-24e17caf10a5?wsid=/subscriptions/1b944a9b-fdae-4f97-aeb1-b7eea0beac53/resourcegroups/aml-quickstarts-240304/workspaces/quick-starts-ws-240304&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254



In [10]:
# Display the metrics logged by the best child run.
print("best metrics: ", best_run.get_metrics())


best metrics:  {'Regularization Strength:': 0.9591536749171734, 'Max iterations:': 75, 'Accuracy': 0.8450390189520625}


In [11]:
# List the files stored with the best run.
print(best_run.get_file_names())


['azureml-logs/20_image_build_log.txt', 'logs/azureml/dataprep/0/backgroundProcess.log', 'logs/azureml/dataprep/0/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/0/rslex.log.2023-08-15-22', 'outputs/model/model.pkl', 'system_logs/cs_capability/cs-capability.log', 'system_logs/hosttools_capability/hosttools-capability.log', 'system_logs/lifecycler/execution-wrapper.log', 'system_logs/lifecycler/lifecycler.log', 'system_logs/metrics_capability/metrics-capability.log', 'system_logs/snapshot_capability/snapshot-capability.log', 'user_logs/std_log.txt']


In [12]:
import joblib

output_dir = "./outputs"
# exist_ok=True creates the directory only when it is missing, so the cell
# can be re-run safely without a separate existence check.
os.makedirs(output_dir, exist_ok=True)

# Download the best run's pickled model and save it under a local name.
best_model_name = output_dir + "/best_hyperdrive_model.pkl"
best_run.download_file("outputs/model/model.pkl", best_model_name)


In [13]:
# Load the downloaded model file back into memory.
# NOTE(review): joblib.load unpickles the file, so only load artifacts
# produced by runs you trust.
best_model = joblib.load(filename=best_model_name)

In [14]:

# Print the loaded model to inspect its settings.
print(best_model)


LogisticRegression(C=0.9591536749171734, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=75, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


### Prepare test data

In [15]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Read the local test split and keep only the rows that have a 'fraud' label.
src_dir = "./data"
test_file = src_dir + "/test.csv"
df_test = pd.read_csv(test_file)
df_test = df_test.dropna(subset=['fraud'])

# Separate the target column from the feature columns.
y_test = df_test['fraud']
X_test = df_test.drop(columns=['fraud'])


### Predict with best model

In [16]:
# Predict on the test features and tabulate the confusion matrix
# (rows: true labels, columns: predicted labels).
ypred = best_model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=ypred)

# Show the matrix as a DataFrame shaded with a blue gradient.
pd.DataFrame(data=cm).style.background_gradient(
    cmap='Blues',
    low=0,
    high=0.9,
)

Unnamed: 0,0,1
0,612,86
1,53,146


## Model Registration


In [17]:
model_name = "airbnb-boston-hyperparameter"
description = "logistic_regression model to predict airbnb fraud listing"

# Register the contents of the best run's outputs/model folder as a model.
registered_model = best_run.register_model(
    model_name=model_name,
    model_path="outputs/model",
    description=description,
)

print("registered model: ", registered_model)


registered model:  Model(workspace=Workspace.create(name='quick-starts-ws-240304', subscription_id='1b944a9b-fdae-4f97-aeb1-b7eea0beac53', resource_group='aml-quickstarts-240304'), name=airbnb-boston-hyperparameter, id=airbnb-boston-hyperparameter:1, version=1, tags={}, properties={})


## Cleanup Resources

In [None]:
# Delete the compute cluster used for the sweep.
# NOTE(review): when the cluster was found rather than created by this
# notebook, this removes a resource others may rely on — confirm before running.
compute_target.delete()