# Hyperparameter Tuning using HyperDrive


In [3]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

## Dataset

### Overview

The "Airbnb for Boston with fraud detection" dataset was downloaded from Kaggle using the following link:

https://www.kaggle.com/datasets/hawkingcr/airbnb-for-boston-with-fraud-detection/download?datasetVersionNumber=1

The downloaded file was saved as "output.csv" in the "data" directory. The dataset aims to classify whether an Airbnb listing is a fraud or not.

A notebook file named "data_process.ipynb" was created to perform some pre-processing on the data. First, a correlation analysis against the target column "fraud" was conducted to identify and remove non-significant features. Next, the data was split into "train.csv" and "test.csv" sets, and the class balance of the training data was examined. Because the training target was imbalanced, an upsampling technique was applied to address the imbalance.

In [4]:
from scripts.data_set import getTrainingDataset, getTestDataset

# Load the workspace from its saved configuration and open the experiment
# that will own the HyperDrive run.
ws = Workspace.from_config()

experiment_name = 'udacity-aml-capstone-hyperparameter'
experiment = Experiment(workspace=ws, name=experiment_name)

# Fetch the training split and preview its first rows.
train_ds = getTrainingDataset(ws)
train_preview = train_ds.to_pandas_dataframe().head()
print(train_preview)

# Fetch the test split and preview its first rows.
test_ds = getTestDataset(ws)
test_preview = test_ds.to_pandas_dataframe().head()
print(test_preview)

found existing traing dataset
   host_response_rate  host_identity_verified  host_total_listings_count  \
0                  95                       1                          3   
1                 100                       1                          1   
2                 100                       1                          1   
3                  90                       1                          1   
4                  92                       0                          8   

   is_location_exact  property_type  accommodates  price  minimum_nights  \
0                  1              8             2   6500               2   
1                  1              0             8  50000               1   
2                  1              8             2   9000               1   
3                  1              0             2  11500               1   
4                  1              2             6  27500               2   

   number_of_reviews  review_scores_rating  instant_book

### Create or Attach an AmlCompute cluster

In [5]:
# Imported in this cell because the exception class is not part of the
# notebook's top-level imports; without it the `except` clause below raises
# NameError (instead of provisioning a cluster) when the lookup fails.
from azureml.core.compute_target import ComputeTargetException

# Name of the AmlCompute cluster to reuse, or to create if it is missing.
cluster_name = "my-cluster"

try:
    # Attach to the cluster if it already exists in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing cluster, use it.")
except ComputeTargetException:
    # Lookup failed: provision a new cluster that can scale up to 4 nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2", max_nodes=4
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Block until the cluster reports its provisioning state.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Hyperdrive Configuration

A Logistic Regression model is employed here. Two hyperparameters are tuned: the inverse of regularization strength (`C`) and the maximum number of iterations (`max_iter`). Random sampling was chosen over Grid sampling because it handles both continuous and discrete hyperparameters, and over Bayesian sampling for efficiency.

For the early termination policy, an aggressive BanditPolicy is chosen with a small slack_factor of 0.1.


In [21]:
# Bandit early-termination policy: slack factor of 0.1, applied every
# 2 evaluation intervals.
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Search space passed to the training script as command-line arguments:
# C is drawn uniformly from [0.5, 1.5]; max_iter is one of three fixed values.
search_space = {
    '--C': uniform(0.5, 1.5),
    '--max_iter': choice(75, 100, 125),
}
param_sampling = RandomParameterSampling(search_space)

# Create the script directory when the working directory has no such entry.
if "scripts" not in os.listdir():
    os.mkdir("./scripts")

# Training environment built from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# Run configuration for a single training job (one hyperparameter sample).
estimator = ScriptRunConfig(
    source_directory='./scripts',
    script='logistic_regression.py',
    compute_target=compute_target,
    environment=sklearn_env,
)

# HyperDrive sweep: maximize 'Accuracy' over at most 20 runs, 4 at a time.
hyperdrive_run_config = HyperDriveConfig(
    run_config=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)


In [22]:
# Submit the HyperDrive configuration to the experiment and keep the returned
# run object for monitoring and for retrieving the best child run.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

## Run Details


In [23]:
# Show the notebook widget for the submitted HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model


In [24]:
# Wait for the sweep to finish (streaming its output), then confirm that it
# ended in the "Completed" state before picking the best child run.
hyperdrive_run.wait_for_completion(show_output=True)

final_status = hyperdrive_run.get_status()
assert final_status == "Completed"

# Child run with the best value of the configured primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()


RunId: HD_909bdc5c-2568-4671-9114-2c666d292693
Web View: https://ml.azure.com/runs/HD_909bdc5c-2568-4671-9114-2c666d292693?wsid=/subscriptions/510b94ba-e453-4417-988b-fbdc37b55ca7/resourcegroups/aml-quickstarts-239768/workspaces/quick-starts-ws-239768&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Execution Summary
RunId: HD_909bdc5c-2568-4671-9114-2c666d292693
Web View: https://ml.azure.com/runs/HD_909bdc5c-2568-4671-9114-2c666d292693?wsid=/subscriptions/510b94ba-e453-4417-988b-fbdc37b55ca7/resourcegroups/aml-quickstarts-239768/workspaces/quick-starts-ws-239768&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254



In [29]:
# Print the metrics logged by the best child run.
best_metrics = best_run.get_metrics()
print("best metrics: ", best_metrics)


best metrics:  {'Regularization Strength:': 0.8182023950907832, 'Max iterations:': 100, 'Accuracy': 0.8461538461538461}


In [30]:
import joblib

# Local folder that receives the downloaded model file.
output_dir = "./outputs"

# Create the folder only if it is missing. `exist_ok=True` replaces the
# separate `os.path.exists(...) == False` check, so there is no gap between
# checking and creating.
os.makedirs(output_dir, exist_ok=True)

# Download the best run's "outputs/model.pkl" file to the local path.
best_model_name = output_dir + "/best_model.pkl"
best_run.download_file("outputs/model.pkl", best_model_name)


In [31]:
# Deserialize the downloaded model file. joblib.load unpickles its input, so
# it should only be used on files from a trusted run.
# NOTE(review): the recorded output of this cell warns that the estimator was
# pickled with scikit-learn 1.3.0 but is loaded under 0.22.1; align the
# training and notebook environments before relying on its predictions.
best_model = joblib.load(best_model_name)

Trying to unpickle estimator LogisticRegression from version 1.3.0 when using version 0.22.1. This might lead to breaking code or invalid results. Use at your own risk.


In [32]:

# Print the loaded estimator to inspect the hyperparameters it carries.
print(best_model)


LogisticRegression(C=0.8182023950907832, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


### Prepare test data

In [33]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Path of the test split on local disk.
src_dir = "./data"
test_file = src_dir + "/test.csv"

# Read the test split and discard rows whose 'fraud' label is missing.
df_test = pd.read_csv(test_file).dropna(subset=['fraud'])

# Separate the label column from the feature columns.
y_test = df_test['fraud']
X_test = df_test.drop(columns=['fraud'])


### Predict with best model

In [34]:
# Predict labels for the test features with the downloaded model.
ypred = best_model.predict(X_test)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=ypred)

# Render the matrix as a table shaded with a blue gradient.
cm_frame = pd.DataFrame(cm)
cm_frame.style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,613,85
1,53,146


## Model Registration


In [38]:
# Name under which the model is registered in the workspace.
model_name = "airbnb-boston-hyperparameter"

# Register the best run's "outputs" path as the model artifact.
registered_model = best_run.register_model(
    model_path="outputs",
    model_name=model_name,
)

print("registered model: ", registered_model)


registered model:  Model(workspace=Workspace.create(name='quick-starts-ws-239768', subscription_id='510b94ba-e453-4417-988b-fbdc37b55ca7', resource_group='aml-quickstarts-239768'), name=airbnb-boston-hyperparameter, id=airbnb-boston-hyperparameter:1, version=1, tags={}, properties={})


## Cleanup Resources

In [None]:
# Delete the compute target used for the sweep now that the notebook is done
# with it.
compute_target.delete()