# Hyperparameter Tuning using HyperDrive

In [1]:
import os

from azureml.core import Workspace, Experiment

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

## Dataset

The dataset used in this project is the Kaggle credit card fraud dataset (https://www.kaggle.com/mlg-ulb/creditcardfraud).

This dataset includes features that are the principal components obtained by applying PCA to the underlying data. PCA was carried out to remove any sensitive information. In this experiment, I attempt to identify fraudulent credit card transactions from the given data. This is a particularly interesting challenge because instances of fraud make up only 0.17% of all transactions in the dataset, so the classes are highly imbalanced.

For this notebook to load the dataset, it must be registered in the workspace under the name 'cc-fraud'. Under registered datasets, upload the CSV file and select 'Get headers from the first file'.

In [2]:
# Connect to the workspace described by the saved configuration and open the
# experiment that will own the HyperDrive run.
ws = Workspace.from_config()
experiment_name = 'capstone-hd'

experiment = Experiment(ws, experiment_name)

# Retrieve data from datasets
key = "cc-fraud"
description_text = "Credit card fraud dataset from Kaggle."

try:
    dataset = ws.datasets[key]
except KeyError as err:
    # Fail fast: later cells need `dataset`, so merely printing a message here
    # would only defer the failure to a confusing NameError further down.
    raise RuntimeError(
        f"Dataset '{key}' not found in Azure datasets. "
        "Register the CSV file under that name before running this notebook."
    ) from err

In [3]:
amlcompute_cluster_name = "capstone"

# Reuse the cluster if the workspace already has one with this name;
# otherwise provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=amlcompute_cluster_name,
        provisioning_configuration=compute_config,
    )

# Block on provisioning, asking for one node and giving up after two minutes.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=1,
    timeout_in_minutes=2,
)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Hyperdrive Configuration

The model I chose to tune with HyperDrive is a Random Forest classifier from scikit-learn. Random forests are powerful, and two of the main hyperparameters that determine their performance are the number of trees in the forest (n_estimators) and the maximum depth of each tree (max_depth).

To run HyperDrive I used the newer 'ScriptRunConfig' approach. With this approach, you first define the training environment from the provided 'conda_dependencies.yml'. Next, the dataset is passed as a named input to the training script (train.py). For early termination I chose the Bandit policy, which stops runs whose primary metric falls too far behind that of the current best run. I selected random parameter sampling because the ranges of the two hyperparameters being searched are rather large, and random sampling should cover the parameter space relatively evenly.

In [4]:
from azureml.core import Environment

# Build the training environment from the conda specification file that sits
# next to this notebook.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='./conda_dependencies.yml',
)

In [5]:
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import quniform

# Base run configuration: execute train.py on the cluster inside the sklearn
# environment, handing it the registered dataset as the named input 'ccfraud'.
src = ScriptRunConfig(source_directory=".",
                      script='train.py',
                      arguments=['--input_data', dataset.as_named_input('ccfraud')],
                      compute_target=compute_target,
                      environment=sklearn_env)

early_termination_policy = BanditPolicy(slack_factor=0.2, evaluation_interval=1, delay_evaluation=5)

# Both hyperparameters are sampled on an integer-stepped grid (q=1).
# max_depth starts at 1 rather than 0: scikit-learn's RandomForestClassifier
# rejects max_depth=0, so sampling it would only produce failed child runs.
param_sampling = RandomParameterSampling({
    "--n_estimators": quniform(10, 1000, 1),
    "--max_depth": quniform(1, 100, 1)})


# NOTE(review): "auc" must match the metric name that train.py logs - confirm
# against the training script.
hyperdrive_run_config = HyperDriveConfig(run_config=src,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="auc",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4)

In [6]:
# Launch the hyperparameter sweep under the experiment; the returned run
# handle is what the following cells monitor and query.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

## Run Details

In [7]:
from azureml.widgets import RunDetails

# Show the monitoring widget for the HyperDrive run.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

In [12]:
# Wait for the sweep to finish first: on a fresh "Restart & Run All" this cell
# would otherwise rank child runs while they are still in progress.
hyperdrive_run.wait_for_completion(show_output=False)
hyperdrive_run.get_children_sorted_by_primary_metric(top=1)


[{'run_id': 'HD_7074c07e-5f36-4d6d-8060-ab1551527ec0_1',
  'hyperparameters': '{"--max_depth": 16.0, "--n_estimators": 716.0}',
  'best_primary_metric': 0.992253764874005,
  'status': 'Completed'}]

In [None]:
# Register best model
# Register from the best child run rather than from the parent HyperDrive run:
# the parent only orchestrates the sweep, while each child run executes train.py.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    raise RuntimeError("No completed child run reported the primary metric; nothing to register.")

# NOTE(review): assumes train.py saves the fitted model under ./outputs -
# confirm, and narrow model_path to the exact file if so.
model = best_run.register_model(model_name='capstonehd', model_path='outputs')

In [None]:
# Clean up and shut down compute cluster
# Deletes the cluster referenced by `compute_target`. Note that this also
# removes a cluster that already existed before the notebook ran (the
# 'Found existing cluster' path), not only one created by this notebook.
compute_target.delete()