# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import os
import shutil

from azureml.widgets import RunDetails
from azureml.core import ScriptRunConfig
from azureml.core import Environment
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive.parameter_expressions import choice

# Create Experiment

Create experiment to run HyperDrive

In [2]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()

experiment = Experiment(
      workspace=ws, 
      name="creditcardfraud-hyperdrive")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = experiment.start_logging(snapshot_directory=None)



Workspace name: quick-starts-ws-137161
Azure region: southcentralus
Subscription id: b968fb36-f06a-4c76-a15f-afab68ae7667
Resource group: aml-quickstarts-137161


## Dataset

### Overview
The datasets contains transactions made by credit cards in September 2013 by european cardholders.

This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

The dataset was downloaded from [kaggle](https://www.kaggle.com/mlg-ulb/creditcardfraud) and a copy from the dataset was saved into the [git repository](https://github.com/heber-augusto/Nanodegree_Azure_ML_Engineer_CapstoneProject/tree/master/data).


In [3]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
found = False
key = "creditcard"
description_text = "Creditcard dataset"
label = "Class"
if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        print("download data from https://www.kaggle.com/mlg-ulb/creditcardfraud/download and create a dataset")
        # Create AML Dataset and register it into Workspace
        #example_data = 'https://raw.githubusercontent.com/heber-augusto/Nanodegree_Azure_ML_Engineer_CapstoneProject/master/data/creditcard.csv.zip'
        #dataset = Dataset.Tabular.from_delimited_files(example_data)
        #Register Dataset in Workspace
        #dataset = dataset.register(workspace=ws,
        #                           name=key,
        #                           description=description_text)        


df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284806.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.841366,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.958151e-16,-3.14764e-15,...,1.471982e-16,8.042109e-16,5.28245e-16,4.458267e-15,1.426896e-15,1.70164e-15,-3.671606e-16,-1.218152e-16,88.349619,0.001727
std,47488.22833,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.25,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84691.5,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.75,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [4]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Create cluster to run Hyperdrive

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# choose a name for your cluster
cluster_name = "cpu-cluster-hd"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
                                                           min_nodes=1,
                                                           max_nodes=6)

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# can poll for a minimum number of nodes and for a specific timeout. 
# if no min node count is provided it uses the scale settings for the cluster
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster. 
print(compute_target.get_status().serialize())

Found existing compute target
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 1, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-02-04T01:41:35.878000+00:00', 'errors': None, 'creationTime': '2021-02-04T01:26:51.620499+00:00', 'modifiedTime': '2021-02-04T01:27:07.716282+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 1, 'maxNodeCount': 6, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


### Create folder to copy script and csv file

In [6]:
import os
project_folder = './fraud-detection-hyperdrive'
os.makedirs(project_folder, exist_ok=True)

In [7]:
shutil.copy('train.py', project_folder)
shutil.copy('creditcard.csv', project_folder)

'./fraud-detection-hyperdrive/creditcard.csv'

### Create an environment

Define a conda environment YAML file with your training script dependencies and create an Azure ML environment.

In [8]:
%%writefile conda_dependencies.yml

dependencies:
- python=3.6.2
- scikit-learn
- pip:
  - azureml-defaults

Overwriting conda_dependencies.yml


In [10]:
sklearn_env = Environment.from_conda_specification(name = 'sklearn-env', file_path = './conda_dependencies.yml')

### Configure the training job

Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.
The firts run is a test to evaluate if everything is correct with the train.py script

In [11]:
src = ScriptRunConfig(source_directory=project_folder,
                      script='train.py',
                      arguments=['--n_estimators', 100, '--learning_rate', 0.1, '--max_depth', 3],
                      compute_target=compute_target,
                      environment=sklearn_env)

### Submit job

Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous.

In [12]:
run = experiment.submit(src)

Submitting /mnt/batch/tasks/shared/LS_root/mounts/clusters/notebookrun-hd/code/Users/odl_user_137161/fraud-detection-hyperdrive directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


### Monitor your run

You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes.

In [13]:
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [14]:
run.wait_for_completion(show_output=True)

RunId: creditcardfraud-hyperdrive_1612403441_58f978db
Web View: https://ml.azure.com/experiments/creditcardfraud-hyperdrive/runs/creditcardfraud-hyperdrive_1612403441_58f978db?wsid=/subscriptions/b968fb36-f06a-4c76-a15f-afab68ae7667/resourcegroups/aml-quickstarts-137161/workspaces/quick-starts-ws-137161

Streaming azureml-logs/55_azureml-execution-tvmps_983dcf4ded41e9d6cd613c5244b03b28316c184758171d72d4956c44f0dbcb95_d.txt

2021-02-04T01:50:59Z Starting output-watcher...
2021-02-04T01:50:59Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2021-02-04T01:51:00Z Executing 'Copy ACR Details file' on 10.0.0.5
2021-02-04T01:51:00Z Copy ACR Details file succeeded on 10.0.0.5. Output: 
>>>   
>>>   
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_cdd6808b1f0a997b9152a522618a461d
Digest: sha256:6aa4829eee3a4196d8a55b61faf522640a6a78f5524d16823faba7fd8aba66bf
Status: Image is up to date for a2b308372e5245509fc736e14a0952dd.azurecr.io/azureml/azureml_c

{'runId': 'creditcardfraud-hyperdrive_1612403441_58f978db',
 'target': 'cpu-cluster-hd',
 'status': 'Completed',
 'startTimeUtc': '2021-02-04T01:50:58.176956Z',
 'endTimeUtc': '2021-02-04T01:56:17.915841Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '2bb77bd7-ec8b-425f-86ce-573c5f5b5293',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--n_estimators',
   '100',
   '--learning_rate',
   '0.1',
   '--max_depth',
   '3'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cpu-cluster-hd',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'priority': None,
  'credentialPassthrough': False,
  'environment': {'

## Hyperdrive Configuration

The model used with HyperDrive is GradientBoostingClassifier. This model was one with best results with AutoML.

The hyperparametes used here where the number of estimators, the lerning rate and the max depth. This are some of the most important hyperparameters from this type of model. 

Other hyperparameters like min_samples_split, min_samples_leaf and max_features could be used inside a future work.

The termination policy was BanditPolicy. One of the benefit for the early stopping policy Bandit is that it is more suitable for resource savings.

The AUC_weighted was set as a primary metric to compare with HyperDrive Run. This metric is more suitable for imbalanced dataset (which is  common with fraud detection datasets).

In [15]:
# Specify a Policy
# The BanditPolicy basically states to check the job every 2 iterations. 
# If the primary metric (defined later) falls outside of the top 10% range, Azure ML 
# terminate the job. This saves us from continuing to explore hyperparameters that don't 
# show promise of helping reach our target metric.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Specify parameter sampler
param_sampling = RandomParameterSampling( {
    'n_estimators': choice(100, 200, 500),
    'learning_rate': uniform(0.1, 1.0),
    'max_depth': choice(1, 3, 5)
    }
)

# Create a SKLearn estimator for use with train.py
# estimator = ### YOUR CODE HERE ###
# I decided not to use the estimator because the documentation (https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.sklearn.sklearn) says that it is deprecated

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_run_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    policy=policy,
    primary_metric_name='AUC_weighted',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4)





In [16]:
# Submit hyperdrive run to the experiment and show run details with the widget.

# start the HyperDrive run
hyperdrive_run = experiment.submit(hyperdrive_run_config)

## Run Details

`RunDetails` widget to show the different experiments.

In [17]:
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [18]:
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_7763566c-db39-4a22-ac69-6ff0c759a07e
Web View: https://ml.azure.com/experiments/creditcardfraud-hyperdrive/runs/HD_7763566c-db39-4a22-ac69-6ff0c759a07e?wsid=/subscriptions/b968fb36-f06a-4c76-a15f-afab68ae7667/resourcegroups/aml-quickstarts-137161/workspaces/quick-starts-ws-137161

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-04T01:57:01.216715][API][INFO]Experiment created<END>\n""<START>[2021-02-04T01:57:01.916610][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-02-04T01:57:02.391080][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-02-04T01:57:02.5810036Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_7763566c-db39-4a22-ac69-6ff0c759a07e
Web View: https://ml.azure.com/experiments/creditcardfraud-hyperdrive/runs/HD_7763566c-db39-4a22-ac69-6ff0c759a07e?wsi

{'runId': 'HD_7763566c-db39-4a22-ac69-6ff0c759a07e',
 'target': 'cpu-cluster-hd',
 'status': 'Completed',
 'startTimeUtc': '2021-02-04T01:57:00.924715Z',
 'endTimeUtc': '2021-02-04T03:18:17.299342Z',
 'properties': {'primary_metric_config': '{"name": "AUC_weighted", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '2bb77bd7-ec8b-425f-86ce-573c5f5b5293',
  'score': '0.8787187285928012',
  'best_child_run_id': 'HD_7763566c-db39-4a22-ac69-6ff0c759a07e_2',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg137161.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_7763566c-db39-4a22-ac69-6ff0c759a07e/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=SuyqnKGsIDvgJZkAEHQd%2FxQbJxN2XVjMdlDIE9HHb%2FM%3D&st=2021-02-04T03%3A08%3A58Z&se=2021-02-04T11%3A18%3A58Z&sp=r'},
 'submittedBy': 'ODL_User 13

In [19]:
assert(hyperdrive_run.get_status() == "Completed")

## Best Model

Get the best model from the hyperdrive experiments and display all the properties of the model.


In [20]:
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from train import clean_data
from sklearn.model_selection import train_test_split

# Clean the data using the dataset variable already created
x, y = clean_data(dataset.to_pandas_dataframe())

# Split data into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y , test_size=0.33, random_state=42)

best_run = hyperdrive_run.get_best_run_by_primary_metric()
arguments = best_run.get_details()['runDefinition']['arguments']
print(arguments)
print(best_run.get_file_names())

model = GradientBoostingClassifier(
    n_estimators=int(arguments[1]), 
    learning_rate=float(arguments[3]), 
    max_depth=int(arguments[5])).fit(x_train, y_train)

model.fit(x_train, y_train)
filename = 'best_model_hyperdrive.sav'

joblib.dump(model, filename)

['--n_estimators', '100', '--learning_rate', '0.1', '--max_depth', '3', '--learning_rate', '0.7165541203459936', '--max_depth', '3', '--n_estimators', '100']
['azureml-logs/55_azureml-execution-tvmps_c82220580a6680a6c89534591ede7a5d077aa8f0e43633703eb895c7c29ef891_d.txt', 'azureml-logs/65_job_prep-tvmps_c82220580a6680a6c89534591ede7a5d077aa8f0e43633703eb895c7c29ef891_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_c82220580a6680a6c89534591ede7a5d077aa8f0e43633703eb895c7c29ef891_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/105_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']


['best_model_hyperdrive.sav']

In [21]:

from sklearn.metrics import confusion_matrix
import pandas as pd
ypred = model.predict(x_test)
cm = confusion_matrix(y_test, ypred)
# Visualize the confusion matrix
pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,93813,18
1,82,73


## Model Deployment

AutoML model was deployed because it returned better results than HyperDrive.

## Cleanup compute target

In [42]:
compute_target.delete()

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

