# Hyperparameter Tuning using HyperDrive

In [18]:
from azureml.core import Workspace, Experiment
from azureml.core.compute_target import ComputeTargetException
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.workspace import Workspace, Dataset
from azureml.train.automl import AutoMLConfig
import joblib
from azureml.widgets import RunDetails
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, randint
from azureml.core import Environment, ScriptRunConfig
import os

In [7]:
# Load the workspace described by the local config.json and open the
# experiment that the interactive run below is logged against.
ws = Workspace.from_config(path="config.json")
exp = Experiment(ws, "banks_analysis_tuning")

# One line per workspace attribute, printed in a fixed order.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive logging run on `exp`.
run = exp.start_logging()

Workspace name: quick-starts-ws-238537
Azure region: southcentralus
Subscription id: 5a4ab2ba-6c51-4805-8155-58759ad589d8
Resource group: aml-quickstarts-238537


## Dataset

In [8]:
# Name the dataset is registered under in the workspace, and the CSV file
# (in the notebook's working directory) it is built from.
key = "heart_failure_data"
example_data = "heart_failure_clinical_records_dataset.csv"

# Fetch the workspace's registered datasets once and reuse the result for both
# the membership test and the lookup.
registered_datasets = ws.datasets
found = key in registered_datasets

if found:
    dataset = registered_datasets[key]
else:
    # Not registered yet: upload the local CSV to the default datastore.
    datastore = ws.get_default_datastore()
    datastore.upload_files(files=["./" + example_data], overwrite=True)

    # Create a tabular dataset from the uploaded file and register it in the
    # workspace under `key`.
    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, example_data))
    dataset = dataset.register(workspace=ws, name=key)

df = dataset.to_pandas_dataframe()
df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [13]:
# Experiment that the HyperDrive run is submitted to.
experiment_name = 'hyperdrive_exp'
experiment = Experiment(workspace=ws, name=experiment_name)

In [14]:
# Compute cluster settings.
cluster_name = 'cluster'
cluster_min_nodes = 0
cluster_max_nodes = 4
cluster_sku = 'STANDARD_D2_V2'

# Look the cluster up by name; provision it only when the lookup raises
# ComputeTargetException.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=cluster_sku,
        min_nodes=cluster_min_nodes,
        max_nodes=cluster_max_nodes,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    # Block until provisioning has finished, streaming progress to the cell output.
    compute_target.wait_for_completion(show_output=True)
    print(f'Compute target {cluster_name} has been created.')
else:
    print(f'Found existing compute target {cluster_name}.')

Found existing compute target cluster.


## Hyperdrive Configuration

We use scikit-learn's `LogisticRegression` class to build and fit the model. We define a parameter sampler to tune two hyperparameters: the inverse regularization strength (`C`) and the maximum number of iterations (`max_iter`). Random parameter sampling was chosen because it is an efficient, exploratory, and parallelizable method for hyperparameter tuning that can potentially find better hyperparameters than other search methods. It is also robust to noise and other sources of variability in the training process. Additionally, a Bandit policy is applied: it is a popular early termination policy for hyperparameter tuning that saves computational resources by terminating poorly performing runs early.

With the script run configuration (the `estimator` variable below), the parameter sampler, and the early termination policy, we create a `HyperDriveConfig`, which is subsequently submitted as an experiment.

In [47]:
# Specify parameter sampler: every child run draws its own value for each
# entry; the keys are the argument names handed to train.py.
param_sampling = RandomParameterSampling({
    '--C': uniform(0.01, 1.0),
    # NOTE(review): per the Azure ML docs, randint(100) draws integers from
    # [0, 100), so max_iter may be 0 or very small -- confirm train.py copes.
    '--max_iter': randint(100)
})

# Specify a Policy: Bandit early termination with a 10% slack factor,
# evaluated every 2 intervals, with the first evaluation delayed by 5 intervals.
early_termination_policy = BanditPolicy(
    evaluation_interval=2,
    slack_factor=0.1,
    delay_evaluation=5,
)

# Setup environment for the training run
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# Create estimator and hyperdrive config.
# The run targets the `compute_target` object resolved in the cluster cell
# rather than a second hard-coded copy of the cluster name, so the two cells
# cannot drift apart.
estimator = ScriptRunConfig(
    source_directory="./",
    script="train.py",
    environment=sklearn_env,
    compute_target=compute_target,
)

hyperdrive_run_config = HyperDriveConfig(
    run_config=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
)

In [48]:
# Submit the HyperDrive configuration to the experiment; the returned run
# handle is used by the cells below.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

## Run Details

In [49]:
# Show the run-details widget for the HyperDrive run.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

In [51]:
# Block until the HyperDrive run has finished. Without this, running the
# notebook top to bottom reaches this cell while child runs are still in
# progress and no best run can be selected yet. raise_on_error=False keeps the
# lookup best-effort when the parent run did not end successfully.
hyperdrive_run.wait_for_completion(show_output=False, raise_on_error=False)

best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(
        "HyperDrive run has no child run with the primary metric logged; "
        "cannot select a best run."
    )
best_run_metrics = best_run.get_metrics()

# Report the winning run together with the metrics it logged.
print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['accuracy'])
print('\n Regularization Strength:',best_run_metrics['Regularization Strength:'])
print('\n Max Iterations:',best_run_metrics['Max iterations:'])

Best Run Id:  HD_54a2f904-ad43-465e-8e2d-91608bb4e991_5

 Accuracy: 0.8

 Regularization Strength: 0.3108354647015979

 Max Iterations: 96


In [55]:
# Register the best run's model file and attach the run's metrics to the
# registered model as properties.
model_properties = {
    'Accuracy': best_run_metrics['accuracy'],
    'Regularization Strength': best_run_metrics['Regularization Strength:'],
    'Max Iterations': best_run_metrics['Max iterations:'],
}
model = best_run.register_model(
    model_name='best_logistic_regression',
    model_path='trained_model.pkl',
    properties=model_properties,
)