# Tuning Hyperparameters

## Connect to Your Workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the saved config file
ws = Workspace.from_config()

# Report the SDK version and workspace name so the reader can confirm the connection
sdk_version = azureml.core.VERSION
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, ws.name))

Ready to use Azure ML 1.21.0 to work with mymachinelearningws


## Prepare Data for an Experiment

In [2]:
from azureml.core import Dataset

# The workspace's default datastore is where the CSV files are uploaded
default_ds = ws.get_default_datastore()

if 'diabetes dataset' in ws.datasets:
    # Nothing to do: a dataset with this name is already in the workspace
    print('Dataset already registered.')
else:
    # Upload the diabetes csv files in /data into a folder path in the datastore,
    # replacing existing files of the same name
    default_ds.upload_files(
        files=['./data/diabetes.csv', './data/diabetes2.csv'],
        target_path='diabetes-data/',
        overwrite=True,
        show_progress=True,
    )

    # Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset; a failure is printed rather than raised
    try:
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name='diabetes dataset',
            description='diabetes data',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as ex:
        print(ex)

Dataset already registered.


## Prepare a Training Script

In [3]:
import os

# Folder that holds the training script for the hyperdrive experiment
experiment_folder = 'diabetes_training-hyperdrive'

# Create the folder if needed; an existing folder is left as it is
os.makedirs(name=experiment_folder, exist_ok=True)

print('Folder ready.')

Folder ready.


In [4]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import argparse
import joblib
import os
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Parse the script arguments.
#   --regularization : regularization rate (the value tuned by hyperdrive), default 0.01
#   --input-data     : reference to the training dataset; it is accepted here so argparse
#                      does not reject it, but the data itself is read from the run's
#                      named input below
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
diabetes = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the run's named dataset input

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
# Use the builtin float: the np.float alias was deprecated in NumPy 1.20 and removed in 1.24,
# and the training environment installs an unpinned NumPy
run.log('Regularization Rate', float(reg))
# scikit-learn's C is the inverse of the regularization strength, hence 1/reg
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Writing diabetes_training-hyperdrive/diabetes_training.py


## Prepare a Compute Target

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "aml-clusters3004"

try:
    # Look up a compute target with this name in the workspace
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No such compute target: provision a new cluster and wait for it
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2,
        )
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Provisioning problems are printed rather than raised
        print(ex)
else:
    print('Found existing cluster, use it.')

Found existing cluster, use it.


## Run a Hyperdrive Experiment

In [6]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")

# Ensure the required packages are installed (we need scikit-learn, Azure ML defaults, and Azure ML dataprep)
packages = CondaDependencies.create(pip_packages=['scikit-learn','azureml-defaults','azureml-dataprep[pandas]'])
sklearn_env.python.conda_dependencies = packages

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Create a script config.
# Only the fixed (non-tuned) arguments are listed here. '--regularization' is the
# hyperparameter being swept below, so a hard-coded value for it would be redundant
# and misleading; each child run is given its own sampled value.
script_config = ScriptRunConfig(source_directory=experiment_folder,
                              script='diabetes_training.py',
                              arguments = ['--input-data', diabetes_ds.as_named_input('training_data')], # Reference to dataset
                              environment=sklearn_env,
                              compute_target = training_cluster)

# Sample a range of parameter values
params = GridParameterSampling(
    {
        # There's only one parameter, so grid sampling will try each value - with multiple parameters it would try every combination
        '--regularization': choice(0.001, 0.005, 0.01, 0.05, 0.1, 1.0)
    }
)

# Configure hyperdrive settings: maximize the 'AUC' metric logged by the training script,
# with no early-termination policy and one run per value in the grid above
hyperdrive = HyperDriveConfig(run_config=script_config, 
                          hyperparameter_sampling=params, 
                          policy=None, 
                          primary_metric_name='AUC', 
                          primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                          max_total_runs=6,
                          max_concurrent_runs=4)

# Run the experiment
# NOTE: the experiment name is misspelled ('diabates'); it is kept unchanged so that
# new runs stay grouped with the runs already recorded under this name
experiment = Experiment(workspace = ws, name = 'diabates_training_hyperdrive')
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
RunDetails(run).show()
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

{'runId': 'HD_8e80815e-11b8-4cd7-9278-36daa41b41c3',
 'target': 'aml-clusters3004',
 'status': 'Completed',
 'startTimeUtc': '2021-01-30T16:49:25.23406Z',
 'endTimeUtc': '2021-01-30T17:02:56.069827Z',
 'properties': {'primary_metric_config': '{"name": "AUC", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '69a43946-e466-4f43-bde5-dee4ee9caa02',
  'score': '0.8569267767414822',
  'best_child_run_id': 'HD_8e80815e-11b8-4cd7-9278-36daa41b41c3_5',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mymachinelearn7523317901.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_8e80815e-11b8-4cd7-9278-36daa41b41c3/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=M5wp6728i5TwaIdR2M8K3g8t5jB4NlmGZdL6htk9Smk%3D&st=2021-01-30T16%3A53%3A01Z&se=2021-01-31T01%3A03%3A01Z&sp=r'},
 'submittedBy': 'Tissana Tan

## Determine the Best Performing Run

In [7]:
# Print a summary of every child run, ordered by the primary metric (AUC)
for child_run in run.get_children_sorted_by_primary_metric():
    print(child_run)

# Pick the child run with the best primary metric and collect its details
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

# The training script logs the rate it actually used as the 'Regularization Rate' metric.
# Report that value instead of the whole script-argument list; fall back to the raw
# argument list if the metric is missing.
regularization_rate = best_run_metrics.get('Regularization Rate', parameter_values)

print('Best Run Id: ', best_run.id)
print(' -AUC:', best_run_metrics['AUC'])
print(' -Accuracy:', best_run_metrics['Accuracy'])
print(' -Regularization Rate:', regularization_rate)

{'run_id': 'HD_8e80815e-11b8-4cd7-9278-36daa41b41c3_5', 'hyperparameters': '{"--regularization": 1.0}', 'best_primary_metric': 0.8569267767414822, 'status': 'Completed'}
{'run_id': 'HD_8e80815e-11b8-4cd7-9278-36daa41b41c3_4', 'hyperparameters': '{"--regularization": 0.1}', 'best_primary_metric': 0.8568646196561496, 'status': 'Completed'}
{'run_id': 'HD_8e80815e-11b8-4cd7-9278-36daa41b41c3_3', 'hyperparameters': '{"--regularization": 0.05}', 'best_primary_metric': 0.8568593108659434, 'status': 'Completed'}
{'run_id': 'HD_8e80815e-11b8-4cd7-9278-36daa41b41c3_2', 'hyperparameters': '{"--regularization": 0.01}', 'best_primary_metric': 0.8568573200696159, 'status': 'Completed'}
{'run_id': 'HD_8e80815e-11b8-4cd7-9278-36daa41b41c3_1', 'hyperparameters': '{"--regularization": 0.005}', 'best_primary_metric': 0.8568279005238891, 'status': 'Completed'}
{'run_id': 'HD_8e80815e-11b8-4cd7-9278-36daa41b41c3_0', 'hyperparameters': '{"--regularization": 0.001}', 'best_primary_metric': 0.856825688527969

In [8]:
from azureml.core import Model

# Register the model file produced by the best run, recording its metrics as properties
model_properties = {metric: best_run_metrics[metric] for metric in ('AUC', 'Accuracy')}
best_run.register_model(
    model_path='outputs/diabetes_model.pkl',
    model_name='diabetes_model',
    tags={'Training context': 'Hyperdrive'},
    properties=model_properties,
)

# Print each registered model in the workspace with its tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')

diabetes_model version: 10
	 Training context : Hyperdrive
	 AUC : 0.8569267767414822
	 Accuracy : 0.7891111111111111


diabetes_model version: 9
	 Training context : Inline Training
	 AUC : 0.8760833872331957
	 Accuracy : 0.888


diabetes_model version: 8
	 Training context : Inline Training
	 AUC : 0.8795487337678491
	 Accuracy : 0.8903333333333333


diabetes_model version: 7
	 Training context : Inline Training
	 AUC : 0.8768744713667346
	 Accuracy : 0.8906666666666667


diabetes_model version: 6
	 Training context : Pipeline
	 AUC : 0.8849151611085108
	 Accuracy : 0.8995555555555556


diabetes_model version: 5
	 Training context : Compute cluster
	 AUC : 0.8568336517132793
	 Accuracy : 0.7891111111111111


diabetes_model version: 4
	 Training context : File dataset
	 AUC : 0.8568517900798176
	 Accuracy : 0.7891111111111111


diabetes_model version: 3
	 Training context : Tabular dataset
	 AUC : 0.8568595320655352
	 Accuracy : 0.7891111111111111


diabetes_model version: 2
	 Trainin