# Tuning Hyperparameters

### Run a Hyperdrive experiment in an Azure ML environment

In [1]:
# Import the Azure ML classes.
# `import azureml.core` is required in addition to the from-import: the print
# below refers to `azureml.core.VERSION` by its qualified name, and a
# from-import alone does not bind the name `azureml` in this namespace.
import azureml.core
from azureml.core import Workspace, Experiment

# Connect to the workspace via Workspace.from_config()
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.38.0 to work with Azureml-SDK-WS02


### Prepare data

In [2]:
from azureml.core import Dataset

# Datastore used for the upload below
default_ds = ws.get_default_datastore()

# Register the 'Defaults' dataset only when the workspace does not list it yet
if 'Defaults' in ws.datasets:
    print('Dataset already registered')
else:
    # Push the local CSV file into the datastore under defaults/
    default_ds.upload_files(
        files=['defaults.csv'],
        target_path='defaults/',
        overwrite=True,
        show_progress=True,
    )

    # Tabular dataset over every CSV file found in that folder
    tab_data_set = Dataset.Tabular.from_delimited_files(
        path=(default_ds, 'defaults/*.csv')
    )

    try:
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name='Defaults',
            description='defaults data',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as ex:
        # Registration problems are reported, not raised
        print(ex)

Dataset already registered


### Training Script for the Hyperdrive job

In [3]:
import os

# Folder that receives the files written by the %%writefile cells below
experiment_folder = 'defaults_training-hyperdrive'

# exist_ok=True keeps this cell re-runnable when the folder is already there
os.makedirs(name=experiment_folder, exist_ok=True)
print('Folder ready.')

Folder ready.


In [4]:
%%writefile $experiment_folder/hyperdrive_script.py
from azureml.core import Run
import pandas as pd
import numpy as np
import argparse, joblib, os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Training script for one Hyperdrive child run: reads the hyperparameters from
# the command line, trains a random forest on the 'raw_data' input dataset,
# logs the test accuracy and saves the fitted model under outputs/.

# Get the experiment run context
new_run = Run.get_context()

# Get the script arguments
parser = argparse.ArgumentParser()

# Input dataset. The parsed value is not read below: the data itself is taken
# from new_run.input_datasets['raw_data'].
parser.add_argument('--input-data', type = str)

# Hyperparameters
parser.add_argument('--n_estimators', type = int)
parser.add_argument('--min_samples_leaf', type = int)

# Add arguments to args collection
args = parser.parse_args()

ne = args.n_estimators
msl = args.min_samples_leaf

# Log hyperparameter values.
# The builtin int() replaces np.int, a deprecated alias of the builtin that
# newer NumPy releases no longer provide.
new_run.log('n_estimators', int(ne))
new_run.log('min_samples_leaf', int(msl))

# Load the input dataset into a pandas DataFrame
df = new_run.input_datasets['raw_data'].to_pandas_dataframe()

# Select columns from the dataset
dataPrep = df.drop(['ID'], axis = 1)

# Clean missing data - drop the rows with missing values
dataPrep = dataPrep.dropna()

# Create dummy variables
dataPrep = pd.get_dummies(dataPrep, drop_first = True)

# Create X and Y
y = dataPrep['Default Next Month'].values
X = dataPrep.drop(['Default Next Month'], axis = 1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234, stratify = y)

# Build the Random Forest model. The seed is fixed (same value as the split)
# so that accuracy differences between child runs come from the
# hyperparameters and not from random variation in the forest.
rfc = RandomForestClassifier(n_estimators = ne, min_samples_leaf = msl, random_state = 1234)

# Fit the data to the Random Forest object - Train Model
rfc.fit(X_train, y_train)

# Accuracy on the held-out test split - Evaluate
score = rfc.score(X_test, y_test)

# 'accuracy' is the primary metric name the Hyperdrive configuration optimises.
# The builtin float() replaces the deprecated np.float alias.
new_run.log('accuracy', float(score))

# Save the model in the run outputs
os.makedirs('outputs', exist_ok = True)
joblib.dump(value = rfc, filename = 'outputs/defaults_model.pkl')

new_run.complete()

Overwriting defaults_training-hyperdrive/hyperdrive_script.py


### Create the compute cluster

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = 'my-cluster-001'

# Reuse the compute target with this name when the lookup succeeds;
# a ComputeTargetException triggers provisioning of a new cluster instead.
try:
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2,
        )
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Provisioning problems are reported, not raised
        print(ex)

Found existing cluster, use it.


### Create custom environment

In [6]:
%%writefile $experiment_folder/hyperdrive_env.yml
# Conda environment specification used to run hyperdrive_script.py.
name: batch_environment
dependencies:
# Python interpreter version for the environment
- python=3.6.2
# Libraries imported by the training script
- scikit-learn
- pandas
- numpy
- pip
# Packages installed through pip rather than conda
- pip:
  - azureml-defaults

Overwriting defaults_training-hyperdrive/hyperdrive_env.yml


In [7]:
from azureml.core import Environment
from azureml.core.environment import CondaDependencies

# Build the environment from the conda specification file in the experiment folder
env_spec_path = '{}/hyperdrive_env.yml'.format(experiment_folder)
myenv = Environment.from_conda_specification(name='MyEnvironment', file_path=env_spec_path)

### Get the training dataset

In [8]:
# Look up the dataset registered under the name 'Defaults'.
# NOTE(review): if ws.datasets behaves like a dict, .get() returns None for a
# missing name instead of raising - confirm before relying on input_ds below.
input_ds = ws.datasets.get('Defaults')
# Bare expression so the notebook displays the dataset description
input_ds

{
  "source": [
    "('workspaceblobstore', 'defaults/defaults.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "7f950f96-1ef7-48e9-b9d6-2406b4dbad21",
    "name": "Defaults",
    "version": 1,
    "description": "defaults dataset",
    "tags": {
      "format": "CSV"
    },
    "workspace": "Workspace.create(name='Azureml-SDK-WS02', subscription_id='f0ec0447-a406-4c0a-922d-f468c99bce13', resource_group='AzuremlSDKRG01')"
  }
}

### Create a script configuration that uses the custom environment `myenv`

In [9]:
from azureml.core import ScriptRunConfig

# Arguments handed to the training script. The dataset is attached under the
# name 'raw_data', the key the script reads from its input_datasets.
script_args = ['--input-data', input_ds.as_named_input('raw_data')]

script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='hyperdrive_script.py',
    arguments=script_args,
    environment=myenv,
    compute_target=training_cluster,
)
script_config

<azureml.core.script_run_config.ScriptRunConfig at 0x7fad34433390>

### Create Hyperdrive parameters

In [10]:
from azureml.train.hyperdrive import GridParameterSampling, choice

# Candidate values for each script argument: 4 values for --n_estimators and
# 3 values for --min_samples_leaf.
search_space = {
    '--n_estimators': choice(10, 20, 50, 100),
    '--min_samples_leaf': choice(1, 2, 5),
}

hyper_params = GridParameterSampling(search_space)

### Configure the Hyperdrive class

In [11]:
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

# Hyperdrive settings: maximise the 'accuracy' metric logged by the training
# script, with no early-termination policy, at most 20 runs in total and
# 2 runs at a time.
hyper_config = HyperDriveConfig(
    run_config=script_config,
    hyperparameter_sampling=hyper_params,
    policy=None,
    primary_metric_name='accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=2,
)


In [12]:
from azureml.widgets import RunDetails

# Create the experiment and run
# Create the experiment and submit the Hyperdrive configuration to it
experiment_name = 'Hyperdrive_Exp001'
new_experiment = Experiment(workspace=ws, name=experiment_name)
new_run = new_experiment.submit(config=hyper_config)

# Show the run widget, then wait for the run to complete
run_widget = RunDetails(new_run)
run_widget.show()
new_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_62ee9918-035f-4222-b6ff-3c2740807d04
Web View: https://ml.azure.com/runs/HD_62ee9918-035f-4222-b6ff-3c2740807d04?wsid=/subscriptions/f0ec0447-a406-4c0a-922d-f468c99bce13/resourcegroups/AzuremlSDKRG01/workspaces/Azureml-SDK-WS02&tid=f94bf4d9-8097-4794-adf6-a5466ca28563

Streaming azureml-logs/hyperdrive.txt

"<START>[2022-03-30T18:52:47.937937][API][INFO]Experiment created<END>\n""<START>[2022-03-30T18:52:48.648088][GENERATOR][INFO]Trying to sample '2' jobs from the hyperparameter space<END>\n""<START>[2022-03-30T18:52:49.233901][GENERATOR][INFO]Successfully sampled '2' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_62ee9918-035f-4222-b6ff-3c2740807d04
Web View: https://ml.azure.com/runs/HD_62ee9918-035f-4222-b6ff-3c2740807d04?wsid=/subscriptions/f0ec0447-a406-4c0a-922d-f468c99bce13/resourcegroups/AzuremlSDKRG01/workspaces/Azureml-SDK-WS02&tid=f94bf4d9-8097-4794-adf6-a5466ca28563



{'runId': 'HD_62ee9918-035f-4222-b6ff-3c2740807d04',
 'target': 'my-cluster-001',
 'status': 'Completed',
 'startTimeUtc': '2022-03-30T18:52:47.695603Z',
 'endTimeUtc': '2022-03-30T19:06:05.795677Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name": "accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '946b1c3b-97cb-423d-bb86-2d81442eb800',
  'user_agent': 'python/3.6.9 (Linux-5.4.0-1068-azure-x86_64-with-debian-buster-sid) msrest/0.6.21 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.38.0',
  'space_size': '12',
  'score': '0.796137339055794',
  'best_child_run_id': 'HD_62ee9918-035f-4222-b6ff-3c2740807d04_8',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://azuremlsstorage858c34f0b.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_62ee9918-035f-4222-b6ff-3c274080

In [13]:
# Child run that scored best on the primary metric, and the metrics it logged
best_run = new_run.get_best_run_by_primary_metric()
print('Best RunID : ', best_run.id)

best_metrics = best_run.get_metrics()
print(best_metrics)

Best RunID :  HD_62ee9918-035f-4222-b6ff-3c2740807d04_8
{'n_estimators': 50, 'min_samples_leaf': 5, 'accuracy': 0.796137339055794}


In [19]:
from azureml.core import Model

# Register model
# Register the model file saved by the best run, recording its accuracy
best_run.register_model(
    model_path='outputs/defaults_model.pkl',
    model_name='defaults_model',
    tags={'Training context': 'Hyperdrive'},
    properties={'Accuracy': best_run.get_metrics()['accuracy']},
)

# Print each registered model, followed by its tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print('\t', tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print('\t', prop_name, ':', prop)
    print('\n')

defaults_model version: 3
	 Training context : Hyperdrive
	 Accuracy : 0.796137339055794


defaults_model version: 2
	 Training context : Hyperdrive 1
	 Accuracy : 0.796137339055794


defaults_model version: 1
	 Training context : Hyperdrive
	 Accuracy : 0.796137339055794


diabetes_model version: 3
	 Training context : Hyperdrive
	 AUC : 0.9885804604667666
	 Accuracy : 0.9457777777777778


diabetes_model version: 2
	 Training context : Hyperdrive
	 AUC : 0.9885804604667666
	 Accuracy : 0.9457777777777778


diabetes_model version: 1
	 Training context : Auto ML
	 AUC : 0.9904812577250306
	 Accuracy : 0.9520809898762654


