Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

In [None]:
import os
import urllib
import shutil
import azureml

from azureml.core import Experiment
from azureml.core import Workspace, Run
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive import GridParameterSampling
from azureml.train.hyperdrive import HyperDriveConfig
from azureml.train.hyperdrive import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice

from azureml.widgets import RunDetails

In [None]:
# Connect to the Azure ML workspace described by the locally stored workspace
# configuration; every later cell goes through this handle.
ws = Workspace.from_config()

In [None]:
# All runs submitted from this notebook are grouped under one named experiment.
exp = Experiment(workspace=ws, name="house_prices_prediction")

In [None]:
# Use the workspace's default datastore as the home for the training data.
ds = ws.get_default_datastore()

In [None]:
# Copy the local ./data directory to the "data" path of the datastore,
# overwriting files that already exist there.
ds.upload(
    src_dir="./data",
    target_path="data",
    overwrite=True,
    show_progress=True,
)

In [None]:
cluster_name = "compute01"

# Reuse the cluster if the workspace already has one with this name; when the
# lookup fails with ComputeTargetException a new cluster is provisioned instead.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print("Creating a new compute target...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_DS12_V2",
        max_nodes=6,
    )

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Wait for provisioning to finish, giving up after 20 minutes.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print("Found existing compute target")

In [None]:
# Local directory containing the training code handed to the estimator.
script_folder = "./scripts"

# Arguments passed to the entry script: the datastore is supplied as a mount.
script_params = {"--data-folder": ds.as_mount()}

# scikit-learn estimator that runs train_model.py on the compute target.
estimator = SKLearn(
    source_directory=script_folder,
    compute_target=compute_target,
    entry_script="train_model.py",
    script_params=script_params,
)

In [None]:
# Grid over the discrete choices below: 3 * 2 * 2 * 2 = 24 combinations.
param_sampling = GridParameterSampling(
    {
        "n-estimators": choice(500, 750, 1000),
        "max-depth": choice(4, 6),
        "min-samples-split": choice(2, 4),
        "learning-rate": choice(0.01, 0.001),
    }
)

In [None]:
# Sweep configuration: run the estimator over the sampled hyperparameters with
# no early-termination policy, ranking trials by the lowest "test_MAE" metric.
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=None,
    primary_metric_name="test_MAE",
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=100,
    max_concurrent_runs=24,
)

In [None]:
# Submit the sweep to the experiment. The returned run is the parent whose
# children (one per trial) are inspected in the results cell below.
hyperdrive_run = exp.submit(hyperdrive_run_config)

In [None]:
# Display the notebook widget for the submitted sweep.
RunDetails(hyperdrive_run).show()

In [None]:
import pandas as pd

# Hyperparameters reported next to each child run's metrics. They are read
# from the child run's argument list, where each one is expected to appear as
# a "--<name>" flag followed by its value.
hyperparameter_names = ["n-estimators", "max-depth", "min-samples-split", "learning-rate"]


def _argument_value(arguments, name, default="N/A"):
    """Return the value following the flag for *name* in *arguments*.

    Args:
        arguments: Flat list of command-line tokens of a run (may be None).
        name: Hyperparameter name, with or without a leading "--".
        default: Value returned when the flag is absent or has no value after it.

    Returns:
        The token that immediately follows the flag, unchanged (a string as
        recorded in the run definition), or *default*.
    """
    flag = name if name.startswith("--") else "--" + name
    tokens = list(arguments or [])
    # Stop one short of the end: a trailing flag has no value to return.
    for position, token in enumerate(tokens[:-1]):
        if token == flag:
            return tokens[position + 1]
    return default


children = list(hyperdrive_run.get_children())
metricslist = {}

for i, single_run in enumerate(children):
    # Keep scalar float metrics only; anything else cannot be tabulated as one cell.
    results = {k: v for k, v in single_run.get_metrics().items() if isinstance(v, float)}
    parameters = single_run.get_details()["runDefinition"]["arguments"]
    # Look hyperparameters up by flag name rather than by fixed position, so a
    # different argument order cannot silently mislabel the columns.
    for name in hyperparameter_names:
        results[name] = _argument_value(parameters, name)
    metricslist[i] = results

# `axis` must be passed by keyword: recent pandas releases no longer accept it
# positionally in sort_index.
rundata = pd.DataFrame(metricslist).sort_index(axis=1).T
# Sort by the primary metric only when at least one run logged it; otherwise
# (no children, or every run failed before logging) show the unsorted table.
if "test_MAE" in rundata.columns:
    rundata = rundata.sort_values(by=["test_MAE"], ascending=True)
display(rundata)