In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-127467")
exp = Experiment(workspace=ws, name="bank-marketing-hyperdrive-2")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code EF55HDCXS to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-127467
Azure region: southcentralus
Subscription id: 94e14ad4-bf97-47e8-aae0-f9b85a7befa8
Resource group: aml-quickstarts-127467


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
cpu_cluster_name = "simba-cluster"

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Use existing cluster.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Use existing cluster.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [13]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
import os
import shutil

### YOUR CODE HERE ###
# Specify parameter sampler
ps = RandomParameterSampling({
    "--C": uniform(0.001, 1.0),
    "--max_iter": choice(50, 100, 200)
})

### YOUR CODE HERE ###
# Specify a Policy for early stopping
es_policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

### YOUR CODE HERE ###
# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory='.',entry_script='train.py',compute_target=cpu_cluster)

### YOUR CODE HERE ###
# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                    hyperparameter_sampling=ps,
                                    policy=es_policy,
                                    primary_metric_name="Accuracy",
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=4,
                                    max_concurrent_runs=4)

In [14]:
# Submit your hyperdrive run to the experiment and show run details with the RunDetails widget.

### YOUR CODE HERE ###
hdr = exp.submit(config=hyperdrive_config, show_output=True)
RunDetails(hdr).show()
hdr.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_c169f4e9-e4aa-46bb-8e58-f076f33de94a
Web View: https://ml.azure.com/experiments/bank-marketing-hyperdrive-2/runs/HD_c169f4e9-e4aa-46bb-8e58-f076f33de94a?wsid=/subscriptions/8e713106-916f-4177-890e-435b90d7adc4/resourcegroups/aml-quickstarts-127439/workspaces/quick-starts-ws-127439

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-11-23T00:44:36.954217][API][INFO]Experiment created<END>\n""<START>[2020-11-23T00:44:37.514762][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2020-11-23T00:44:37.673318][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2020-11-23T00:44:38.0698885Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_c169f4e9-e4aa-46bb-8e58-f076f33de94a
Web View: https://ml.azure.com/experiments/bank-marketing-hyperdrive-2/runs/HD_c169f4e9-e4aa-46bb-8e58-f076f33de94a?w

{'runId': 'HD_c169f4e9-e4aa-46bb-8e58-f076f33de94a',
 'target': 'simba-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-23T00:44:36.682588Z',
 'endTimeUtc': '2020-11-23T00:54:28.751307Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '992fbf64-73f8-400c-be2d-a0f85d704d25',
  'score': '0.909711684370258',
  'best_child_run_id': 'HD_c169f4e9-e4aa-46bb-8e58-f076f33de94a_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg127439.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_c169f4e9-e4aa-46bb-8e58-f076f33de94a/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=3ST0%2BZUfgxKQDXZDUxT%2Be0vJzmEKMTUOHeBmeJYl1d0%3D&st=2020-11-23T00%3A44%3A31Z&se=2020-11-23T08%3A54%3A31Z&sp=r'}}

In [15]:
import joblib
# Get your best run and save the model from that run. Use the .get_best_run_by_primary_metric() method of the run to select
# the best hyperparameters for your model

### YOUR CODE HERE ###
best_run = hdr.get_best_run_by_primary_metric()
print('Best Run Id: ', best_run.id)
best_run_metrics = best_run.get_metrics()
print(best_run_metrics)

model = best_run.register_model(model_name='hd-model',model_path='outputs/hd-model.joblib', tags={'Method':'Hyperdrive'}, properties={'Accuracy': best_run_metrics['Accuracy']})

cpu_cluster.delete()

Best Run Id:  HD_c169f4e9-e4aa-46bb-8e58-f076f33de94a_0
{'Regularization Strength:': 0.8858464414859172, 'Max iterations:': 200, 'Accuracy': 0.909711684370258}
Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"



In [8]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path, validate=True, include_path=False, infer_column_types=True, set_column_types=None, separator=',', header=True, partition_format=None, support_multi_line=False, empty_as_string=False)

In [13]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

print(x.shape)
print(y.shape)
# Split data into train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=36)

import pandas as pd
training_data = pd.concat((x_train,y_train),axis=1)
#training_data.head()
print(training_data.shape)
local_path='data/prepared.csv'
os.makedirs('data')
training_data.to_csv(local_path)
datastore=ws.get_default_datastore()
datastore.upload(src_dir='data',target_path='data')

from azureml.core import Dataset
training_dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, ('data/prepared.csv'))])
#from azureml.core import Dataset
#train_data = Dataset.from_pandas_dataframe(training_data)

(32950, 39)
(32950,)
(26360, 40)
Uploading an estimated of 1 files
Target already exists. Skipping upload for data/prepared.csv
Uploaded 0 files


In [16]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

### YOUR CODE HERE ###
cpu_cluster_name = "jacob-cluster"

try:
    cpu_cluster_automl = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Use existing cluster.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_D2_V2',
                                                           max_nodes=4)
    cpu_cluster_automl = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster_automl.wait_for_completion(show_output=True)

from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    compute_target=cpu_cluster_automl,
    primary_metric="accuracy",
    training_data=training_dataset,
    label_column_name="y",
    n_cross_validations=2,
    max_concurrent_iterations=4)

Use existing cluster.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [17]:
# Submit your automl run

### YOUR CODE HERE ###
from azureml.core.experiment import Experiment

experiment = Experiment(ws, "bank-marketing-automl")
run = experiment.submit(config=automl_config, show_output=True)
RunDetails(run).show()
run.wait_for_completion(show_output=True)

Running on remote.
Running on remote compute: jacob-cluster
Parent Run ID: AutoML_263d3151-5dc6-46d2-a5d3-f798006c5af8



In [20]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_run, fitted_model = run.get_output()
print('Best Run Id: ', best_run.id)
print(best_run)
print(fitted_model)
best_run_metrics = best_run.get_metrics()
print(best_run_metrics)

model = best_run.register_model(model_name='bank-marketing-automl-model',model_path='outputs/model.pkl', tags={'Method':'AutoML'}, properties={'accuracy': best_run_metrics['accuracy']})
print(model)


Best Run Id:  AutoML_263d3151-5dc6-46d2-a5d3-f798006c5af8_76
Run(Experiment: bank-marketing-automl,
Id: AutoML_263d3151-5dc6-46d2-a5d3-f798006c5af8_76,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                  loss='modified_huber',
                                                                           