In [2]:
from azureml.core import Workspace, Experiment

# Connect to the workspace and open the experiment that the runs below are submitted to.
ws = Workspace.get(name="optimizing-pipeline-dev")
exp = Experiment(workspace=ws, name="optimizing-pipeline")

# Summarise the workspace, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Start an interactive run in the experiment.
# NOTE(review): `run` is not used or completed anywhere in the visible cells — confirm it is still needed.
run = exp.start_logging()

Workspace name: optimizing-pipeline-dev
Azure region: eastus2
Subscription id: 15e6db88-22a3-4061-9e46-41d276285476
Resource group: basic


In [6]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "default"

# Reuse the named compute target when the workspace already has it;
# otherwise provision a new AmlCompute cluster under that name.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports its provisioning state, streaming status output.
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.

Running


In [15]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
import os

# Hyperparameter search space: each child run gets one randomly sampled
# combination of the discrete values below, passed to train.py as CLI arguments.
param_sampling = RandomParameterSampling({
    "--C": choice(.25, .5, .75, 1.0),
    "--max_iter": choice(25, 50, 75, 100),
})

# Early-termination policy for the sweep (slack is expressed as an absolute amount).
policy = BanditPolicy(evaluation_interval=20, slack_factor=None, slack_amount=0.2, delay_evaluation=0)

# Create the local ./training folder if it is missing; safe to re-run.
os.makedirs("./training", exist_ok=True)

# Estimator that runs train.py from the current directory on the CPU cluster.
estimator = SKLearn(source_directory='./',
                    compute_target=cpu_cluster,
                    vm_size='STANDARD_D2_V2',
                    entry_script="train.py")

# HyperDrive sweep configuration. The policy defined above is passed in explicitly;
# previously it was constructed but never handed to HyperDriveConfig, so no
# early termination was applied to the sweep.
hyperdrive_config = HyperDriveConfig(hyperparameter_sampling=param_sampling,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     policy=policy,
                                     estimator=estimator,
                                     max_total_runs=20,
                                     max_concurrent_runs=4)



In [16]:
# submitting hyperdrive run
hyperdrive_run = exp.submit(config=hyperdrive_config)
# Show the monitoring widget before blocking, so progress is visible while the
# sweep is still running rather than only after it has finished.
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



RunId: HD_40e3bb47-7321-4da7-a41f-4a14c469303a
Web View: https://ml.azure.com/experiments/optimizing-pipeline/runs/HD_40e3bb47-7321-4da7-a41f-4a14c469303a?wsid=/subscriptions/15e6db88-22a3-4061-9e46-41d276285476/resourcegroups/basic/workspaces/optimizing-pipeline-dev

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-09T20:53:12.811478][API][INFO]Experiment created<END>\n""<START>[2021-01-09T20:53:13.488064][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"<START>[2021-01-09T20:53:13.9352466Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-01-09T20:53:13.893892][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_40e3bb47-7321-4da7-a41f-4a14c469303a
Web View: https://ml.azure.com/experiments/optimizing-pipeline/runs/HD_40e3bb47-7321-4da7-a41f-4a14c469303a?wsid=/subscriptions/15e6db88-22a3

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [19]:
import joblib
# Select the HyperDrive child run with the best primary metric and print
# the command-line arguments it was launched with.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_definition = best_run.get_details()['runDefinition']
param_values = best_run_definition['arguments']
print(param_values)

['--C', '0.25', '--max_iter', '75']


In [26]:
# Download the last file listed for the best run into ./outputs/.
# NOTE(review): this relies on the order of get_file_names(); presumably the last
# entry is the saved model — confirm against what train.py writes.
last_run_file = best_run.get_file_names()[-1]
best_run.download_file(last_run_file, output_file_path='./outputs/')

In [4]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Load the bank-marketing training CSV from its public blob URL as a tabular dataset.
bankmarketing_train_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(bankmarketing_train_url)

In [5]:
from train import clean_data
from azureml.core.dataset import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

import os  # imported here so this cell does not rely on another cell having imported it

# Split the raw dataset into features (x) and label (y) using the helper from train.py.
x, y = clean_data(ds)

# Re-attach the label to the features so the training table is a single frame,
# then hold out a third of the rows with a fixed seed for reproducibility.
joined_ds = x.join(y)
train_set, test_set = train_test_split(joined_ds, test_size=0.33, random_state=42)

# Create the local staging folder if it is missing; safe to re-run.
os.makedirs('data', exist_ok=True)

train_set.to_csv("data/train_set.csv", index=False)

# Keep the datastore under its own name: rebinding `ds` (the source dataset)
# to the datastore made this cell fail at clean_data(ds) when re-run.
datastore = ws.get_default_datastore()
# NOTE(review): this uploads everything under ./data, including any stale files left there.
datastore.upload(src_dir='./data', target_path='train_data', overwrite=True, show_progress=True)

# Register the uploaded CSV as the tabular dataset used for training.
train_data = Dataset.Tabular.from_delimited_files(path=datastore.path('train_data/train_set.csv'))


Uploading an estimated of 2 files
Uploading ./data/train_df.csv
Uploaded ./data/train_df.csv, 1 files out of an estimated total of 2
Uploading ./data/train_set.csv
Uploaded ./data/train_set.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [8]:
from azureml.train.automl import AutoMLConfig

# AutoML experiment settings, gathered in one dict and unpacked into AutoMLConfig.
automl_settings = {
    "experiment_timeout_minutes": 30,
    "task": 'classification',
    "primary_metric": 'accuracy',
    "label_column_name": 'y',
    "n_cross_validations": 5,
}

# Train on the uploaded training table using the shared CPU cluster.
automl_config = AutoMLConfig(
    compute_target=cpu_cluster,
    training_data=train_data,
    **automl_settings
)

In [9]:
# submitting automl run
automl_run = exp.submit(config=automl_config)
# Show the monitoring widget before blocking, so progress is visible while the
# AutoML run is still in flight rather than only after it has finished.
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

Running on remote.



In [3]:
# retrieving the best AutoML run and registering its model
best_automl_run, fitted_automl_model = automl_run.get_output()
# Register from the AutoML run itself: the previous call used `best_run`, which is
# the HyperDrive winner, so the wrong run's files would have been registered under
# the AutoML model name.
# model_path is a path within the run's stored files; 'outputs/model.pkl' is where
# AutoML runs are documented to store the fitted model — TODO confirm for this SDK version.
best_automl_run.register_model(model_name='automl_best_model.pkl', model_path='outputs/model.pkl')

NameError: name 'automl_run' is not defined