In [1]:
from azureml.core import Workspace, Experiment

# Load the workspace from the local config file and bind the experiment
# under which every run in this notebook is recorded.
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name="train_bankmarketing")

# One line per workspace attribute, printed in a single call.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# NOTE(review): `run` is rebound by `experiment.submit(src)` further down and
# this interactive run is never explicitly completed in the notebook --
# confirm whether it is still needed.
run = experiment.start_logging()

Workspace name: quick-starts-ws-131774
Azure region: southcentralus
Subscription id: 61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30
Resource group: aml-quickstarts-131774


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the AmlCompute cluster to reuse or create.
cluster_name = "cpu-cluster"

try:
    # Look the cluster up first so repeated runs of this cell reuse it.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Lookup failed: provision a new cluster that can scale up to 4 nodes.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing compute target')

# Wait for provisioning to finish (at most 20 minutes). With
# min_node_count=None the cluster's own scale settings decide how many
# nodes to wait for.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=20,
)

# Dump the detailed cluster status for reference.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing compute target
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-12-25T23:32:51.426000+00:00', 'errors': None, 'creationTime': '2020-12-25T22:16:26.094721+00:00', 'modifiedTime': '2020-12-25T22:16:41.505104+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


**Create a project directory**
Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. 
This includes the training script and any additional files your training script depends on.

In [3]:
import os

# Local staging directory for everything the remote job needs.
# exist_ok=True keeps the cell safe to re-run.
project_folder = './bankmarketing'
os.makedirs(name=project_folder, exist_ok=True)

**Copy the train.py script to the project folder**

In [4]:
import shutil

# Stage the training script inside the project folder; the returned
# destination path is left as the cell's displayed value.
shutil.copy(src='train.py', dst=project_folder)

'./bankmarketing/train.py'

**Create an environment**

Define a conda environment YAML file with your training script dependencies and create an Azure ML environment.

In [5]:
%%writefile conda_dependencies.yml

# Conda specification for the training environment; this file is loaded by
# Environment.from_conda_specification in the next cell.
# NOTE(review): scikit-learn and azureml-defaults are unpinned, so the
# resolved versions can differ between environment builds.
dependencies:
- python=3.6.2
- scikit-learn
- pip:
  - azureml-defaults

Overwriting conda_dependencies.yml


In [6]:
from azureml.core import Environment

# Build the Azure ML environment from the conda file written above.
sklearn_env = Environment.from_conda_specification(
    file_path='./conda_dependencies.yml',
    name='sklearn-env',
)


**Configure the training job**

Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.

In [7]:
from azureml.core import ScriptRunConfig

# Baseline hyperparameters passed to train.py for the single reference run.
baseline_arguments = ['--C', 1, '--max_iter', 100, '--solver', 'lbfgs']

# Ties together the staged script, its arguments, the compute cluster and
# the conda-based environment.
src = ScriptRunConfig(
    source_directory=project_folder,
    script='train.py',
    arguments=baseline_arguments,
    environment=sklearn_env,
    compute_target=compute_target,
)

**Submit job**

Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous.

In [8]:
# Submit the baseline training job. This rebinds `run`, which previously
# held the interactive run created by experiment.start_logging().
run = experiment.submit(src)

**Monitor your run**

You can monitor the progress of the run with a Jupyter widget. 
Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes.

In [9]:
from azureml.widgets import RunDetails

# Render the monitoring widget for the submitted run.
run_widget = RunDetails(run)
run_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [24]:
# Wait for the submitted run and show its output; the value returned by
# wait_for_completion is displayed as the cell result.
run.wait_for_completion(show_output=True)

RunId: train_bankmarketing_1608940119_ef6ed212
Web View: https://ml.azure.com/experiments/train_bankmarketing/runs/train_bankmarketing_1608940119_ef6ed212?wsid=/subscriptions/61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30/resourcegroups/aml-quickstarts-131774/workspaces/quick-starts-ws-131774

Execution Summary
RunId: train_bankmarketing_1608940119_ef6ed212
Web View: https://ml.azure.com/experiments/train_bankmarketing/runs/train_bankmarketing_1608940119_ef6ed212?wsid=/subscriptions/61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30/resourcegroups/aml-quickstarts-131774/workspaces/quick-starts-ws-131774



{'runId': 'train_bankmarketing_1608940119_ef6ed212',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-12-25T23:53:24.633867Z',
 'endTimeUtc': '2020-12-25T23:55:46.239761Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '82d57298-8b52-46c5-a9ed-ceaf44f2cbd4',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--C', '1', '--max_iter', '100', '--solver', 'lbfgs'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cpu-cluster',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'priority': None,
  'credentialPassthrough': False,
  'environment': {'name': 'sklearn-env',
   'version': 'Autosave

In [None]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive.parameter_expressions import choice
import os

# Search space: every trial draws one value per flag from these discrete sets.
search_space = {
    "--C": choice(0.01, 0.1, 1, 10, 100),
    "--max_iter": choice(100, 200, 500),
    "--solver": choice('newton-cg', 'lbfgs', 'liblinear'),
}
ps = RandomParameterSampling(search_space)

# Early-termination policy. Per the Azure ML BanditPolicy documentation the
# policy is applied every `evaluation_interval` (here 2) reports of the
# primary metric; a run is cancelled when its metric falls outside the
# allowed slack (slack_factor=0.1, i.e. below best / 1.1 when maximizing)
# relative to the best run so far, so unpromising configurations stop early.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# The SKLearn estimator is intentionally not used: its documentation
# (https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.sklearn.sklearn)
# marks it as deprecated, so the ScriptRunConfig `src` is reused instead.
# NOTE(review): `src` already carries fixed --C/--max_iter/--solver values;
# presumably HyperDrive appends the sampled values after them -- confirm.

# HyperDrive job: random sampling + bandit policy, maximizing 'Accuracy',
# at most 20 trials in total with up to 4 running concurrently.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    primary_metric_name='Accuracy',
    max_concurrent_runs=4,
    max_total_runs=20,
)

In [None]:
# Submit the HyperDrive configuration to the experiment; the run details
# widget is shown in the next cell.

# Start the HyperDrive run and keep the handle for monitoring and for
# retrieving the best child run later.
hyperdrive_run = experiment.submit(hyperdrive_config)

**Monitor HyperDrive runs**

Monitor the progress of the runs with the following Jupyter widget.

In [None]:
# Render the monitoring widget for the HyperDrive parent run.
RunDetails(hyperdrive_run).show()

In [None]:
# Wait for the HyperDrive run and show its output while waiting.
hyperdrive_run.wait_for_completion(show_output=True)

In [None]:
# Fail fast if the HyperDrive run did not finish successfully.
assert hyperdrive_run.get_status() == "Completed"

In [23]:
import joblib
from sklearn.linear_model import LogisticRegression
from azureml.data.dataset_factory import TabularDatasetFactory
from train import clean_data
from sklearn.model_selection import train_test_split

# Create TabularDataset using TabularDatasetFactory
dataset_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path = dataset_path)

x, y = clean_data(ds)

# Split data into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y , test_size=0.33, random_state=42)

best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    raise RuntimeError("HyperDrive returned no best run; no child run logged the primary metric.")
arguments = best_run.get_details()['runDefinition']['arguments']
print(best_run.get_file_names())

# Read the hyperparameters by flag name instead of by fixed position.
# `arguments` is a flat [flag, value, flag, value, ...] list; because the
# base ScriptRunConfig already carries --C/--max_iter/--solver, a flag may
# appear more than once, and the last occurrence is kept (the same rule
# argparse applies inside train.py).
best_params = {}
for flag, value in zip(arguments[::2], arguments[1::2]):
    best_params[flag] = value

# C is a float (the search space includes 0.01 and 0.1), so int() would
# raise on those values.
model = LogisticRegression(
    C=float(best_params['--C']),
    max_iter=int(best_params['--max_iter']),
    solver=best_params['--solver'])
model.fit(x_train, y_train)
filename = 'best_model.sav'

# Persist the refitted model; the list of written files is the cell output.
joblib.dump(model, filename)

['azureml-logs/55_azureml-execution-tvmps_8ba53da3e560c6226036acc4837678958a7d2b69a2c3e99a54311c8a7bb503f3_d.txt', 'azureml-logs/65_job_prep-tvmps_8ba53da3e560c6226036acc4837678958a7d2b69a2c3e99a54311c8a7bb503f3_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_8ba53da3e560c6226036acc4837678958a7d2b69a2c3e99a54311c8a7bb503f3_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/99_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/engine_spans_l_5d6376fc-a00d-49a7-b7c8-0a029e3cad90.jsonl', 'logs/azureml/dataprep/python_span_l_5d6376fc-a00d-49a7-b7c8-0a029e3cad90.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


['best_model.sav']

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

dataset_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path = dataset_path)

# Validation split, loaded with the same factory as the training data.
# (`Dataset` was never imported in this notebook, so `Dataset.Tabular...`
# raised NameError on a fresh kernel.)
validation_data = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"
validation_dataset = TabularDatasetFactory.from_delimited_files(path = validation_data)

In [None]:
from train import clean_data

# Use the clean_data function (imported from the local train.py) to clean
# the dataset loaded above.
# NOTE(review): x and y are presumably the feature table and the label
# column -- confirm against train.py.
x, y = clean_data(ds)

In [None]:
from azureml.train.automl import AutoMLConfig
from sklearn.model_selection import train_test_split

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
#
# primary_metric uses AutoML's lowercase metric name ('accuracy').
# n_cross_validations was left blank in the original cell (a SyntaxError);
# 5 folds is used here as a conventional choice.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=ds,
    label_column_name='y',
    n_cross_validations=5)

In [None]:
# Remote-compute variant of the AutoML configuration; it replaces the
# `automl_config` built in the previous cell.
#
# The original cell referenced `train_data`, `label` and `automl_settings`,
# none of which are defined in this notebook, so it raised NameError. They
# are bound here to the training dataset `ds`, the label column 'y' and a
# settings dict matching the previous cell.
# NOTE(review): confirm these are the intended values. No
# n_cross_validations is set because an explicit validation set is given.
automl_settings = {
    "experiment_timeout_minutes": 30,
    "primary_metric": 'accuracy',
}

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             experiment_exit_score = 0.9984,
                             blocked_models = ['KNN','LinearSVM'],
                             enable_onnx_compatible_models=True,
                             training_data = ds,
                             label_column_name = 'y',
                             validation_data = validation_dataset,
                             **automl_settings
                            )

In [None]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###