In [1]:
from azureml.core import Workspace, Experiment


# Load the workspace through Workspace.from_config() and open the experiment
# that every run in this notebook is submitted to.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# Report the workspace coordinates, one item per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

#run = exp.start_logging()

Workspace name: quick-starts-ws-146682
Azure region: southcentralus
Subscription id: 3e42d11f-d64d-4173-af9b-12ecaa1030b3
Resource group: aml-quickstarts-146682


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.exceptions import ComputeTargetException

# Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

from azureml.core.compute import AmlCompute, ComputeTarget
compute_name = "aml-cluster"

# Reuse the cluster when the workspace already has one under this name;
# otherwise provision a new one (Standard_D2_V2, at most 4 nodes).
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_V2",
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=compute_name,
        provisioning_configuration=provisioning_config,
    )
else:
    print('Found existing cluster, use it.')

# Runs in both cases: wait on the compute target and stream its status.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, quniform
import os

# Specify parameter sampler.
# Random sampling over the two command-line arguments handed to train.py:
#   --C         uniform(min_value, max_value) -> continuous value in [0, 10]
#   --max_iter  quniform(min_value, max_value, q) -> value in [20, 180] quantized to steps of 1
#               (arrives as a float such as 147.0, see the sweep output further down)
ps = RandomParameterSampling({
    "--C": uniform(0, 10),
    "--max_iter": quniform(20, 180, 1),
})

# Specify a Policy.
# Bandit early termination: checked at every interval once 5 intervals have
# passed, with a slack factor of 0.2 relative to the best run so far.
policy = BanditPolicy(slack_factor=0.2,
                      evaluation_interval=1,
                      delay_evaluation=5)

# Make sure the ./training folder exists; exist_ok avoids the separate
# listdir() membership test and is safe to re-run.
os.makedirs("./training", exist_ok=True)

# Create a SKLearn estimator for use with train.py.
# NOTE(review): the SDK reports 'SKLearn' as deprecated in favour of
# ScriptRunConfig; kept here so the training setup is unchanged.
est = SKLearn(source_directory="./",
              entry_script="train.py",
              compute_target=compute_target)

# The cluster is requested with max_nodes=4 in the compute cell, so asking for
# more than 4 concurrent child runs only queues them. Keep the total budget of
# 28 runs but cap concurrency at the cluster size.
# NOTE(review): if a pre-existing cluster with a different size is reused,
# adjust MAX_CONCURRENT_RUNS to match it.
MAX_TOTAL_RUNS = 28
MAX_CONCURRENT_RUNS = 4

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name="Accuracy",
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=MAX_TOTAL_RUNS,
                                     max_concurrent_runs=MAX_CONCURRENT_RUNS)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.
'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
run = exp.submit(hyperdrive_config)

RunDetails(run).show()

# Block until the sweep has finished. Without this, a top-to-bottom execution
# reaches the "best run" cell while child runs are still queued or running.
run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [5]:
import joblib

# Select the child run that scored best on the primary metric and print
# everything it logged.
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
print("Metrics :", best_run_metrics)


Metrics : {'Regularization Strength:': 5.0063448150371475, 'Max iterations:': 147, 'Accuracy': 0.9154274152756702}


In [8]:
import os

# joblib.dump does not create missing parent folders, and nothing earlier in
# the notebook creates outputs/, so make sure it exists first.
os.makedirs('outputs', exist_ok=True)

# NOTE(review): best_run is the Run handle returned by
# get_best_run_by_primary_metric(), not a fitted estimator, so this file holds
# the run object. To persist the trained model itself, download or register
# the artifact that train.py writes — its file name is not visible from this
# notebook, confirm against train.py.
joblib.dump(value=best_run, filename='outputs/bankmarketing_model.pkl')

In [9]:
# Show the single best child of the sweep: its run id, the sampled
# hyperparameters, the best primary-metric value and the run status.
run.get_children_sorted_by_primary_metric(top=1)

[{'run_id': 'HD_4b7798e4-66d0-46a8-9945-892cc245ee0a_4',
  'hyperparameters': '{"--C": 5.0063448150371475, "--max_iter": 147.0}',
  'best_primary_metric': 0.9154274152756702,
  'status': 'Completed'}]

In [10]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Location of the bank-marketing training CSV.
bankmarketing_csv_url = (
    'https://automlsamplenotebookdata.blob.core.windows.net'
    '/automl-sample-notebook-data/bankmarketing_train.csv'
)

# Build a tabular dataset from the delimited file at that URL.
ds = TabularDatasetFactory.from_delimited_files(path=bankmarketing_csv_url)


In [12]:
from train import clean_data
from sklearn.model_selection import train_test_split
import pandas as pd


# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# Split data into train and test sets (70% / 30%).
# A fixed random_state makes the split, and therefore the AutoML training
# data, identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)

# Put features and label back side by side, since AutoML is given one frame
# plus a label column name.
# NOTE(review): assumes the label returned by clean_data is named 'y', which
# is what the AutoML config cell references — confirm against train.py.
df_train = pd.concat([x_train, y_train], axis=1)
df_test = pd.concat([x_test, y_test], axis=1)

In [13]:
from azureml.train.automl import AutoMLConfig

# AutoML settings: a classification task scored on accuracy, trained on the
# df_train frame with 'y' as the label column, 3-fold cross-validation and a
# 15-minute experiment timeout.
automl_settings = {
    "experiment_timeout_minutes": 15,
    "task": "classification",
    "primary_metric": "accuracy",
    "training_data": df_train,
    "label_column_name": 'y',
    "n_cross_validations": 3,
}
automl_config = AutoMLConfig(**automl_settings)

In [14]:
# Launch the AutoML experiment and stream its progress into the cell output.
# No compute target is set on the config, so the run executes locally.
automl_run = exp.submit(config=automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_b8a93e45-b472-496f-9f3b-023136b25a61,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias toward

In [15]:
# Fetch the best child run of the AutoML experiment and display the metrics
# it recorded.
a_best_run = automl_run.get_best_child()
a_best_metrics = a_best_run.get_metrics()

a_best_metrics



{'matthews_correlation': 0.5438826324702434,
 'norm_macro_recall': 0.49580546117413604,
 'precision_score_micro': 0.9166702079049731,
 'AUC_macro': 0.9477459365785693,
 'average_precision_score_micro': 0.9816891104984055,
 'balanced_accuracy': 0.747902730587068,
 'average_precision_score_macro': 0.8201535422703478,
 'accuracy': 0.9166702079049731,
 'log_loss': 0.1775501381553113,
 'AUC_weighted': 0.9477459365785693,
 'recall_score_weighted': 0.9166702079049731,
 'AUC_micro': 0.9808864104095708,
 'f1_score_micro': 0.9166702079049731,
 'precision_score_weighted': 0.91069166120429,
 'precision_score_macro': 0.7984473571848514,
 'recall_score_micro': 0.9166702079049731,
 'recall_score_macro': 0.747902730587068,
 'average_precision_score_weighted': 0.9549116377852803,
 'f1_score_macro': 0.769729087806161,
 'f1_score_weighted': 0.9128013281151524,
 'weighted_accuracy': 0.9582311605654329,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_b8a93e45-b472-496f-9f3b-023136b25a61_30/a

In [18]:
# Display the detail record of the best AutoML child run (run id, status,
# start/end timestamps and the run properties).
a_best_run.get_details()

{'runId': 'AutoML_b8a93e45-b472-496f-9f3b-023136b25a61_30',
 'status': 'Completed',
 'startTimeUtc': '2021-06-07T04:49:53.890258Z',
 'endTimeUtc': '2021-06-07T04:50:23.313021Z',
 'properties': {'runTemplate': 'automl_child',
  'pipeline_id': '__AutoML_Ensemble__',
  'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'udacity-project\',\'compute_target\':\'local\',\'subscription_id\':\'3e42d11f-d64d-4173-af9b-12ecaa1030b3\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_b8a93e45-b472-496f-9f3b-023136b25a61_30","experiment_name":null,"workspace_name":"quick-starts-ws-146682","subscription_id":"3e42d11f-d64d-4173-af9b-12ecaa1030b3","resource_group_name":"aml-qui

In [17]:
import os

# joblib.dump does not create missing parent folders.
os.makedirs('outputs', exist_ok=True)

# get_output() returns the best child run together with its fitted model.
# Persist the fitted model; the previous code pickled the Run handle, which
# cannot be used for prediction.
a_best_child_run, a_fitted_model = automl_run.get_output()
joblib.dump(value=a_fitted_model, filename='outputs/a_bankmarketing_model.pkl')

In [19]:
# Tear down the compute target now that both experiments have been run.
compute_target.delete()