In [1]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace

ws = Workspace.from_config()
experiment_name = 'automl'
exp = Experiment(ws, experiment_name)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for the cluster
cpu_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    print('Creating a new compute cluster...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)
print(compute_target.get_status().serialize())

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 4, 'targetNodeCount': 4, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 3, 'idleNodeCount': 1, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-05-12T02:17:23.430000+00:00', 'errors': None, 'creationTime': '2021-05-11T22:37:10.166981+00:00', 'modifiedTime': '2021-05-11T22:37:25.558188+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [3]:
from train import data_split
from azureml.core.dataset import Dataset 

train_data, val_data, test_data = data_split()
datastore = ws.get_default_datastore()
train_ds = Dataset.Tabular.register_pandas_dataframe(dataframe=train_data, 
                                                     target=datastore, 
                                                     name='train_data')
val_ds = Dataset.Tabular.register_pandas_dataframe(dataframe=train_data, 
                                                   target=datastore, 
                                                   name='val_data')
test_ds = Dataset.Tabular.register_pandas_dataframe(dataframe=train_data, 
                                                    target=datastore, 
                                                    name='test_data')

Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/7d4302b8-2b40-4839-b70d-d2583d0c2f76/
Successfully uploaded file to datastore.
Creating and registering a new dataset.


Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/e15d0601-c60a-4ab0-8063-dc728559276d/
Successfully uploaded file to datastore.
Creating and registering a new dataset.


Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/69b5ddfe-8ca7-41f3-a247-3863da4d2de3/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [4]:
from azureml.train.automl import AutoMLConfig

automl_settings = {
    "experiment_timeout_hours" : 0.5,
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    "primary_metric": 'AUC_weighted',
    "featurization": 'off'
}

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             experiment_exit_score = 0.95,
                             enable_onnx_compatible_models=True,
                             training_data = train_ds,
                             label_column_name = 'y_yes',
                             validation_data = val_ds,
                             **automl_settings
                            )
automl_run = exp.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
automl,AutoML_5e45bd33-284b-47eb-9b2a-fffe05d9952d,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   

In [6]:
automl_run.wait_for_completion()

{'runId': 'AutoML_5e45bd33-284b-47eb-9b2a-fffe05d9952d',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-05-12T02:20:40.666456Z',
 'endTimeUtc': '2021-05-12T02:24:08.34835Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"dbacdeb7-d061-462f-8506-141f44290227\\"}, \\"validation_data\\": {\\"datasetId\\": \\"f67a7f27-8390-4f7a-8006-e4108ee921bd\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.28.0", "azureml-train": "1.28.0", "azureml-train-restclients-hyperdrive": "1.28.0", "azureml-train-core": "1.28.0", "azureml-train-automl": "1.28.0

In [7]:
# Returns the best automl run
automl_best_run = automl_run.get_best_child()

print("Best run metrics :", automl_best_run.get_metrics())
print(' ')
print("Best run details :", automl_best_run.get_details())
print(' ')
print("Best run file names :", automl_best_run.get_file_names())
print(' ')

Best run metrics : {'recall_score_micro': 0.8972282777950794, 'average_precision_score_micro': 0.9636939895524863, 'norm_macro_recall': 0.7944565555901588, 'precision_score_weighted': 0.9002282464483135, 'weighted_accuracy': 0.8972282777950794, 'precision_score_micro': 0.8972282777950794, 'accuracy': 0.8972282777950794, 'log_loss': 0.2529406356023127, 'AUC_macro': 0.9578908282132397, 'AUC_weighted': 0.9578908282132398, 'average_precision_score_macro': 0.9579485254956464, 'recall_score_weighted': 0.8972282777950794, 'f1_score_micro': 0.8972282777950794, 'AUC_micro': 0.9626407579209625, 'balanced_accuracy': 0.8972282777950794, 'average_precision_score_weighted': 0.9579485254956465, 'precision_score_macro': 0.9002282464483136, 'recall_score_macro': 0.8972282777950794, 'f1_score_weighted': 0.8970353311567164, 'matthews_correlation': 0.7974508814004989, 'f1_score_macro': 0.8970353311567164, 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_5e45bd33-284b-47eb-9b2a-fffe05d9952d_

Best run file names : ['accuracy_table', 'automl_driver.py', 'azureml-logs/55_azureml-execution-tvmps_1c1a37658f29192eb6ed5658ccee553dd9a0043472a7d75081cd936528ce068f_d.txt', 'azureml-logs/65_job_prep-tvmps_1c1a37658f29192eb6ed5658ccee553dd9a0043472a7d75081cd936528ce068f_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_1c1a37658f29192eb6ed5658ccee553dd9a0043472a7d75081cd936528ce068f_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'confusion_matrix', 'explanation/3ac1cdf5/classes.interpret.json', 'explanation/3ac1cdf5/eval_data_viz.interpret.json', 'explanation/3ac1cdf5/expected_values.interpret.json', 'explanation/3ac1cdf5/features.interpret.json', 'explanation/3ac1cdf5/global_names/0.interpret.json', 'explanation/3ac1cdf5/global_rank/0.interpret.json', 'explanation/3ac1cdf5/global_values/0.interpret.json', 'explanation/3ac1cdf5/local_importance_values.interpret.json', 'explanation/3ac1cdf5/per_class_names/0.interpret.json', 'expla

In [12]:
# Retrieve and save the best automl model
import joblib
_, best_model = automl_run.get_output()
joblib.dump(best_model, filename='outputs/automl_best_model.pkl')

['outputs/automl_best_model.pkl']

In [19]:
from pprint import pprint

def print_model(model, prefix=""):
    for step in model.steps:
        print(prefix + step[0])
        if hasattr(step[1], 'estimators') and hasattr(step[1], 'weights'):
            pprint({'estimators': list(
                e[0] for e in step[1].estimators), 'weights': step[1].weights})
            print()
            for estimator in step[1].estimators:
                print_model(estimator[1], estimator[0] + ' - ')
        else:
            pprint(step[1].get_params())
            print()

print_model(best_model)

MaxAbsScaler
{'copy': True}

XGBoostClassifier
{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': nan,
 'n_estimators': 100,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'tree_method': 'auto',
 'verbose': -10,
 'verbosity': 0}



In [20]:
from sklearn.metrics import roc_auc_score
X_test = test_data.drop(columns=['y_yes'])
y_test = test_data['y_yes']
y_prob = best_model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_prob[:, 1], average="weighted")
auc

0.9443593734452596

In [21]:
compute_target.delete()

Current provisioning state of AmlCompute is "Deleting"



In [22]:
compute_target.wait_for_completion(show_output=True)

Deleting........
SucceededProvisioning operation finished, operation "Succeeded"


ComputeTargetException: ComputeTargetException:
	Message: ComputeTargetNotFound: Compute Target with name cpu-cluster not found in provided workspace
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "ComputeTargetNotFound: Compute Target with name cpu-cluster not found in provided workspace"
    }
}