In [1]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace

# from dotenv import load_dotenv

# load_dotenv()
# ws = Workspace.create(name='bank_marketing',
#                subscription_id=os.getenv('subscription_id'),
#                resource_group='rg20210512',
#                create_resource_group=True,
#                location='westus2'
#                )

# Attach to the workspace described by the local Azure ML config file, then
# open the experiment that the AutoML run below is submitted under.
ws = Workspace.from_config()
experiment_name = 'automl'
exp = Experiment(workspace=ws, name=experiment_name)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name under which the CPU cluster is looked up (and created if missing).
cpu_cluster_name = "cpu-cluster"

# Reuse the cluster when one with this name is already attached to the
# workspace; a ComputeTargetException from the lookup means it has to be built.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print('Creating a new compute cluster...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=provisioning_config,
    )
else:
    print('Found existing cluster, use it.')

# Wait for the provisioning operation and then dump the cluster status.
compute_target.wait_for_completion(show_output=True)
print(compute_target.get_status().serialize())

Creating a new compute cluster...
Creating...
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-05-13T00:29:56.691000+00:00', 'errors': None, 'creationTime': '2021-05-13T00:29:54.004765+00:00', 'modifiedTime': '2021-05-13T00:30:09.489259+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [3]:
from train import data_split
from azureml.core.dataset import Dataset 

# Split the data locally, then register each split as a tabular dataset in the
# workspace's default datastore (train -> val -> test, in that order).
train_data, val_data, test_data = data_split()
datastore = ws.get_default_datastore()

train_ds, val_ds, test_ds = (
    Dataset.Tabular.register_pandas_dataframe(
        dataframe=split_frame,
        target=datastore,
        name=registered_name,
    )
    for registered_name, split_frame in (
        ('train_data', train_data),
        ('val_data', val_data),
        ('test_data', test_data),
    )
)

Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/98a746b8-e186-4b76-a6f0-ac4c75936aa3/
Successfully uploaded file to datastore.
Creating and registering a new dataset.


Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/a4f36a78-a6a6-4819-914c-69b4c72f2a40/
Successfully uploaded file to datastore.
Creating and registering a new dataset.


Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/a530f5b6-9bf5-40fa-afb8-c96ff15cc254/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [4]:
from azureml.train.automl import AutoMLConfig

# Tunable AutoML settings, kept separate from the data/compute wiring below.
automl_settings = dict(
    experiment_timeout_hours=0.5,
    enable_early_stopping=True,
    iteration_timeout_minutes=5,
    max_concurrent_iterations=4,
    max_cores_per_iteration=-1,
    primary_metric='AUC_weighted',
    featurization='off',
)

# Classification task trained on the registered training set and scored on
# the registered validation set, executed on the remote compute target.
automl_config = AutoMLConfig(
    task='classification',
    training_data=train_ds,
    validation_data=val_ds,
    label_column_name='y_yes',
    compute_target=compute_target,
    experiment_exit_score=0.95,
    enable_onnx_compatible_models=True,
    debug_log='automl_errors.log',
    **automl_settings,
)

# Submit to the experiment and stream progress into the cell output.
automl_run = exp.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
automl,AutoML_71126236-5596-45cb-bdd3-b2ef2313e93c,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   

In [5]:
# Block until the AutoML run submitted above has finished; the value returned
# by wait_for_completion() is shown as this cell's output.
automl_run.wait_for_completion()

{'runId': 'AutoML_71126236-5596-45cb-bdd3-b2ef2313e93c',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-05-13T00:30:51.534001Z',
 'endTimeUtc': '2021-05-13T01:01:54.800919Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"e88b4ff0-4f86-48f7-832d-d99a755d911e\\"}, \\"validation_data\\": {\\"datasetId\\": \\"0f90b96a-cfee-4c7d-b2a2-7e13d0ba3c3c\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.28.0", "azureml-train": "1.28.0", "azureml-train-restclients-hyperdrive": "1.28.0", "azureml-train-core": "1.28.0", "azureml-train-automl": "1.28.

In [6]:
# Fetch the best child run of the AutoML parent run, then print its metrics,
# details and file names, each followed by a spacer line.
automl_best_run = automl_run.get_best_child()

for report_label, fetch_report in (
    ("Best run metrics :", automl_best_run.get_metrics),
    ("Best run details :", automl_best_run.get_details),
    ("Best run file names :", automl_best_run.get_file_names),
):
    print(report_label, fetch_report())
    print(' ')

Best run metrics : {'precision_score_macro': 0.7107762346697561, 'average_precision_score_weighted': 0.9552520139535774, 'balanced_accuracy': 0.8946071668506166, 'precision_score_micro': 0.8553046856033018, 'average_precision_score_macro': 0.8211435107045273, 'matthews_correlation': 0.5767974091567097, 'AUC_micro': 0.947861982448475, 'weighted_accuracy': 0.8456916673739526, 'precision_score_weighted': 0.9298559696937277, 'accuracy': 0.8553046856033018, 'recall_score_macro': 0.8946071668506166, 'log_loss': 0.3078392393505284, 'recall_score_micro': 0.8553046856033018, 'f1_score_macro': 0.7513898295747308, 'f1_score_micro': 0.8553046856033018, 'f1_score_weighted': 0.8766105479358589, 'norm_macro_recall': 0.7892143337012332, 'AUC_macro': 0.9491698258073804, 'recall_score_weighted': 0.8553046856033018, 'AUC_weighted': 0.9491698258073804, 'average_precision_score_micro': 0.9505405171009824, 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_71126236-5596-45cb-bdd3-b2ef2313e93c_6

Best run file names : ['accuracy_table', 'automl_driver.py', 'azureml-logs/55_azureml-execution-tvmps_6f521dd5dceb1087a274a66605d023e00abc8da75200d0f321ea96528ce573fe_d.txt', 'azureml-logs/65_job_prep-tvmps_6f521dd5dceb1087a274a66605d023e00abc8da75200d0f321ea96528ce573fe_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_6f521dd5dceb1087a274a66605d023e00abc8da75200d0f321ea96528ce573fe_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'confusion_matrix', 'logs/azureml/104_azureml.log', 'logs/azureml/azureml_automl-child.log', 'logs/azureml/azureml_automl.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/conda_env_v_1_0_0.yml', 'outputs/env_dependencies.json', 'outputs/model.onnx', 'outputs/model.pkl', 'outputs/model_onnx.json', 'outputs/pipeline_graph.json', 'outputs/scoring_file_v_1_0_0.py']
 


In [7]:
# Retrieve and save the best automl model
import os

import joblib

# get_output() yields a pair; only the second element (the fitted model) is
# needed here, the first is discarded.
_, best_model = automl_run.get_output()

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout / fresh kernel run
# fails with FileNotFoundError.
model_path = 'outputs/automl_best_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Last expression of the cell: the list of written file names is displayed.
joblib.dump(best_model, filename=model_path)

['outputs/automl_best_model.pkl']

In [8]:
from pprint import pprint

def print_model(model, prefix=""):
    """Print every step of a pipeline-like model, descending into ensembles.

    Args:
        model: Object exposing ``steps``, an iterable of ``(name, component)``
            pairs.
        prefix: Text prepended to each step name printed at this level.

    For each step the (prefixed) name is printed first. A component that has
    both ``estimators`` and ``weights`` attributes is treated as an ensemble:
    its member names and weights are pretty-printed, then each member is
    printed recursively with ``"<member name> - "`` as the new prefix. Any
    other component has its ``get_params()`` result pretty-printed.
    """
    for step in model.steps:
        step_name, component = step[0], step[1]
        print(prefix + step_name)

        is_ensemble = hasattr(component, 'estimators') and hasattr(component, 'weights')
        if not is_ensemble:
            # Plain step: show its parameters and move on.
            pprint(component.get_params())
            print()
            continue

        members = component.estimators
        summary = {
            'estimators': [member[0] for member in members],
            'weights': component.weights,
        }
        pprint(summary)
        print()

        # The nested prefix is the member's own name, not the outer prefix.
        for member in members:
            print_model(member[1], member[0] + ' - ')

# Print the steps of the best model retrieved from the AutoML run.
print_model(best_model)

prefittedsoftvotingclassifier
{'estimators': ['49', '42', '63', '43', '54', '0', '30'],
 'weights': [0.14285714285714285,
             0.35714285714285715,
             0.07142857142857142,
             0.07142857142857142,
             0.07142857142857142,
             0.14285714285714285,
             0.14285714285714285]}

49 - StandardScalerWrapper
{'class_name': 'StandardScaler',
 'copy': True,
 'module_name': 'sklearn.preprocessing._data',
 'with_mean': False,
 'with_std': False}

49 - XGBoostClassifier
{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 0.8,
 'eta': 0.5,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'n_estimators': 100,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'reg:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1.4583333333333335,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsam

In [9]:
from sklearn.metrics import roc_auc_score
# Evaluate the saved best model on the local test split: 'y_yes' is the label
# column, everything else is passed to the model as features.
y_test = test_data['y_yes']
X_test = test_data.drop(columns=['y_yes'])

# Score with the probabilities found in column 1 of predict_proba's output.
y_prob = best_model.predict_proba(X_test)
positive_scores = y_prob[:, 1]
auc = roc_auc_score(y_test, positive_scores, average="weighted")
auc

0.9460346769305918

In [14]:
# compute_target.delete()