In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-237770
Azure region: eastus2
Subscription id: 48a74bb7-9950-4cc1-9caa-5d50f995cc55
Resource group: aml-quickstarts-237770


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.exceptions import ComputeTargetException

cluster_name = "my-cluster"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing cluster, use it.")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2", max_nodes=4
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
compute_target.wait_for_completion(show_output=True)


Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
ps = RandomParameterSampling(
    {
        '--C': uniform(0.5,1.5),
        '--max_iter': choice(75, 100, 125)
    }
)

# Specify a Policy
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      arguments=None,
                      compute_target=compute_target,
                      environment=sklearn_env)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=4)

In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hyperdrive_run = exp.submit(config=hyperdrive_config)
RunDetails(hyperdrive_run).show()


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [5]:
import joblib
# Get your best run and save the model from that run.

hyperdrive_run.wait_for_completion(show_output=True)
assert(hyperdrive_run.get_status() == "Completed")

best_run = hyperdrive_run.get_best_run_by_primary_metric()
#best_run.get_details()
best_metrics = best_run.get_metrics()
print(best_metrics)

#save the best tunned parameters
joblib.dump(best_metrics,'./training/hyperdrive_metircs.json')

RunId: HD_8884f128-fc32-49b0-b408-4c3f481d333d
Web View: https://ml.azure.com/runs/HD_8884f128-fc32-49b0-b408-4c3f481d333d?wsid=/subscriptions/48a74bb7-9950-4cc1-9caa-5d50f995cc55/resourcegroups/aml-quickstarts-237770/workspaces/quick-starts-ws-237770&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

[2023-06-27T04:41:58.839648][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space
[2023-06-27T04:41:59.2204707Z][SCHEDULER][INFO]Scheduling job, id='HD_8884f128-fc32-49b0-b408-4c3f481d333d_0' 
[2023-06-27T04:41:59.3481068Z][SCHEDULER][INFO]Scheduling job, id='HD_8884f128-fc32-49b0-b408-4c3f481d333d_1' 
[2023-06-27T04:41:59.4536309Z][SCHEDULER][INFO]Scheduling job, id='HD_8884f128-fc32-49b0-b408-4c3f481d333d_2' 
[2023-06-27T04:41:59.537970][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.
[2023-06-27T04:41:59.5697490Z][SCHEDULER][INFO]Scheduling job, id='HD_8884f128-fc32-49b0-b408-4c3f481d33

['./training/hyperdrive_metircs.json']

In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

from azureml.data.dataset_factory import TabularDatasetFactory

web_path = [
    'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
]

ds = TabularDatasetFactory.from_delimited_files(path=web_path)


In [7]:
from train import clean_data
from azureml.core.dataset import Dataset
import pandas as pd
import os

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

#prepare automl training data from cleaned x and y
train_data = x.join(y)
print(train_data.head(2))

if not os.path.isdir("data"):
    os.mkdir("data")
    
pd.DataFrame(train_data).to_csv("data/train_data.csv", index=False)
datastore = ws.get_default_datastore()
datastore.upload(
    src_dir="./data", target_path="bankmarketing", overwrite=True, show_progress=True
)

# Upload the training data as a tabular dataset for access during training on remote compute
train_ds = Dataset.Tabular.from_delimited_files(
    path=datastore.path("bankmarketing/train_data.csv")
)

   age  marital  default  housing  loan  month  day_of_week  duration  \
0   57        1        0        0     1      5            1       371   
1   55        1        0        1     0      5            4       285   

   campaign  pdays  ...  contact_telephone  education_basic.4y  \
0         1    999  ...                  0                   0   
1         2    999  ...                  1                   0   

   education_basic.6y  education_basic.9y  education_high.school  \
0                   0                   0                      1   
1                   0                   0                      0   

   education_illiterate  education_professional.course  \
0                     0                              0   
1                     0                              0   

   education_university.degree  education_unknown  y  
0                            0                  0  0  
1                            0                  1  0  

[2 rows x 40 columns]
Uploading an 

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


In [8]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="AUC_weighted",
    compute_target=compute_target,
    training_data=train_ds,
    label_column_name="y",
    blocked_models=["KNN", "LinearSVM"],
    max_concurrent_iterations=4,
    enable_onnx_compatible_models=True,
    n_cross_validations=5)

In [9]:
# Submit your automl run

auto_run = exp.submit(config=automl_config)

RunDetails(auto_run).show()

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_fa57a6f5-6633-47b1-a829-8d852faaaf70,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [10]:
# Retrieve and save your best automl model.


auto_run.wait_for_completion(show_output=True)
assert(auto_run.get_status() == "Completed")

best_auto_run, fitted_model = auto_run.get_output()
#print(best_auto_run.get_details())
#save the model
joblib.dump(fitted_model,'./training/automl_model.pkl')


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_fa57a6f5-6633-47b1-a829-8d852faaaf70,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation




********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest class    |Name/Label of the smallest class|Number of samples in the training data|
|3692                          |1                               |32950                                 |
+------------------------------+--------------------------------+--------------------------------------+

********************************************************************

Package:azureml-automl-runtime, training version:1.51.0.post2, current version:1.49.0
Package:azureml-core, training version:1.51.0, current version:1.49.0
Package:azureml-dataprep, training version:4.10.8, current version:4.9.1
Package:azureml-dataprep-rslex, training version:2.17.12, current version:2.16.1
Package:azureml-dataset-runtime, training version:1.51.0, current version:1.49.0
Package:azureml-defaults, training version:1.51.0, current version:1.49.0
Package:azureml-interpret, training version:1.51.0, current version:1.49.0
Package:azureml-mlflow, training version:1.51.0, current version:1.49.0
Package:azureml-pipeline-core, training version:1.51.0, current version:1.49.0
Package:azureml-responsibleai, training version:1.51.0, current version:1.49.0
Package:azureml-telemetry, training version:1.51.0, current version:1.49.0
Package:azureml-train-automl-client, training version:1.51.0.post1, current version:1.49.0
Package:azureml-train-automl-runtime, training version:1.51.0.po

['./training/automl_model.pkl']

Current provisioning state of AmlCompute is "Deleting"



In [11]:
print(compute_target.get())
compute_target.delete()



In [12]:
print(compute_target.get())

None
