In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="HyperDrive_Experiment")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-189979
Azure region: southcentralus
Subscription id: 81cefad3-d2c9-4f77-a466-99a7f541c7bb
Resource group: aml-quickstarts-189979


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "MyComputeName1"

try:
    compute_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    cluster_config = AmlCompute.provisioning_configuration(vm_size='Standard_D2_V2', max_nodes=4)
    compute_cluster = ComputeTarget.create(ws, cluster_name, cluster_config)

compute_cluster.wait_for_completion(show_output=True)

# get a detailed status for the current cluster.
print(compute_cluster.get_status().serialize())

Found existing compute target
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2022-03-25T15:19:37.275000+00:00', 'errors': None, 'creationTime': '2022-03-25T15:19:33.631401+00:00', 'modifiedTime': '2022-03-25T15:19:37.291797+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS3_V2'}


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os, logging
logging.basicConfig(level=logging.INFO)

# Create a directory that will contain all the necessary code.
#project_folder = 'Final_Project'
#os.makedirs(project_folder, exist_ok=True)

# parameter sampler
param_sampling = RandomParameterSampling( {
        '--C': choice(0.1, 1, 10, 100, 1000), 
        #'--max_iter': uniform(1, 500)
        '--max_iter': choice(1, 50, 100, 200, 500)
    })

# End poorly performing runs with an early termination policy (improves computational efficiency).
early_termination_policy = BanditPolicy(slack_factor=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of training job
src = ScriptRunConfig(source_directory=os.getcwd(),
                      script='train.py',
                      #arguments=['--C', '--max_iter'],
                      compute_target=compute_cluster,
                      environment=sklearn_env)

# Create a HyperDriveConfig.
# The name of the primary metric needs to exactly match the name of the metric logged by the training script.
hyperdrive_config = HyperDriveConfig(run_config=src,
                             hyperparameter_sampling=param_sampling,
                             policy=early_termination_policy,
                             primary_metric_name="Accuracy",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=4,
                             max_concurrent_runs=4)

In [4]:
# Submit the experiment (start the HyperDrive run).
hyperdrive_run = exp.submit(hyperdrive_config, show_output=True)

In [5]:
# Visualize all hyperparameter tuning runs.
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_4d017914-26ae-44ff-aa18-a793fb976a1d
Web View: https://ml.azure.com/runs/HD_4d017914-26ae-44ff-aa18-a793fb976a1d?wsid=/subscriptions/81cefad3-d2c9-4f77-a466-99a7f541c7bb/resourcegroups/aml-quickstarts-189979/workspaces/quick-starts-ws-189979&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2022-03-25T15:29:30.946779][API][INFO]Experiment created<END>\n""<START>[2022-03-25T15:29:31.759146][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2022-03-25T15:29:32.360346][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_4d017914-26ae-44ff-aa18-a793fb976a1d
Web View: https://ml.azure.com/runs/HD_4d017914-26ae-44ff-aa18-a793fb976a1d?wsid=/subscriptions/81cefad3-d2c9-4f77-a466-99a7f541c7bb/resourcegroups/aml-quickstarts-189979/workspaces/quick-starts-ws-189979&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254



{'runId': 'HD_4d017914-26ae-44ff-aa18-a793fb976a1d',
 'target': 'MyComputeName1',
 'status': 'Completed',
 'startTimeUtc': '2022-03-25T15:29:30.720227Z',
 'endTimeUtc': '2022-03-25T15:38:09.64852Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'd4e9b831-2bd5-43f2-ac3e-cdb339d4150f',
  'user_agent': 'python/3.6.9 (Linux-5.4.0-1068-azure-x86_64-with-debian-buster-sid) msrest/0.6.21 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.38.0',
  'space_size': '25',
  'score': '0.9083459787556905',
  'best_child_run_id': 'HD_4d017914-26ae-44ff-aa18-a793fb976a1d_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg189979.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_4d017914-26ae-44ff-aa18-a793fb976a1d/azureml

In [6]:
assert(hyperdrive_run.get_status() == "Completed")

In [7]:
import joblib

# Best performing configuration and hyperparameter values.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['Accuracy'])
print('\n learning rate:',parameter_values[3])
#print('\n keep probability:',parameter_values[5])
#print('\n batch size:',parameter_values[7])

# List the model files uploaded during the run.
# print('\n Model files:', best_run.get_file_names())

Best Run Id:  HD_4d017914-26ae-44ff-aa18-a793fb976a1d_0

 Accuracy: 0.9083459787556905

 learning rate: 500


In [8]:
# Save the model from that run.
model_hyperdrive = best_run.register_model(model_name='model_hyperdrive', model_path='outputs/models/model.joblib')

In [9]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset
data_location = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=data_location, validate=True, include_path=False, infer_column_types=True, set_column_types=None, separator=',', header=True, partition_format=None, support_multi_line=False, empty_as_string=False)

In [10]:
from train import clean_data
from sklearn.model_selection import train_test_split
import pandas as pd

# clean data.
x, y = clean_data(ds)

# y is a pandas Series. Convert to dataframe.
y_df = pd.DataFrame(y,columns=['y'])

# Combine both dataframes : x and y_df.
combined_data = pd.concat([x,y_df],axis=1)

# Split data into train and test sets.
train_data, test_data = train_test_split(combined_data, test_size = 0.2, random_state = 42)

# Convert training data to csv and save in training folder.
train_data.to_csv('training/automl_training_data.csv')

# get the datastore to upload prepared data
datastore_automl = ws.get_default_datastore()

# load the dataset from training to data folder
datastore_automl.upload(src_dir='training/',target_path='data/')

# convert data to tabular format
automl_train_dataset = TabularDatasetFactory.from_delimited_files(path=[(datastore_automl,('data/automl_training_data.csv'))])

INFO:matplotlib.font_manager:generated new fontManager
INFO:azureml.data.datastore_client:<azureml.core.authentication.InteractiveLoginAuthentication object at 0x7fb0eee3c5c0>
INFO:azureml.data.azure_storage_datastore:Called AzureBlobDatastore.upload
INFO:azureml.data.azure_storage_datastore:Uploading an estimated of 1 files


Uploading an estimated of 1 files
Uploading training/automl_training_data.csv
Uploaded training/automl_training_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [11]:
from azureml.train.automl import AutoMLConfig

automl_settings = {
       "n_cross_validations": 3,
       "primary_metric": 'accuracy',
       "experiment_timeout_minutes":30,
       "verbosity": logging.INFO,
   }

# Set parameters for AutoMLConfig
automl_config = AutoMLConfig(task = 'classification',
                             compute_target = compute_cluster,
                             training_data = automl_train_dataset,
                             label_column_name='y',
                             **automl_settings
                             )

In [12]:
# define experiment
exp_automl = Experiment(workspace=ws, name="Automl_Experiment")

# Submit automl run (show Validation errors and current status).
automl_run = exp_automl.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on MyComputeName1 with default configuration
Running on remote compute: MyComputeName1


Experiment,Id,Type,Status,Details Page,Docs Page
Automl_Experiment,AutoML_62dce8dd-4bbc-4f84-bab4-7a742d254f0f,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+-------------------------------------

In [13]:
from azureml.widgets import RunDetails

RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
Automl_Experiment,AutoML_62dce8dd-4bbc-4f84-bab4-7a742d254f0f,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation




********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest class    |Name/Label of the smallest class|Number of samples in the training data|
|2951                          |1                               |26360                                 |
+------------------------------+--------------------------------+--------------------------------------+

********************************************************************

{'runId': 'AutoML_62dce8dd-4bbc-4f84-bab4-7a742d254f0f',
 'target': 'MyComputeName1',
 'status': 'Completed',
 'startTimeUtc': '2022-03-25T15:43:19.813515Z',
 'endTimeUtc': '2022-03-25T16:14:04.217531Z',
 'services': {},
   'message': 'No scores improved over last 20 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'MyComputeName1',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"0fa6fbab-e156-40c0-969f-db959dccbe73\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-wid

In [14]:
# Retrieve the best Run object
#automl_best_run = automl_run.get_best_child()

# Retrieve the best run and the fitted model.
automl_best_run, fitted_model = automl_run.get_output()

INFO:pytorch_transformers.modeling_bert:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
INFO:pytorch_transformers.modeling_xlnet:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [17]:
# Save best automl model.
automl_best_run.register_model(model_name = "automl_model.pkl", model_path = './outputs/')
print(fitted_model._final_estimator)

PreFittedSoftVotingClassifier(
    estimators=[('1', Pipeline(
        memory=None,
        steps=[('maxabsscaler', MaxAbsScaler(
            copy=True
        )), ('xgboostclassifier', XGBoostClassifier(
            random_state=0,
            n_jobs=1,
            problem_info=ProblemInfo(
                gpu_training_param_dict={'processing_unit_type': 'cpu'}
            ),
            tree_method='auto'
        ))],
        verbose=False
    )), ('0', Pipeline(
        memory=None,
        steps=[('maxabsscaler', MaxAbsScaler(
            copy=True
        )), ('lightgbmclassifier', LightGBMClassifier(
            min_data_in_leaf=20,
            random_state=None,
            n_jobs=1,
            problem_info=ProblemInfo(
                gpu_training_param_dict={'processing_unit_type': 'cpu'}
            )
        ))],
        verbose=False
    )), ('14', Pipeline(
        memory=None,
        steps=[('standardscalerwrapper', StandardScalerWrapper(
            copy=True,
        

In [19]:
# delete compute cluster
compute_cluster.delete()