In [1]:
from azureml.core import Workspace, Experiment

# Look up the existing workspace by name and open the experiment that the
# interactive logging run below belongs to.
ws = Workspace.get(name="quick-starts-ws-122523")
exp = Experiment(workspace=ws, name="udacity-project")

# Report the workspace coordinates, one item per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive logging run on the experiment.
# NOTE(review): nothing visible in this notebook completes this run, and
# `run` is reassigned by the HyperDrive submission later - confirm it is needed.
run = exp.start_logging()

Workspace name: quick-starts-ws-122523
Azure region: southcentralus
Subscription id: 8e713106-916f-4177-890e-435b90d7adc4
Resource group: aml-quickstarts-122523


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "udacity-first-project"

# Reuse the named compute target when the lookup succeeds; if the lookup
# raises ComputeTargetException, provision a new AmlCompute cluster instead.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_V2",
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the cluster operation to finish, streaming its status output.
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.

Running


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn

from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive import choice
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Specify parameter sampler
# Random sampling over discrete choices for the hyperparameters named
# 'C' and 'max_iter'.
param_space = {
    'C': choice(100, 10, 1.0, 0.1, 0.01),
    'max_iter': choice(100, 1000, 10000),
}
ps = RandomParameterSampling(param_space)

# Specify a Policy
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Create a "training" directory in the working directory if none is listed.
if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(
    entry_script='./train.py',
    source_directory=".",
    compute_target=cpu_cluster,
)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
# The sweep maximizes the metric logged as 'Accuracy' and is capped at
# 4 runs in total, with up to 4 running concurrently.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4,
    max_concurrent_runs=4,
)

In [4]:
from azureml.core.experiment import Experiment

# Submit the HyperDrive sweep to its own experiment.
experiment = Experiment(ws, "hyperparamenter_Tuning")
run = experiment.submit(config=hyperdrive_config, show_output=True)

# Display the monitoring widget for the sweep.
RunDetails(run).show()

# The widget does not wait for the sweep, so block here until it has
# finished. Without this, a "Run All" reaches the best-run lookup before any
# child run has reported the primary metric.
run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [6]:
import joblib
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# Get your best run and save the model from that run.
best_run = run.get_best_run_by_primary_metric()

# The lookup yields None when no child run has logged the primary metric
# (for example, the sweep has not finished or every child failed). Fail with
# a clear message instead of an AttributeError on the registration below.
if best_run is None:
    raise RuntimeError(
        "HyperDrive run has no child run with the primary metric; "
        "wait for the sweep to complete and check the child runs for errors."
    )

# Register the model file written by the best child run.
model = best_run.register_model(
    model_name='sklearn-lr',
    model_path='./outputs/model.joblib',
    model_framework=Model.Framework.SCIKITLEARN,
    model_framework_version='0.22.2',
    resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=7.0),
)

print(best_run.get_metrics())

print(model)

{'Regularization Strength:': 10.0, 'Max iterations:': 10000, 'Accuracy': 0.910152657715652}
Model(workspace=Workspace.create(name='quick-starts-ws-122523', subscription_id='8e713106-916f-4177-890e-435b90d7adc4', resource_group='aml-quickstarts-122523'), name=sklearn-lr, id=sklearn-lr:1, version=1, tags={}, properties={})


In [7]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

bank_marketing_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# Load the comma-separated file (with a header row) as a tabular dataset,
# letting column types be inferred.
ds = TabularDatasetFactory.from_delimited_files(
    bank_marketing_url,
    validate=True,
    include_path=False,
    infer_column_types=True,
    set_column_types=None,
    separator=',',
    header=True,
    partition_format=None,
    support_multi_line=False,
    empty_as_string=False,
)

In [10]:
from train import clean_data
import os

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# Build the combined frame as a NEW object. Assigning a column through an
# alias of `x` would also add the label column to the feature frame itself,
# leaking the target into `x` for anything that uses it afterwards.
data_train = x.assign(y=y)
data_train.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,0,0,1,5,1,371,1,999,...,0,0,0,0,1,0,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,1,0,0,0,0,0,0,0,1,0
2,33,1,0,0,0,5,5,52,1,999,...,0,0,0,1,0,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,1,0,0,0,1,0,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,0,0,0,0,1,0,0,0,0,0


In [11]:
from azureml.core import Dataset
from azureml.data.dataset_factory import DataType

# Create the local staging directory if the working directory has none.
if "data" not in os.listdir():
    os.mkdir("./data")

local_path = './data/data_clean.csv'
# index=False: without it the frame's row index is written as an extra,
# unnamed first column, which the tabular dataset below would read back as
# a feature column.
data_train.to_csv(local_path, index=False)

datastore = ws.get_default_datastore()

# overwrite=True so that re-running this cell replaces a previously uploaded
# copy of the file instead of leaving the stale one in place.
datastore.upload(src_dir='data', target_path='data', overwrite=True)

datastore_paths = [(datastore, 'data/data_clean.csv')]

dataset = Dataset.Tabular.from_delimited_files(path=datastore_paths)

# 80/20 split with a fixed seed.
training_data, validation_data = dataset.random_split(percentage=0.8, seed=1)

Uploading an estimated of 1 files
Uploading data/data_clean.csv
Uploaded data/data_clean.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [16]:
from azureml.train.automl import AutoMLConfig

# Configure the AutoML classification experiment on the split produced above.
# AutoMLConfig's keyword for the optimization metric is `primary_metric`
# (`primary_metric_name` is the HyperDriveConfig spelling), so the metric
# is passed under that name here.
# NOTE(review): 'MaxAbsScaler SVM' looks like a pipeline display name rather
# than a model name; confirm against the supported model names for
# `blocked_models`.
automl_config = AutoMLConfig(
    task='classification',
    experiment_timeout_minutes=30,
    primary_metric='accuracy',
    blocked_models=['XGBoostClassifier', 'MaxAbsScaler SVM'],
    training_data=training_data,
    validation_data=validation_data,
    label_column_name='y',
    compute_target=cpu_cluster,
)

In [15]:
# Submit your automl run
# Uses the `experiment` object created for the HyperDrive submission.
automl_run = experiment.submit(config=automl_config, show_output=True)



Running on remote.
Running on remote compute: udacity-first-project
Parent Run ID: AutoML_efddcbd3-b43c-4430-95e8-61ec55cf5eec

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Lab



         0   MaxAbsScaler LightGBM                          0:00:31       0.9137    0.9137
         1   MinMaxScaler RandomForest                      0:00:29       0.8985    0.9137
         2   StandardScalerWrapper SGD                      0:00:29       0.9039    0.9137
         3   MinMaxScaler RandomForest                      0:00:28       0.8854    0.9137
         4   StandardScalerWrapper SGD                      0:00:26       0.8524    0.9137
         5   StandardScalerWrapper RandomForest             0:00:29       0.9005    0.9137
         6   RobustScaler ExtremeRandomTrees                0:00:31       0.8943    0.9137
         7   StandardScalerWrapper ExtremeRandomTrees       0:00:26       0.8138    0.9137
         8   StandardScalerWrapper SGD                      0:00:31       0.9041    0.9137
         9   StandardScalerWrapper SGD                      0:00:28       0.8956    0.9137
        10   MinMaxScaler SGD                               0:00:33       0.8395    0.9137

In [17]:
from azureml.widgets import RunDetails
# Display the monitoring widget for the AutoML run.
automl_widget = RunDetails(automl_run)
automl_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [18]:
# Retrieve and save your best automl model.
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# Get your best run and save the model from that run.
# get_output is unpacked into the best child run and its fitted model,
# selected by the "accuracy" metric.
automl_output = automl_run.get_output(metric="accuracy")
best_run, fitted_model = automl_output
print(best_run)

# Register the model through the parent AutoML run, then print the id the
# run reports for it.
description = 'Best AutoML Model'
tags = None
model = automl_run.register_model(tags=tags, description=description)
print(automl_run.model_id)



Run(Experiment: hyperparamenter_Tuning,
Id: AutoML_efddcbd3-b43c-4430-95e8-61ec55cf5eec_28,
Type: azureml.scriptrun,
Status: Completed)
AutoMLefddcbd3b28
