https://knowledge.udacity.com/questions/423888


In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-134415")
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code RZBTN7N5D to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-134415
Azure region: southcentralus
Subscription id: 5a4ab2ba-6c51-4805-8155-58759ad589d8
Resource group: aml-quickstarts-134415


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

compute_target = ws.compute_targets["cpucluster"]

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
%%writefile conda_dependencies.yml

dependencies:
- python=3.6.2
- scikit-learn
- pip:
  - azureml-defaults

Writing conda_dependencies.yml


In [4]:
from azureml.widgets import RunDetails
#from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
#from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import normal, uniform, choice
from azureml.core import ScriptRunConfig
from azureml.core import Environment
import os

sklearn_env = Environment.from_conda_specification(name="sklearn-env", file_path="conda_dependencies.yml")

# Specify parameter sampler
ps = RandomParameterSampling({
        "C": uniform(0.05, 0.1),
        "max_iter": choice(100, 150, 200, 250, 300)
    })

# Specify a Policy
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                    #  arguments=['--C', 1.0, '--max_iter', 100],
                      compute_target=compute_target,
                      environment=sklearn_env
                      )

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src, 
                                     hyperparameter_sampling=ps, 
                                     policy=policy, 
                                     primary_metric_name="Accuracy", 
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                                     max_total_runs=100, 
                                     max_concurrent_runs=4)

In [5]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
from azureml.widgets import RunDetails

hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

https://knowledge.udacity.com/questions/384083

In [10]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])
print(f"best run ID: {best_run.id}")
print(f"Accuracy: {best_run.get_metrics()['Accuracy']}")

# Save the model
best_run.download_file("/outputs/model_hd.joblib", "Logistic_reg_hd_bankmarketing.joblib")

['--C', '0.09213114549724909', '--max_iter', '100']
best run ID: HD_324c4c2c-4318-44f2-8205-93c726a23101_66
Accuracy: 0.9156297420333839


In [11]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
ds = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")

In [12]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

In [13]:
from sklearn.model_selection import train_test_split
import pandas as pd

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=0)

In [14]:
train_data = pd.concat([x_train, y_train], axis=1)
# train_data.to_csv(local_path)

In [15]:
# from azureml.core import Dataset

local_path = "prepared"
datastore = ws.get_default_datastore()
datastore.upload(src_dir=".", target_path="data")

# training_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, local_path)])
training_dataset = TabularDatasetFactory.register_pandas_dataframe(train_data, (datastore, local_path), "train_dataset")

Uploading an estimated of 8 files
Uploading ./conda_dependencies.yml
Uploaded ./conda_dependencies.yml, 1 files out of an estimated total of 8
Uploading ./Logistic_reg_hd_bankmarketing.joblib
Uploaded ./Logistic_reg_hd_bankmarketing.joblib, 2 files out of an estimated total of 8
Uploading ./train.py
Uploaded ./train.py, 3 files out of an estimated total of 8
Uploading ./udacity-project.ipynb
Uploaded ./udacity-project.ipynb, 4 files out of an estimated total of 8
Uploading ./udacity-project.ipynb.amltemp
Uploaded ./udacity-project.ipynb.amltemp, 5 files out of an estimated total of 8
Uploading ./.ipynb_aml_checkpoints/udacity-project-checkpoint2021-0-12-2-3-56.ipynb
Uploaded ./.ipynb_aml_checkpoints/udacity-project-checkpoint2021-0-12-2-3-56.ipynb, 6 files out of an estimated total of 8
Uploading ./.ipynb_aml_checkpoints/udacity-project-checkpoint2021-0-12-3-16-45.ipynb
Uploaded ./.ipynb_aml_checkpoints/udacity-project-checkpoint2021-0-12-3-16-45.ipynb, 7 files out of an estimated tota

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


In [16]:
training_dataset

{
  "source": [
    "('workspaceblobstore', 'prepared/4ac87972-f284-4eb1-97cf-59b1989a84e4/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "7b0a7187-2e0a-45d1-a82b-a36935cac26a",
    "name": "train_dataset",
    "version": 1,
    "workspace": "Workspace.create(name='quick-starts-ws-134415', subscription_id='5a4ab2ba-6c51-4805-8155-58759ad589d8', resource_group='aml-quickstarts-134415')"
  }
}

https://knowledge.udacity.com/questions/374305

In [43]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='AUC_weighted',
    compute_target=compute_target,
    training_data=training_dataset,
    # X=x_train,
    # y=y_train,
    label_column_name="y",
    n_cross_validations=2)

In [44]:
# Submit your automl run

### YOUR CODE HERE ###
# Choose a name for the experiment and specify the project folder.
experiment_name = 'udacity-project-automl'
# project_folder = 'automl-classification'

experiment = Experiment(ws, experiment_name)

In [45]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on cpucluster with default configuration
Running on remote compute: cpucluster
Parent Run ID: AutoML_624d47dd-b453-42c3-85cd-319c5728199f

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------

In [49]:
# from azureml.automl.runtime.onnx_convert import OnnxConverter

best_run, best_model = run.get_output()
best_run.register_model(model_name="automl_best_model.pkl", model_path="./outputs/")

Model(workspace=Workspace.create(name='quick-starts-ws-134271', subscription_id='f9d5a085-54dc-4215-9ba6-dad5d86e60a0', resource_group='aml-quickstarts-134271'), name=automl_best_model.pkl, id=automl_best_model.pkl:1, version=1, tags={}, properties={})

In [None]:
cpu_cluster.delete()