## Azure workspace

In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name = "quick-starts-ws-133534")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')


Workspace name: quick-starts-ws-133534
Azure region: southcentralus
Subscription id: 3e42d11f-d64d-4173-af9b-12ecaa1030b3
Resource group: aml-quickstarts-133534


### Creating (or selecting) compute cluster

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

try:
    aml_compute = ComputeTarget(workspace = ws, name = "basic-compute-project")
    print("Found existing compute...")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size = 'Standard_D2_V2', max_nodes = 4)
    aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
    
aml_compute.wait_for_completion(show_output = True)

Found existing compute...

Running


## Creating an experiment and start logging

In [3]:
exp = Experiment(workspace=ws, name = "udacity-project-1")
run = exp.start_logging()

## Using hyperdrive with the regression model (script)

In [27]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, quniform
import os

# Specify parameter sampler
ps = RandomParameterSampling ({'--C': uniform(0.1, 1), 
                              '--max_iter': quniform(100, 1500,100),})
### YOUR CODE HERE ###

# Specify a Policy
policy = BanditPolicy(evaluation_interval = 3, slack_factor = 0.15) ### YOUR CODE HERE ###

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(".", compute_target= aml_compute, entry_script = "train.py") ### YOUR CODE HERE ###

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
                            estimator = est,
                            hyperparameter_sampling = ps, 
                            policy = policy, 
                            primary_metric_name = 'Accuracy',
                            primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                            max_total_runs = 12,
                            max_concurrent_runs = 4,)



In [28]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
run_hyperdrive = exp.submit(config = hyperdrive_config)



In [29]:
RunDetails(run_hyperdrive).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [30]:
import joblib

# Get your best run and save the model from that run.
best_run = run_hyperdrive.get_best_run_by_primary_metric()

print(best_run.get_details()['runDefinition']['arguments'])

['--C', '0.5726483468191769', '--max_iter', '500']


In [31]:
type(best_run)

azureml.core.run.Run

In [32]:
model = best_run.register_model(model_name = 'bankmarketing-logistic-regresion', model_path = 'outputs/bankmarketing-logistic-model.joblib')

model.download(target_dir = 'outputs', exist_ok = True)

'outputs/bankmarketing-logistic-model.joblib'

## Evaluating the model

In [33]:
from train import clean_data
from azureml.data.dataset_factory import TabularDatasetFactory

# Using the test set from the bankmarketing data
factory = TabularDatasetFactory()
test_data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"
test_ds = factory.from_delimited_files(test_data_url)

X_test, y_test = clean_data(test_ds)

logistic_model = joblib.load('outputs/bankmarketing-logistic-model.joblib')

print(logistic_model.score(X_test, y_test))

0.9109223300970873


The sklearn.linear_model.logistic module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.linear_model. Anything that cannot be imported from sklearn.linear_model is now part of the private API.
Trying to unpickle estimator LogisticRegression from version 0.20.3 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.


## AutoML

In [34]:
exp_automl = Experiment(workspace=ws, name="udacity-project-1-automl")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: quick-starts-ws-133534
Azure region: southcentralus
Resource group: aml-quickstarts-133534


In [35]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
datastore = ws.get_default_datastore()
factory = TabularDatasetFactory()
data_path_train = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
data_path_valid = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"


ds_train = factory.from_delimited_files(data_path_train)
ds_valid = factory.from_delimited_files(data_path_valid)


In [None]:
#from train import clean_data

# Use the clean_data function to clean your data.
#x, y = clean_data(ds_train)

In [39]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    compute_target = aml_compute,
    experiment_exit_score = 0.93,
    blocked_models = ['KNN', 'LinearSVM'],
    primary_metric='accuracy',
    training_data=ds_train,
    label_column_name='y',
    validation_data = ds_valid,
    featurization = 'auto')
    #n_cross_validations=2)

In [40]:
# Submit your automl run

remote_run = exp_automl.submit(automl_config, show_output = False)

Running on remote.


In [41]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [42]:
# Retrieve and save your best automl model.
best_run_aml, fitted_model_aml = remote_run.get_output()

model_name = best_run_aml.properties['model_name']

In [43]:
best_run_aml.download_file('outputs/model.pkl', 'outputs/bankmarketing-aml-best-model.pkl')

## Evaluating the model locally with test set

In [44]:
import pickle

f = open('outputs/bankmarketing-aml-best-model.pkl', 'rb')
aml_model = pickle.load(f)
f.close()

In [52]:
ds_train.to_pandas_dataframe()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.80,92.89,-46.20,1.30,5099.10,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.10,93.99,-36.40,4.86,5191.00,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.80,92.89,-46.20,1.31,5099.10,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.40,94.47,-41.80,4.97,5228.10,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.40,93.92,-42.70,4.96,5228.10,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,56,housemaid,married,basic.4y,no,no,yes,cellular,jul,mon,...,1,999,0,nonexistent,1.40,93.92,-42.70,4.96,5228.10,no
32946,37,management,married,university.degree,no,no,yes,cellular,jul,fri,...,7,999,0,nonexistent,1.40,93.92,-42.70,4.96,5228.10,no
32947,26,admin.,single,university.degree,no,no,no,cellular,may,tue,...,4,999,1,failure,-1.80,92.89,-46.20,1.27,5099.10,no
32948,31,blue-collar,single,basic.9y,no,no,no,cellular,apr,mon,...,1,999,0,nonexistent,-1.80,93.08,-47.10,1.41,5099.10,no


In [58]:
data_path_test = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"

ds_test = factory.from_delimited_files(data_path_test)

df_test = ds_test.to_pandas_dataframe()

X_test = df_test.drop(['y'], axis = 1)
y_test = df_test['y']

In [62]:
from sklearn.metrics import accuracy_score

print(accuracy_score(aml_model.predict(X_test), y_test))

0.9189320388349514


## Clean up

In [63]:
try:
    aml_compute_delete()
    print('Computetarget deleted')
except:
    print('Computetarget not found')

NameError: name 'ComputeTargetException' is not defined