# Automated ML

Import Dependencies.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Report the installed Azure ML core SDK version so the environment this
# notebook ran against is recorded alongside its outputs.
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


In [2]:
!pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (0.7.0)


In [3]:

from azureml.core import Workspace, Experiment

# Load the workspace through Workspace.from_config() and open the experiment
# that the pipeline is later submitted to.
ws = Workspace.from_config()
experiment = Experiment(name="creditcardfraud", workspace=ws)

# Summarise where we are connected, one property per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# NOTE(review): `run` is not referenced again in the visible cells and is never
# completed there; confirm this interactive run is still needed.
run = experiment.start_logging()

Workspace name: quick-starts-ws-136268
Azure region: southcentralus
Subscription id: 2c48c51c-bd47-40d4-abbe-fb8eabd19c8c
Resource group: aml-quickstarts-136268


## Dataset

### Overview
Cardiovascular diseases (CVDs) are the number 1 cause of death globally, taking an estimated 17.9 million lives each year, which accounts for 31% of all deaths worldwide.
Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.

The dataset was downloaded from [Kaggle](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data) and a copy of the dataset was saved in the [git repository](https://github.com/heber-augusto/Nanodegree_Azure_ML_Engineer_CapstoneProject/tree/master/data).


In [18]:
# Reuse the dataset registered in the workspace under `key` when it exists;
# otherwise create it from the copy stored in the project repository and
# register it so later runs can find it.
key = "creditcard"
description_text = "Creditcard dataset"
label = "Class"

found = key in ws.datasets.keys()

if found:
    dataset = ws.datasets[key]
else:
    print("download data from https://www.kaggle.com/mlg-ulb/creditcardfraud/download and create a dataset")
    example_data = 'https://raw.githubusercontent.com/heber-augusto/Nanodegree_Azure_ML_Engineer_CapstoneProject/master/data/creditcard.csv.zip'
    # NOTE(review): the source path is a .zip archive; confirm that
    # from_delimited_files can read it directly.
    dataset = Dataset.Tabular.from_delimited_files(example_data)
    # Register the new dataset in the workspace under `key`.
    dataset = dataset.register(
        workspace=ws,
        name=key,
        description=description_text,
    )

# Materialise the dataset locally and show summary statistics.
df = dataset.to_pandas_dataframe()
df.describe()

In [6]:
# Preview the first five records of the dataset as a pandas DataFrame.
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Create cluster to run AutoML

In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the AmlCompute cluster to reuse, or to create when it is missing.
cluster_name = "cpu-cluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No cluster with that name yet: provision one.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        min_nodes=1,
        max_nodes=6,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing compute target')

# Block until provisioning finishes. With min_node_count=None the cluster's own
# scale settings decide how many nodes to wait for; give up after 20 minutes.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=20,
)

# Print the detailed status of the cluster.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing compute target
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 1, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-01-27T14:14:24.576000+00:00', 'errors': None, 'creationTime': '2021-01-27T14:12:22.515079+00:00', 'modifiedTime': '2021-01-27T14:12:37.951943+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 1, 'maxNodeCount': 6, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [8]:
import os

# Local working folder; it is passed to AutoMLConfig as `path` further down.
project_folder = './fraud-detection'
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(project_folder, exist_ok=True)

### Split dataset into train and test

In [14]:
from sklearn.model_selection import train_test_split
import pandas as pd
from azureml.core.dataset import Dataset
from train import clean_data
from imblearn.over_sampling import SMOTE


# Separate features and label.
# NOTE(review): the saved output of this cell is `KeyError: 'DEATH_EVENT'`,
# while the label configured for this notebook is "Class". Confirm that
# train.clean_data targets the credit-card dataset rather than the
# heart-failure one.
x, y = clean_data(dataset)

# Hold out 33% of the rows for testing; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42)

# Oversample the training split only, so the test rows stay untouched.
# SMOTE draws random synthetic samples, so it is seeded as well; without a
# seed the training set (and every downstream result) changes on each run.
oversample = SMOTE(random_state=42)
x_train, y_train = oversample.fit_resample(x_train, y_train)

# Re-attach the label column to the features for each split.
training_data = pd.concat([x_train, y_train], axis=1)
testing_data = pd.concat([x_test, y_test], axis=1)

# Create the local staging folder; a no-op when it already exists.
os.makedirs('data', exist_ok=True)

# Save both splits as CSV so they can be uploaded to the datastore.
training_data.to_csv("data/train_data.csv", index=False)
testing_data.to_csv("data/test_data.csv", index=False)

# Upload the staging folder to the workspace's default datastore.
ds = ws.get_default_datastore()
ds.upload(
    src_dir='./data',
    target_path='fraud-detection',
    overwrite=True,
    show_progress=True)

# Expose the uploaded CSVs as tabular datasets for use on remote compute.
train_data = Dataset.Tabular.from_delimited_files(path=ds.path('fraud-detection/train_data.csv'))

test_data = Dataset.Tabular.from_delimited_files(path=ds.path('fraud-detection/test_data.csv'))


KeyError: 'DEATH_EVENT'

## AutoML Configuration

The experiment timeout was set to 1h to avoid losing work inside the Udacity workspace (which has a time limit).

The max concurrent iterations was set to 5 because it has to be less than the max nodes of the cluster (which is 6).

Accuracy was set as the primary metric so the results can be compared with the HyperDrive run.


In [34]:
# Run-level AutoML settings, expanded into the AutoMLConfig call below.
automl_settings = dict(
    experiment_timeout_hours=1,
    max_concurrent_iterations=5,
    primary_metric='accuracy',
)

# Classification task on the uploaded training split, run on the cluster.
automl_config = AutoMLConfig(
    task='classification',
    label_column_name=label,
    training_data=train_data,
    # NOTE(review): test_data is built from the same x_test/y_test split that
    # the confusion-matrix cell scores the best model on, so that evaluation
    # is not on data unseen during model selection.
    validation_data=test_data,
    compute_target=compute_target,
    path=project_folder,
    featurization='auto',
    enable_early_stopping=True,
    enable_onnx_compatible_models=True,
    debug_log="automl_errors.log",
    **automl_settings)




In [35]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()

# Names under which the two step outputs are exposed on the pipeline run;
# later cells pass them to pipeline_run.get_pipeline_output().
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

# Metrics produced by the AutoML training step.
metrics_data = PipelineData(
    name='metrics_data',
    datastore=ds,
    pipeline_output_name=metrics_output_name,
    training_output=TrainingOutput(type='Metrics'),
)

# Model produced by the AutoML training step.
model_data = PipelineData(
    name='model_data',
    datastore=ds,
    pipeline_output_name=best_model_output_name,
    training_output=TrainingOutput(type='Model'),
)

In [36]:
# Wrap the AutoML configuration in a pipeline step that emits the metrics and
# model outputs declared above.
step_outputs = [metrics_data, model_data]
automl_step = AutoMLStep(
    name='automl_module',
    automl_config=automl_config,
    allow_reuse=True,
    outputs=step_outputs,
)

In [37]:
from azureml.pipeline.core import Pipeline

# Single-step pipeline containing only the AutoML step.
pipeline = Pipeline(
    workspace=ws,
    steps=[automl_step],
    description="pipeline_with_automlstep",
)

In [39]:
# Submit the pipeline to the experiment; the returned run object is used by the
# following cells to monitor progress and fetch outputs.
pipeline_run = experiment.submit(pipeline)

Submitted PipelineRun 3b713be1-92a9-467e-98ce-582012878c33
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/heart_failure/runs/3b713be1-92a9-467e-98ce-582012878c33?wsid=/subscriptions/48a74bb7-9950-4cc1-9caa-5d50f995cc55/resourcegroups/aml-quickstarts-135475/workspaces/quick-starts-ws-135475


## Run Details

In [40]:
from azureml.widgets import RunDetails
# Show the run-details widget for the submitted pipeline run.
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [42]:
# Block until the pipeline run finishes; the cells below read its outputs.
pipeline_run.wait_for_completion()

PipelineRunId: 3b713be1-92a9-467e-98ce-582012878c33
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/heart_failure/runs/3b713be1-92a9-467e-98ce-582012878c33?wsid=/subscriptions/48a74bb7-9950-4cc1-9caa-5d50f995cc55/resourcegroups/aml-quickstarts-135475/workspaces/quick-starts-ws-135475
PipelineRun Status: Running


StepRunId: 80364351-4c72-42da-8b62-cca066a658b6
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/heart_failure/runs/80364351-4c72-42da-8b62-cca066a658b6?wsid=/subscriptions/48a74bb7-9950-4cc1-9caa-5d50f995cc55/resourcegroups/aml-quickstarts-135475/workspaces/quick-starts-ws-135475
StepRun( automl_module ) Status: Running


## Best Model

Get the best model from the automl experiments and display all the properties of the model.



In [48]:
# Fetch the metrics output of the pipeline run by its output name and download
# it into the current directory.
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

Downloading azureml/2e966f0e-0ca1-408d-97b4-f8f8ec49ad90/metrics_data
Downloaded azureml/2e966f0e-0ca1-408d-97b4-f8f8ec49ad90/metrics_data, 1 files out of an estimated total of 1


In [49]:

import json

# The download in the previous cell writes the file to this relative path.
metrics_path = metrics_output._path_on_datastore
with open(metrics_path) as f:
    metrics_output_result = f.read()

# Parse the JSON payload and tabulate it.
# Note: this rebinds `df`, which earlier held the full dataset.
deserialized_metrics_output = json.loads(metrics_output_result)
df = pd.DataFrame(data=deserialized_metrics_output)
df

Unnamed: 0,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_5,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_15,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_16,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_18,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_23,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_35,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_32,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_6,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_3,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_10,...,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_27,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_30,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_17,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_1,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_20,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_25,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_24,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_19,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_21,2e966f0e-0ca1-408d-97b4-f8f8ec49ad90_33
weighted_accuracy,[0.7582286056253739],[0.7558348294434472],[0.7785757031717535],[0.7384799521244765],[0.800718132854578],[0.6193895870736085],[0.8067025733093955],[0.7923399162178336],[0.786953919808498],[0.7588270496708558],...,[0.8150807899461401],[0.6481149012567324],[0.7983243566726511],[0.7779772591262717],[0.7540394973070018],[0.7761819269898265],[0.7779772591262718],[0.7289048473967685],[0.7929383602633154],[0.7893476959904249]
precision_score_macro,[0.7406716417910448],[0.7430555555555556],[0.778108108108108],[0.7175839552238805],[0.7934782608695652],[0.4516129032258065],[0.8121707060063224],[0.7852216748768472],[0.78609062170706],[0.7513513513513514],...,[0.8194444444444444],[0.2878787878787879],[0.8048648648648649],[0.7637593283582089],[0.8],[0.7981012658227848],[0.7637593283582089],[0.7801724137931034],[0.8116883116883117],[0.7782258064516129]
f1_score_weighted,[0.7302184882830044],[0.7134846952844931],[0.7318231880155089],[0.709466064304774],[0.7698412698412698],[0.4422222222222222],[0.7653092006033183],[0.7582982985001394],[0.7439736733854382],[0.7103690430567495],...,[0.7771547629990501],[0.4207459207459208],[0.753277332974268],[0.7509709122612348],[0.665739138303528],[0.7120205853602437],[0.7509709122612348],[0.6206509539842873],[0.7377883310719132],[0.760395184723952]
recall_score_macro,[0.7155388471177945],[0.6973684210526316],[0.7149122807017544],[0.6948621553884712],[0.7537593984962405],[0.4887218045112782],[0.7474937343358397],[0.7418546365914787],[0.7268170426065164],[0.694235588972431],...,[0.7593984962406015],[0.5],[0.7355889724310777],[0.7362155388471178],[0.6578947368421052],[0.6967418546365914],[0.7362155388471178],[0.6221804511278195],[0.7205513784461153],[0.7449874686716792]
accuracy,[0.7373737373737373],[0.7272727272727273],[0.7474747474747475],[0.7171717171717171],[0.7777777777777778],[0.5555555555555556],[0.7777777777777778],[0.7676767676767676],[0.7575757575757576],[0.7272727272727273],...,[0.7878787878787878],[0.5757575757575758],[0.7676767676767676],[0.7575757575757576],[0.7070707070707071],[0.7373737373737373],[0.7575757575757576],[0.6767676767676768],[0.7575757575757576],[0.7676767676767676]
average_precision_score_micro,[0.8431052591353935],[0.8308158967821206],[0.8486397872237088],[0.8073153403172525],[0.8502207628671725],[0.5590552858779526],[0.8582085684669625],[0.8806623627003037],[0.8141639322257993],[0.7711253382296628],...,[0.8761097168893048],[0.5436179981634528],[0.8664609973825761],[0.8852502995906664],[0.8074807055432183],[0.7403968011211306],[0.7755781216736037],[0.8016523365882712],[0.839420284284454],[0.8680613735919829]
log_loss,[0.6908732155300774],[0.5288721182213216],[0.49940617263519893],[0.5702258400052395],[0.5891119714328706],[0.763112383415178],[0.47473929536675824],[0.4551879179518784],[0.5326260159471139],[0.5903168342919255],...,[0.47700599698031065],[0.7300929715928547],[0.46832995932441845],[0.5402727353471508],[0.5582939382152808],[0.5774499621517114],[0.538263089055449],[0.5730459903474117],[0.5108737130598923],[0.49533962684805416]
precision_score_weighted,[0.7392582541836273],[0.7377946127946127],[0.7687305487305487],[0.7174072817729534],[0.7874396135265701],[0.46953405017921146],[0.8011942395504039],[0.7788028063890133],[0.7769901331545167],[0.743980343980344],...,[0.8089225589225588],[0.3314967860422406],[0.7934807534807534],[0.7611092265943011],[0.7797979797979798],[0.7826620636747219],[0.7611092265943011],[0.7594914663880181],[0.7969303423848878],[0.7739491691104595]
AUC_macro,[0.8441938178780284],[0.8373015873015872],[0.8859649122807018],[0.8114035087719298],[0.8583959899749374],[0.5041771094402674],[0.8805346700083543],[0.8951545530492899],[0.8429406850459482],[0.7769423558897244],...,[0.8836675020885547],[0.5],[0.9010025062656641],[0.8897243107769424],[0.8734335839598997],[0.8032581453634084],[0.8114035087719298],[0.87468671679198],[0.8876357560568087],[0.8609022556390977]
precision_score_micro,[0.7373737373737373],[0.7272727272727273],[0.7474747474747475],[0.7171717171717171],[0.7777777777777778],[0.5555555555555556],[0.7777777777777778],[0.7676767676767676],[0.7575757575757576],[0.7272727272727273],...,[0.7878787878787878],[0.5757575757575758],[0.7676767676767676],[0.7575757575757576],[0.7070707070707071],[0.7373737373737373],[0.7575757575757576],[0.6767676767676768],[0.7575757575757576],[0.7676767676767676]


In [50]:
# Retrieve best model from Pipeline Run
best_model_output = pipeline_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model_output.download('.', show_progress=True)

Downloading azureml/2e966f0e-0ca1-408d-97b4-f8f8ec49ad90/model_data
Downloaded azureml/2e966f0e-0ca1-408d-97b4-f8f8ec49ad90/model_data, 1 files out of an estimated total of 1


In [51]:
import pickle

# Deserialise the model file downloaded in the previous cell.
# SECURITY: pickle.load executes arbitrary code from the file it reads; only
# load model files that come from a pipeline run you trust.
with open(best_model_output._path_on_datastore, "rb" ) as f:
    best_model = pickle.load(f)
# Display the loaded model.
best_model

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               objective='reg:logistic',
                                                                                               random_state=0,
                                                                                               reg_alpha=0,
                                                    

In [52]:

from sklearn.metrics import confusion_matrix
import pandas as pd

# Predict on the test split with the downloaded model and tabulate the
# confusion matrix of true vs. predicted labels.
ypred = best_model.predict(x_test)
cm = confusion_matrix(y_true=y_test, y_pred=ypred)

# Render the matrix as a shaded table.
cm_frame = pd.DataFrame(cm)
cm_frame.style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,54,3
1,15,27


## Model Deployment

Remember you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

TODO: In the cell below, send a request to the web service you deployed to test it.

TODO: In the cell below, print the logs of the web service and delete the service