# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [21]:
# import dependencies
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
#from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
#from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.19.0


In [22]:
# initialise workspace
ws = Workspace.from_config() # using from_config file downloaded from Azure ML Studio workspace
#exp = Experiment(workspace=ws, name="capstone-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

#run = exp.start_logging()

Workspace name: quick-starts-ws-131472
Azure region: southcentralus
Subscription id: d4ad7261-832d-46b2-b093-22156001df5b
Resource group: aml-quickstarts-131472


In [23]:
# create an AutoML experiment
# choose a name for experiment
experiment_name = 'ctg_experiment'
project_folder = './automlproject'

# create the experiment
experiment=Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
ctg_experiment,quick-starts-ws-131472,Link to Azure Machine Learning studio,Link to Documentation


In [24]:
# create or attach an existing compute cluster
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# set name of CPU cluster
cpu_cluster_name = "automl-cluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, using it.')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# wait for completion
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, using it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

### Overview

This project uses the cardiotocogram data set available at the UCI ML website.
The dataset consists of measurements of fetal heart rate (FHR) and uterine contraction (UC) features on cardiotocograms classified by expert obstetricians. It consists of 2126 fetal cardiotocograms (CTGs) with 23 attributes.

The CTGs were automatically processed and the respective diagnostic features measured and classified by three expert obstetricians and a consensus classification label assigned to each of them. Classification was both with respect to a morphologic pattern (A, B, C, ..., J) and to a fetal state (N, S, P). Therefore the dataset can be used either for 10-class or 3-class experiments. 

In this work, the 3-class label is used to classify a foetus as Normal, Suspect or Pathological based on the cardiotocogram measurements.



In [26]:
# import data external to AzureML
from azureml.data.dataset_factory import TabularDatasetFactory

# Try to load the dataset from the Workspace. Otherwise, create it from the file

found = False
key = "ctg-data"
description_text = "Cardiocotogram dataset for fetal state classification Udacity Capstone Project"

if key in ws.datasets.keys(): 
        found = True
        print('found dataset in workspace')
        ds = ws.datasets[key] 

if not found:
    # Create TabularDataset using TabularDatasetFactory
    url = "https://raw.githubusercontent.com/ijeendu/Machine_Learning_in_Microsoft_Azure/main/CTG.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=url)
    
    # register dataset in workspace
    ds = ds.register(workspace=ws,
                    name=key,
                    description=description_text)

# view dataset
ds.to_pandas_dataframe().head()

found dataset in workspace


Unnamed: 0,FileName,Date,SegFile,b,e,LBE,LB,AC,FM,UC,...,C,D,E,AD,DE,LD,FS,SUSP,CLASS,NSP
0,Variab10.txt,1996-12-01,CTG0001.txt,240,357,120,120,0,0,0,...,0,0,0,0,0,0,1,0,9,2
1,Fmcs_1.txt,1996-05-03,CTG0002.txt,5,632,132,132,4,0,4,...,0,0,0,1,0,0,0,0,6,1
2,Fmcs_1.txt,1996-05-03,CTG0003.txt,177,779,133,133,2,0,5,...,0,0,0,1,0,0,0,0,6,1
3,Fmcs_1.txt,1996-05-03,CTG0004.txt,411,1192,134,134,2,0,6,...,0,0,0,1,0,0,0,0,6,1
4,Fmcs_1.txt,1996-05-03,CTG0005.txt,533,1147,132,132,4,0,5,...,0,0,0,0,0,0,0,0,2,1


In [27]:
# explore dataframe
df = ds.to_pandas_dataframe()
df.describe()

Unnamed: 0,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,...,C,D,E,AD,DE,LD,FS,SUSP,CLASS,NSP
count,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,...,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0,2126.0
mean,878.44,1702.88,133.3,133.3,2.72,7.24,3.66,46.99,1.33,9.85,...,0.02,0.04,0.03,0.16,0.12,0.05,0.03,0.09,4.51,1.3
std,894.08,930.92,9.84,9.84,3.56,37.13,2.85,17.19,0.88,18.4,...,0.16,0.19,0.18,0.36,0.32,0.22,0.18,0.29,3.03,0.61
min,0.0,287.0,106.0,106.0,0.0,0.0,0.0,12.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,55.0,1009.0,126.0,126.0,0.0,0.0,1.0,32.0,0.7,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
50%,538.0,1241.0,133.0,133.0,1.0,0.0,3.0,49.0,1.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
75%,1521.0,2434.75,140.0,140.0,4.0,2.0,5.0,61.0,1.7,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0
max,3296.0,3599.0,160.0,160.0,26.0,564.0,23.0,87.0,7.0,91.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0,3.0


In [28]:
# prepare dataset for model training
#import sys
#sys.path.insert(0,./training)

from train import clean_data
from sklearn.model_selection import train_test_split
import pandas as pd

# Clean dataset using the clean_data function
x, y = clean_data(ds)

# split dataset into train and test sets
(x_train, x_test, y_train, y_test) = train_test_split(x, y, test_size= 0.3, random_state = 0)
label = 'NSP'

# merge the output x and y dataframes into a single table for AutoML experiment
train_data_df = pd.concat([x_train, y_train], axis=1)

train_data_df.head()

Unnamed: 0,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,...,C,D,E,AD,DE,LD,FS,SUSP,CLASS,NSP
994,44,831,129,129,3,0,1,27,1.2,1,...,0,0,0,0,0,0,0,0,2,1
1618,848,2108,142,142,4,63,3,66,3.3,0,...,0,0,0,1,0,0,0,0,6,1
177,0,1033,119,119,8,0,5,30,1.8,0,...,0,0,0,0,0,0,0,0,2,1
519,25,679,126,126,5,26,3,42,1.5,0,...,0,0,0,0,0,0,0,0,2,1
745,242,1014,142,142,1,0,4,23,3.6,0,...,0,0,0,0,1,0,0,0,7,1


In [8]:
# save training data in tabular format to allow for remote run
if not os.path.isdir('data'):  # create data folder if it does not exist
    os.mkdir('data')
    
if not os.path.exists('project_folder'):  # create project folder if it does not exist
    os.makedirs('project_folder')

# Save the train data to a csv file to be uploaded to the datastore
pd.DataFrame(train_data_df).to_csv("data/train_data.csv", index=False)

# Upload the training data as a tabular dataset for access during training on remote compute
# upload to data store
ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='automlclassifier', overwrite=True, show_progress=True)

 # access datastore during training on remote compute
train_data = TabularDatasetFactory.from_delimited_files(path=ds.path('automlclassifier/train_data.csv'))


Uploading an estimated of 1 files
Uploading ./data/train_data.csv
Uploaded ./data/train_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In [29]:
# Set parameters for AutoMLConfig

# define autoconfig settings
automl_settings = {
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,    
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

# define automl autconfig parameters
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task= 'classification',
    primary_metric='accuracy',
    enable_onnx_compatible_models=True,
    compute_target=cpu_cluster, # included to allow for remote compute
    training_data= train_data,
    label_column_name= label,
    path = project_folder,
    n_cross_validations=3,
    debug_log = "automl_errors.log",    
    **automl_settings)

In [30]:
#Submit experiment for remote run
automl_run = experiment.submit(automl_config, show_output = True)

Running on remote.
No run_configuration provided, running on automl-cluster with default configuration
Running on remote compute: automl-cluster
Parent Run ID: AutoML_54e35876-e6aa-4f8b-adc1-8e0f88d704e6

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+-------------------

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [31]:
# show run details
from azureml.widgets import RunDetails
RunDetails(automl_run).show()

# wait for completion
automl_run.wait_for_completion(show_output=True)




****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|118                              |3                                |1488                                  |
+---------------------------------+---------------------------------+--------------------------------------+


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more about high cardinality feature handling: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION 

{'runId': 'AutoML_54e35876-e6aa-4f8b-adc1-8e0f88d704e6',
 'target': 'automl-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-12-23T14:22:54.023326Z',
 'endTimeUtc': '2020-12-23T14:47:19.65492Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'automl-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"d2a991cc-14e4-4850-a196-51ba69f5120b\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"automlclassifier/train_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-131472\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"d4ad7261-832d-46b2-b093-22156001df5b\\\

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [37]:
# Retrieve the best automl run model
best_automl_run, fitted_automl_model = automl_run.get_output()
print(best_automl_run)

# get best model and display properties
model_name = best_automl_run.properties['model_name']
print("Best_model name:", model_name)

# display all the properties of the best model
#best_automl_run #.properties
best_automl_run.get_properties()

Run(Experiment: ctg_experiment,
Id: AutoML_54e35876-e6aa-4f8b-adc1-8e0f88d704e6_36,
Type: azureml.scriptrun,
Status: Completed)
Best_model name: AutoML54e35876e36


{'runTemplate': 'automl_child',
 'pipeline_id': '__AutoML_Ensemble__',
 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'ctg_experiment\',\'compute_target\':\'automl-cluster\',\'subscription_id\':\'d4ad7261-832d-46b2-b093-22156001df5b\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_54e35876-e6aa-4f8b-adc1-8e0f88d704e6_36","experiment_name":"ctg_experiment","workspace_name":"quick-starts-ws-131472","subscription_id":"d4ad7261-832d-46b2-b093-22156001df5b","resource_group_name":"aml-quickstarts-131472"}}]}',
 'training_percent': '100',
 'predicted_cost': None,
 'iteration': '36',
 '_aml_system_scenario_identification': 'Remote.Child',
 '_azureml.ComputeTarge

In [38]:
#TODO: Save the best model
#import joblib
# create an output folder
#os.makedirs('outputs', exist_ok=True)
#joblib.dump(model,'outputs/model.joblib')

# register best autoML model for future deployment
os.makedirs('outputs', exist_ok=True)

description = 'AutoML Model trained on cardiocotogram data to predict fetal state as either Normal, Suspect or Pathologic'
tags = {'area': 'neonatal health', 'type': 'classification'}
automl_model = automl_run.register_model(model_name = 'best-model', description = description, tags = tags)

print("AutoML RunID: ", automl_run.id, sep='\t')

AutoML RunID: 	AutoML_54e35876-e6aa-4f8b-adc1-8e0f88d704e6


## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [40]:
# deploy model as a web service
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core import Environment
from azureml.core.model import Model

service_name = 'my-ctg-service'
env = Environment.get(workspace=ws, name="AzureML-Tutorial")  
env.python.conda_dependencies.add_pip_package("scikit-learn")

inference_config = InferenceConfig(entry_script='score.py', environment=env)
aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=4, enable_app_insights=True, auth_enabled=True)

model = Model(ws,name='best-model')
service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=aci_config,
                       overwrite=True)

service.wait_for_deployment(show_output=True)


Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running..........
Failed


ERROR:azureml.core.webservice.webservice:Service deployment polling reached non-successful terminal state, current service state: Failed
Operation ID: 35592cd3-3880-4e29-aa1e-5ad26f2f9b09
More information can be found using '.get_logs()'
Error:
{
  "code": "ContainerGroupTransitioning",
  "statusCode": 429,
  "message": "ACI Service request failed. Reason: The container group 'my-ctg-service-ZKwOVYnDaE_N3v8JuFRp8A' is still transitioning, please retry later.."
}



WebserviceException: WebserviceException:
	Message: Service deployment polling reached non-successful terminal state, current service state: Failed
Operation ID: 35592cd3-3880-4e29-aa1e-5ad26f2f9b09
More information can be found using '.get_logs()'
Error:
{
  "code": "ContainerGroupTransitioning",
  "statusCode": 429,
  "message": "ACI Service request failed. Reason: The container group 'my-ctg-service-ZKwOVYnDaE_N3v8JuFRp8A' is still transitioning, please retry later.."
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Service deployment polling reached non-successful terminal state, current service state: Failed\nOperation ID: 35592cd3-3880-4e29-aa1e-5ad26f2f9b09\nMore information can be found using '.get_logs()'\nError:\n{\n  \"code\": \"ContainerGroupTransitioning\",\n  \"statusCode\": 429,\n  \"message\": \"ACI Service request failed. Reason: The container group 'my-ctg-service-ZKwOVYnDaE_N3v8JuFRp8A' is still transitioning, please retry later..\"\n}"
    }
}

In [41]:
print(service.get_logs())

None


In [42]:
print(service.state)
print("scoring URI: " + service.scoring_uri)

print("Swagger URI: " + service.swagger_uri)

print("Authetication Key: " + service.get_keys())

Failed


TypeError: must be str, not NoneType

TODO: In the cell below, send a request to the web service you deployed to test it.

In [None]:
import json

input_payload = json.dumps({
    'data': dataset_x[0:2].tolist(),
    'method': 'predict'  # If you have a classification model, you can get probabilities by changing this to 'predict_proba'.
})

output = service.run(input_payload)

print(output)

TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
# print logs
from azureml.core import Workspace
from azureml.core.webservice import Webservice

# Requires the config to be downloaded first to the current working directory
ws = Workspace.from_config()

# Set with the deployment name
name = "my-ctg-service"

# load existing web service
service = Webservice(name=name, workspace=ws)
logs = service.get_logs()

for line in logs.split('\n'):
    print(line)


In [None]:
#delete service
service.delete()