In [90]:
import pandas as pd
import logging

import os
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.compute import AksCompute, AmlCompute, ComputeTarget
from azureml.core import Datastore
from azureml.core.runconfig import DataReferenceConfiguration
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Print the installed Azure ML core SDK version so the notebook output records
# which SDK release the cells below were executed against.
print("SDK version:", azureml.core.VERSION)

SDK version: 1.0.41


In [91]:
# Workspace coordinates used to connect to Azure ML.
# Each value can be overridden with an environment variable so the notebook can
# be pointed at a different subscription/workspace without editing (or
# committing) account-specific identifiers. The literals are the defaults this
# notebook was originally run with.
subscription_id = os.getenv("SUBSCRIPTION_ID", default="36cfc6d6-79ca-4642-b263-93d6eaa4a823")
resource_group = os.getenv("RESOURCE_GROUP", default="demo-aml-use")
workspace_name = os.getenv("WORKSPACE_NAME", default="demoaml")
workspace_region = os.getenv("WORKSPACE_REGION", default="eastus")

In [92]:
# Connect to the existing workspace and persist its details to a config file
# so that later cells can simply call Workspace.from_config().
try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded. Skip the workspace creation steps below")
except Exception as err:
    # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts still
    # propagate, and report the cause instead of hiding it.
    print("Workspace not accessible. Change your parameters or create a new workspace below")
    print("Reason:", err)

Workspace configuration succeeded. Skip the workspace creation steps below


In [93]:
# Rebuild the workspace handle from the config.json saved in the current
# folder, then echo its name, location and resource group (tab separated).
ws = Workspace.from_config()
print(f"{ws.name}\t{ws.location}\t{ws.resource_group}")

demoaml	eastus	demo-aml-use


In [94]:
# Names shared by the cells below.
amlcompute_cluster_name = "traincluster"        # training cluster
experiment_name = "diabetes-classification"     # Azure ML experiment
project_folder = "./project-temp-files"         # local folder for generated scripts

image_name = "diabclassprob"                    # container image for deployment

In [95]:
# Reuse the training cluster when the workspace already holds an AmlCompute
# target under this name; otherwise provision a new one.
cts = ws.compute_targets
found = amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute'

if found:
    print('Found existing compute target.')
    compute_target = cts[amlcompute_cluster_name]
else:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        max_nodes=2,
    )

    # Create the cluster.
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)

    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min_node_count is provided, it will use the scale settings for the cluster.
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status().

Found existing compute target.


In [96]:
# Create (or attach to) the experiment and display a one-column summary of the
# settings this notebook is running with.
experiment = Experiment(ws, experiment_name)

output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace Name': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
    'Experiment Name': experiment.name,
}

# Show full cell contents without truncation. pandas >= 1.0 expects None for
# "no limit" (and pandas >= 2.0 rejects -1), while older releases only accept
# the legacy -1 sentinel; both rejections surface as ValueError.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

outputDf = pd.DataFrame(data = output, index = [''])
outputDf.T

Unnamed: 0,Unnamed: 1
SDK version,1.0.41
Subscription ID,36cfc6d6-79ca-4642-b263-93d6eaa4a823
Workspace Name,demoaml
Resource Group,demo-aml-use
Location,eastus
Project Directory,./project-temp-files
Experiment Name,diabetes-classification


In [97]:
# Locate the CSV under ./data (relative to the current working directory),
# echo both paths, and preview the first rows.
data_folder = os.path.join(os.getcwd(), 'data')
data_file = os.path.join(data_folder, 'diabetes_classification_dataset.csv')
print(data_folder, data_file, sep='\n')

df = pd.read_csv(data_file)
df.head()

/data/home/isinghrana/notebooks/mysamples/azuremachinelearning/diabetes-classification/data
/data/home/isinghrana/notebooks/mysamples/azuremachinelearning/diabetes-classification/data/diabetes_classification_dataset.csv


Unnamed: 0,pregnancies,plasma glucose,blood pressure,triceps skin thickness,insulin,bmi,diabetes pedigree,age,diabetes
0,6,148,72,35,0,33.6,0.63,50,1
1,1,85,66,29,0,26.6,0.35,31,0
2,8,183,64,0,0,23.3,0.67,32,1
3,1,89,66,23,94,28.1,0.17,21,0
4,0,137,40,35,168,43.1,2.29,33,1


In [98]:
# Upload the local data folder to the workspace's default datastore.
# The target path must be 'diabetes-classification' (hyphen): that is the
# folder the data reference downloads from (path_on_datastore) and the folder
# get_data.py reads from on the compute target. Uploading to
# 'diabetes_classification' (underscore) left the remote run without its data
# on a fresh datastore.
ds = ws.get_default_datastore()
ds.upload(src_dir=data_folder, target_path='diabetes-classification', overwrite=True, show_progress=True)

Uploading /data/home/isinghrana/notebooks/mysamples/azuremachinelearning/diabetes-classification/data/diabetes_classification_dataset.csv
Uploaded /data/home/isinghrana/notebooks/mysamples/azuremachinelearning/diabetes-classification/data/diabetes_classification_dataset.csv, 1 files out of an estimated total of 1


$AZUREML_DATAREFERENCE_13ece8080b2c45c698af71fd5c67e26c

In [99]:
# Describe how the datastore folder is made available on the compute target:
# the 'diabetes-classification' folder is placed under /tmp/azureml_runs.
dr = DataReferenceConfiguration(
    datastore_name=ds.name,
    path_on_datastore='diabetes-classification',
    path_on_compute='/tmp/azureml_runs',
    mode='download',  # download files from datastore to compute target
    overwrite=False,
)

In [102]:
# Python packages the remote run needs.
cd = CondaDependencies.create(
    pip_packages=['azureml-sdk[automl]'],
    conda_packages=['numpy', 'py-xgboost<=0.80'],
)

# Run configuration: execute on the AmlCompute target inside the default CPU
# Docker image, with the data reference and the conda dependencies attached.
conda_run_config = RunConfiguration(framework="python")
conda_run_config.target = compute_target
conda_run_config.environment.docker.enabled = True
conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE

# set the data reference of the run configuration
conda_run_config.data_references = {ds.name: dr}

conda_run_config.environment.python.conda_dependencies = cd

In [103]:
# Create project_folder if it doesn't exist. exist_ok=True makes this a single
# idempotent call and avoids the check-then-create race of testing
# os.path.exists() first.
os.makedirs(project_folder, exist_ok=True)


In [104]:
%%writefile ./project-temp-files/get_data.py
import pandas as pd
import os

def get_data(data_path="/tmp/azureml_runs/diabetes-classification/diabetes_classification_dataset.csv"):
    """Load the diabetes dataset and split it into features and labels.

    Args:
        data_path: CSV file to read. Defaults to the location the dataset has
            on the compute target, so calling ``get_data()`` with no arguments
            behaves exactly as before; pass another path to run locally.

    Returns:
        dict with two entries:
            "X": DataFrame holding every column except "diabetes".
            "y": array of the values of the "diabetes" column.
    """
    df = pd.read_csv(data_path)
    print('after pd.read_csv')
    # get integer labels
    y = df["diabetes"]
    df = df.drop("diabetes", axis=1)
    return { "X" : df, "y" : y.values }

Overwriting ./project-temp-files/get_data.py


In [105]:
# AutoML settings: a classification task scored on weighted AUC, 10 iterations
# (each capped at 10 minutes, at most 2 running concurrently), 5-fold cross
# validation, executed through the run configuration defined above with the
# generated get_data.py as the data script.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    path=project_folder,
    run_configuration=conda_run_config,
    data_script=project_folder + "/get_data.py",
    iteration_timeout_minutes=10,
    iterations=10,
    n_cross_validations=5,
    primary_metric='AUC_weighted',
    preprocess=True,
    max_concurrent_iterations=2,
    verbosity=logging.INFO,
)

In [106]:
# Submit the AutoML configuration to the experiment; show_output=False, so
# progress is followed in the cells below rather than printed here.
remote_run = experiment.submit(automl_config, show_output = False)

In [107]:
# Display the submitted run (experiment, id, type, status and links).
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
diabetes-classification,AutoML_6318397b-0588-4989-b5f6-6e62d0ceecf6,automl,Starting,Link to Azure Portal,Link to Documentation


In [108]:
# Show the run in the notebook widget to follow its progress.
from azureml.widgets import RunDetails
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [109]:
# Wait until the run finishes. show_output=True prints the per-iteration
# table (pipeline, duration, metric, best score so far) while waiting.
remote_run.wait_for_completion(show_output = True)


****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      BEST
         1   StandardScalerWrapper SGD                      0:01:26       0.8275    0.8275
         0   StandardScalerWrapper SGD                      0:00:49       0.8299    0.8299
         3   StandardScalerWrapper SGD                      0:01:11       0.8266    0.8299
         2   MinMaxScaler LightGBM                          0:01:16       0.8301    0.8301
         5   StandardScalerWrapper LightGBM                 0:00:56       0.8368    

{'runId': 'AutoML_6318397b-0588-4989-b5f6-6e62d0ceecf6',
 'target': 'traincluster',
 'status': 'Completed',
 'startTimeUtc': '2019-06-25T19:32:11.581861Z',
 'endTimeUtc': '2019-06-25T19:39:59.773587Z',
 'properties': {'num_iterations': '10',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'MaxTimeSeconds': '600',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'traincluster',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.0.33", "azureml-train": "1.0.41", "azureml-train-restclients-hyperdrive": "1.0.41", "azureml-train-core": "1.0.41", "azureml-train-automl": "1.0.41", "azureml-telemetry": "1.0.41", "azureml-sdk": "1.0.41", "azureml-pipeline": "1.0.41", "azureml-pipeline-steps": "1.0.41", "azureml-pipeline-core": "1.0.41

In [111]:
# Unpack the run and the fitted model returned by get_output().
# NOTE(review): presumably the top-scoring iteration on the primary metric —
# confirm against the SDK documentation.
best_run, fitted_model = remote_run.get_output()

In [112]:
# List the engineered feature names reported by the fitted pipeline's
# 'datatransformer' step.
fitted_model.named_steps['datatransformer'].get_engineered_feature_names()

['pregnancies_MeanImputer',
 'plasma glucose_MeanImputer',
 'blood pressure_MeanImputer',
 'triceps skin thickness_MeanImputer',
 'insulin_MeanImputer',
 'bmi_MeanImputer',
 'diabetes pedigree_MeanImputer',
 'age_MeanImputer']

In [113]:
# Show the per-raw-feature featurization summary from the 'datatransformer'
# step (detected type, dropped flag, engineered feature count, transformations).
fitted_model.named_steps['datatransformer'].get_featurization_summary()

[{'RawFeatureName': 'pregnancies',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Tranformations': ['MeanImputer']},
 {'RawFeatureName': 'plasma glucose',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Tranformations': ['MeanImputer']},
 {'RawFeatureName': 'blood pressure',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Tranformations': ['MeanImputer']},
 {'RawFeatureName': 'triceps skin thickness',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Tranformations': ['MeanImputer']},
 {'RawFeatureName': 'insulin',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Tranformations': ['MeanImputer']},
 {'RawFeatureName': 'bmi',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Tranformations': ['MeanImputer']},
 {'RawFeatureName': 'diabetes pedigree',
  'TypeDetected': 'Numeric',
  'Dropped': 

In [114]:
# Register outputs/model.pkl from the selected run under the name
# 'diabclassmodel'; score.py looks the model up by this same name.
model = best_run.register_model(model_name = 'diabclassmodel', model_path= 'outputs/model.pkl')

In [115]:
%%writefile score.py
# Scoring Script will need model id from registered model
import json
import numpy as np
import os
import pickle
import pandas as pd
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression

from azureml.core.model import Model
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType

import azureml.train.automl

def init():
    """Load the registered model into the module-level ``model`` global.

    The model file is located by its registered name and deserialized with
    joblib; the resolved path is printed for troubleshooting.
    """
    global model
    # retrieve the path to the model file using the model name
    registered_path = Model.get_model_path('diabclassmodel') # update this based on previously registered model
    print(registered_path)
    model = joblib.load(registered_path)

# Example request payload: one row with the eight feature columns.
input_dict = {
    column: [value]
    for column, value in (
        ("pregnancies", 6),
        ("plasma glucose", 148),
        ("blood pressure", 72),
        ("triceps skin thickness", 35),
        ("insulin", 0),
        ("bmi", 33.6),
        ("diabetes pedigree", 0.627),
        ("age", 50),
    )
}

# Example response payload: predicted class and its probability.
output_dict = {"prediction": [1], "probability": [.89]}

# Sample frames consumed by the schema decorators on run().
input_sample = pd.DataFrame(input_dict)
output_sample = pd.DataFrame(output_dict)

@input_schema('data', PandasParameterType(input_sample))
@output_schema(PandasParameterType(output_sample))
def run(data):
    """Score a batch of rows and return the predicted class with its probability.

    Args:
        data: DataFrame of feature rows (same columns as ``input_sample``).

    Returns:
        A list with one ``{"prediction": ..., "probability": ...}`` record per
        input row, or ``["exception", <message>]`` if scoring fails.
    """
    try:
        print('inside the method')

        pred = model.predict(data)
        prob = model.predict_proba(data)

        print(pred)
        print(prob)

        # Build all records first and create the frame once: DataFrame.append
        # in a loop is quadratic and no longer exists in pandas >= 2.0.
        # The probability reported is the one in the column indexed by the
        # predicted label, as before (assumes labels are 0..n-1 integers).
        records = [
            {"prediction": val, "probability": prob[idx][int(val)]}
            for idx, val in enumerate(pred)
        ]
        result_df = pd.DataFrame(records, columns = ["prediction","probability"])

    except Exception as e:
        print("Exception Caught")
        print(str(e))
        return ["exception", str(e)]
    # Do not name this variable `str`: assigning to `str` makes it local to the
    # whole function, so str(e) in the handler above raised UnboundLocalError.
    result_json = result_df.to_json(orient = 'records')
    return json.loads(result_json)

Writing score.py


In [116]:
# Conda environment for the scoring container, saved next to the notebook so
# the image configuration can reference it.
conda_env_file_name = 'mydeployenv.yml'

myenv = CondaDependencies.create(
    conda_packages=['numpy','scikit-learn'],
    pip_packages=['azureml-sdk[automl]', 'inference-schema[numpy-support,pandas-support]'],
)
myenv.save_to_file('.', conda_env_file_name)

'mydeployenv.yml'

In [117]:
from azureml.core.image import Image, ContainerImage

# Container image definition: score.py is the entry script and the conda file
# is the one saved earlier (referenced through conda_env_file_name rather than
# a second copy of the literal, so the two cannot drift apart).
image_config = ContainerImage.image_configuration(runtime= "python",
                                 execution_script="score.py",
                                 conda_file=conda_env_file_name,
                                 tags = {'area': "diabetes", 'type': "classification"},
                                 description = "Diabetes Classification with probability implemented using AutoML")

image = Image.create(name = image_name,
                     # this is the model object. note you can pass in 0-n models via this list-type parameter
                     # in case you need to reference multiple models, or none at all, in your scoring script.
                     models = [model],
                     image_config = image_config, 
                     workspace = ws)

Creating image


In [118]:
# Block until the image build finishes; show_output=True prints progress and
# the final operation status.
image.wait_for_creation(show_output = True)

Running.........................................
Succeeded
Image creation operation finished for image diabclassprob:6, operation "Succeeded"
