In [1]:
import azureml.core
from azureml.core import Workspace, Dataset, Datastore
from azureml.core import Experiment
from azureml.core.compute import ComputeTarget
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails

import pandas as pd
import numpy as np

print("SDK version:", azureml.core.VERSION)

SDK version: 1.40.0


In [2]:
# Set (User Managed) Local Environment up
from azureml.core import Environment

# Editing a run configuration property on-fly.
user_managed_env = Environment("user-managed-env")

user_managed_env.python.user_managed_dependencies = True

# You can choose a specific Python environment by pointing to a Python path 
#user_managed_env.python.interpreter_path = '/home/johndoe/miniconda3/envs/myenv/bin/python'

In [3]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Get the Workspace object from Azure
# You can find tenant id under azure active directory->properties
tenant_id = '198c7d8c-e010-45ce-a018-ec2d9a33f58f'
ia = InteractiveLoginAuthentication(tenant_id=tenant_id)
ws_name = 'automlbook'
subscription_id = '4d278f3d-b4fd-4fa2-86b6-d34b96bc888f'
resource_group = 'Foxy_Resources'
ws = Workspace.get(name=ws_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group,
                   auth=ia)
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

automlbook
Foxy_Resources
centralus
4d278f3d-b4fd-4fa2-86b6-d34b96bc888f


In [4]:
# Create datastore, try getting datastore via Workspace object
datastore = Datastore.get_default(ws)
datastore_name = 'workspaceblobstore'
datastore = Datastore.get(ws, datastore_name)

In [5]:
# Create a dataset from the datastore of the Workspace
dataset_name = 'automlbook Titanic Training Data A'
# dataset = Dataset.get_by_name(ws, dataset_name)
dataset = Dataset.get_by_name(ws, dataset_name, version = 'latest')
dataset_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Show a sample of the data in the dataset
dataset.take(10).to_pandas_dataframe()

# Turn Dataset into Pandas Dataframe, it is to be preprocessed
df = dataset.to_pandas_dataframe()

In [6]:
# Preprocess numeric columns
from sklearn.preprocessing import StandardScaler

df_column_names = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
df_numeric_column_names = ['Age', 'Fare']

# BEGIN Create dataframe with numeric columns so that it contains all numbers that are preprocessed...

# For int column Age, Impute NaN numeric values, and Remove outliers
print('Before Removing outliers or Imputing null values, df[Age]: ', df['Age'])
ageMedian = np.nanmedian(df['Age'])
print('ageMedian: ', ageMedian)
df['Age'] = np.where(np.isnan(df['Age']), ageMedian, df['Age'])
print('Before Removing outliers and after Imputing null values, df[Age]: ', df['Age'])

# Calculate 3STD and Mean for Age
ageThreeSD = np.std(df['Age']) * 3
ageMean = np.mean(df['Age'])
ageOutlierThreshold = round(ageThreeSD + ageMean)
print('Age Outlier Threshold: ', ageOutlierThreshold)

# Remove Outliers by replacing all values above Threshold (3STD + Mean) with Threshold Value
df['Age'] = df['Age'].mask(df['Age'] > ageOutlierThreshold, ageOutlierThreshold)
print('After Removing outliers and Imputing null values, df[Age]: ', df['Age'])

# Copy df, keeping only Age column, set type of this df copy to float
df_age_column = pd.DataFrame(df['Age'], columns=['Age'])


# Copy df, keeping only float numeric columns, set type of this df copy to float
df_float_column_names = ['Fare']
print('df_float_column_names: ', df_float_column_names)
df_float_columns = pd.DataFrame(df[df_float_column_names], dtype=np.float, columns=df_float_column_names)

# Concatenate the numeric Data Frames to scale them
print('Before concatenation to df_numeric_columns, df[Age]: ', df['Age'])
print('Before concatenation to df_numeric_columns, df_age_column: ', df_age_column)
df_numeric_columns = pd.concat([df_age_column, df_float_columns], keys=df_numeric_column_names, axis=1)
print('concatenated df_numeric_columns: ', df_numeric_columns)


# Use StandardScaler or MinMaxScaler on Numeric/Non-Categorical columns split
scaler = StandardScaler().fit(df_numeric_columns)
print('scaler.mean_: ', scaler.mean_)
print('scaler.scale: ', scaler.scale_)

df_scaled_numeric_columns =  pd.DataFrame(scaler.transform(df_numeric_columns), columns=df_numeric_column_names)
print('df_scaled_numeric_columns: ', df_scaled_numeric_columns)
# Scaled data should have zero mean and unit variance, check with these prints:
print('df_scaled_numeric_columns.mean(axis=0): ', df_scaled_numeric_columns.mean(axis=0))
print('df_scaled_numeric_columns.std(axis=0)', df_scaled_numeric_columns.std(axis=0))

Before Removing outliers or Imputing null values, df[Age]:  0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
ageMedian:  28.0
Before Removing outliers and after Imputing null values, df[Age]:  0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
Age Outlier Threshold:  67
After Removing outliers and Imputing null values, df[Age]:  0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
df_float_column_names:  ['Fare']
Before concatenation to df_numeric_columns, df[Age]:  0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float6

In [7]:
# Preprocess categorical columns
from sklearn.preprocessing import OneHotEncoder

# BEGIN Create dataframe with categorical columns so that it contains all categorical data that is preprocessed...

# Copy df, keeping only categorical columns, and one-hot encode them
df_categorical_column_names_raw = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']
print('df_categorical_column_names_raw: ', df_categorical_column_names_raw)
df_categorical_columns = pd.DataFrame(df[df_categorical_column_names_raw], dtype=np.str, columns=df_categorical_column_names_raw)
print('df_categorical_columns: ', df_categorical_columns)
encoder = OneHotEncoder(drop='first', handle_unknown='error', sparse=False, ).fit(df_categorical_columns)
print('encoder.categories: ', encoder.categories)
df_encoded_categorical_columns = pd.DataFrame(encoder.transform(df_categorical_columns))
df_encoded_categorical_columns.columns = encoder.get_feature_names(df_categorical_column_names_raw)
print('df_encoded_categorical_columns: ', df_encoded_categorical_columns)
#   By default, the values each feature can take is inferred automatically from the dataset and can be found in the categories_ attribute:


# Combine the numeric DF with the categorical DF
dfs = [df['Survived'], df_scaled_numeric_columns, df_encoded_categorical_columns]
print('Before concatenation to dfTyped, df[\'Survived\']: ', df['Survived'])
print('Before concatenation to dfTyped, df[Age]: ', df['Age'])
print('Before concatenation to dfTyped, df_numeric_columns: ', df_numeric_columns)
print('Before concatenation to dfTyped, df_scaled_numeric_columns: ', df_scaled_numeric_columns)
dfTyped = pd.concat(dfs, axis=1)
print('dfTyped: ', dfTyped)
print('dfTyped[Age]: ', dfTyped['Age'])

        # - With sklearn.preprocessing, preprocess your Dataframes before training model in the Python Script
        #     - [Guide at SciKit Learn site](https://scikit-learn.org/stable/modules/preprocessing.html)
        #     - Use OneHotEncoder
        #     - Use StandardScaler or  MinMaxScaler while you're at it
        #     - Don't worry about any other preprocessing to just get the training working
        #     - Strategy:
        #         - d Split dataframe into Numeric/Non-Categorial and Non-Numeric/Categorial columns
        #             - ! Use StandardScaler or MinMaxScaler on Numeric/Non-Categorical columns split
        #             - d Use OneHotEncoder on Non-Numeric/Categorical columns split

# Initial Data Frame is now preprocessed in dfPreprocessed
dfPreprocessed = dfTyped
print('dfPreprocessed: ', dfPreprocessed)

# Split DataFrame for training now that it is pre-processed
target_column_name = 'Survived'
df_x = dfPreprocessed.drop([target_column_name], axis=1)
df_y = dfPreprocessed.filter([target_column_name], axis=1)
print("See df_x", df_x)
print("See df_y", df_y)
# Register Pandas Dataframe of base df_x and df_y
Dataset.Tabular.register_pandas_dataframe(df_x, datastore, "Titanic Feature Column Data for train_test_split usage")
Dataset.Tabular.register_pandas_dataframe(df_y, datastore, "Titanic Target Column Data for train_test_split usage")

df_categorical_column_names_raw:  ['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']
df_categorical_columns:      Pclass     Sex SibSp Parch Cabin Embarked
0        3    male     1     0  None        S
1        1  female     1     0   C85        C
2        3  female     0     0  None        S
3        1  female     1     0  C123        S
4        3    male     0     0  None        S
..     ...     ...   ...   ...   ...      ...
886      2    male     0     0  None        S
887      1  female     0     0   B42        S
888      3  female     1     2  None        S
889      1    male     0     0  C148        C
890      3    male     0     0  None        Q

[891 rows x 6 columns]
encoder.categories:  auto
df_encoded_categorical_columns:       Pclass_2  Pclass_3  Sex_male  SibSp_1  SibSp_2  SibSp_3  SibSp_4  \
0         0.0       1.0       1.0      1.0      0.0      0.0      0.0   
1         0.0       0.0       0.0      1.0      0.0      0.0      0.0   
2         0.0       1.0       

{
  "source": [
    "('workspaceblobstore', 'managed-dataset/94cc0d4e-c3a2-4c4c-9a77-f5e43b6e20c4/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "59c69242-1276-4d46-be9d-ce7ce0106840",
    "name": "Titanic Target Column Data for train_test_split usage",
    "version": 9,
    "workspace": "Workspace.create(name='automlbook', subscription_id='4d278f3d-b4fd-4fa2-86b6-d34b96bc888f', resource_group='Foxy_Resources')"
  }
}

In [18]:
# Save feature names to pass to Experiment runner
features=[*df_numeric_column_names, *df_encoded_categorical_columns.columns]
# Encode features (names) list into a string like '["a","b"]'
for x in range(len(features)):
        features[x] = '"{}"'.format(features[x])
featuresEncoded = "[{}]".format(",".join(features))
print(featuresEncoded)

["Age","Fare","Pclass_2","Pclass_3","Sex_male","SibSp_1","SibSp_2","SibSp_3","SibSp_4","SibSp_5","SibSp_8","Parch_1","Parch_2","Parch_3","Parch_4","Parch_5","Parch_6","Cabin_A14","Cabin_A16","Cabin_A19","Cabin_A20","Cabin_A23","Cabin_A24","Cabin_A26","Cabin_A31","Cabin_A32","Cabin_A34","Cabin_A36","Cabin_A5","Cabin_A6","Cabin_A7","Cabin_B101","Cabin_B102","Cabin_B18","Cabin_B19","Cabin_B20","Cabin_B22","Cabin_B28","Cabin_B3","Cabin_B30","Cabin_B35","Cabin_B37","Cabin_B38","Cabin_B39","Cabin_B4","Cabin_B41","Cabin_B42","Cabin_B49","Cabin_B5","Cabin_B50","Cabin_B51 B53 B55","Cabin_B57 B59 B63 B66","Cabin_B58 B60","Cabin_B69","Cabin_B71","Cabin_B73","Cabin_B77","Cabin_B78","Cabin_B79","Cabin_B80","Cabin_B82 B84","Cabin_B86","Cabin_B94","Cabin_B96 B98","Cabin_C101","Cabin_C103","Cabin_C104","Cabin_C106","Cabin_C110","Cabin_C111","Cabin_C118","Cabin_C123","Cabin_C124","Cabin_C125","Cabin_C126","Cabin_C128","Cabin_C148","Cabin_C2","Cabin_C22 C26","Cabin_C23 C25 C27","Cabin_C30","Cabin_C32","

In [14]:
# For splitting of data into train and test set
from sklearn.model_selection import train_test_split

# Split the data
# What you need to pass to train_test_split...
# ... I need X and Y dataframe, X just with target missing, Y just with target column present
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y,
                                                    test_size=0.2,
                                                    random_state=0)

# Register the splits
Dataset.Tabular.register_pandas_dataframe(X_train, datastore, "Titanic Feature Column Data for training")
Dataset.Tabular.register_pandas_dataframe(X_test, datastore, "Titanic Feature Column Data for testing")
Dataset.Tabular.register_pandas_dataframe(y_train, datastore, "Titanic Target Column Data for training")
Dataset.Tabular.register_pandas_dataframe(y_test, datastore, "Titanic Target Column Data for testing")

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/34aac384-a90b-4dfd-8b2c-792ab20a6e91/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/2412552b-1e7c-44f0-a57a-91285a8764b0/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/faddff58-c492-45b9-820a-67ce17ead3bf/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploadin

{
  "source": [
    "('workspaceblobstore', 'managed-dataset/9d17bd8d-f8c1-4a05-85ff-5b406f02f9df/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "83ec94c4-eaa4-4c80-882f-bc443ee7ebce",
    "name": "Titanic Target Column Data for testing",
    "version": 1,
    "workspace": "Workspace.create(name='automlbook', subscription_id='4d278f3d-b4fd-4fa2-86b6-d34b96bc888f', resource_group='Foxy_Resources')"
  }
}

In [23]:
# Run Training Experiment locally
from azureml.core import ScriptRunConfig
import datetime

# Experiment
experiment_name = 'Local_Training_AutoML'
experiment = Experiment(workspace=ws, name=experiment_name)

# Define Compute Cluster to use
compute_target = 'local'
source_directory = './scripts'
script_name = 'localTrainingAutoML.py'
dataset_name = 'automlbook Titanic Training Data A'
# set output file name like 'DecisionTreeClassifier_Titanic_local-2022-04-17 21:40:36.114550.pkl'
suffix = 'local-' + str(datetime.datetime.now())
suffix = suffix.replace(' ', '_') # Clean up datetimestamp
suffix = suffix.replace(':', '-') 
out_model_file_name = 'DecisionTreeClassifier_Titanic_{}.pkl'.format(suffix)
# set output file name like 'DecisionTreeClassifier_Titanic_local-2022-04-17 21:40:36.114550.pkl'

script_arguments = [
"--tenant-id", tenant_id,
"--ws-name", ws_name,
"--subscription-id", subscription_id,
"--resource-group", resource_group,
"--datastore-name", datastore_name,
"--dataset-name", dataset_name,
"--out-model-file-name", out_model_file_name,
"--features", featuresEncoded
]
scriptRunConfig = ScriptRunConfig(
        source_directory=source_directory,
        script=script_name,
        arguments=script_arguments,
        environment=user_managed_env,
        compute_target=compute_target)
        
AutoML_run = experiment.submit(scriptRunConfig)
RunDetails(AutoML_run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [24]:
# Register Model from the AutoML_run
description = "Best Local AutoML Regression Run using Titanic Sample Data."
tags = {
  "project" : "Local Training AutoML", 
  "creator": "fox", 
  "task": "classification", 
  "dataset": "automlbook Titanic Training Data A", 
  "metric": "normalized_root_mean_squared_error"
}

# TODO do sleep while until the model is known to be successfully trained (error out if training fails)
AutoML_run.register_model(model_path='./outputs', model_name=out_model_file_name, description=description, tags=tags)
# set output file name like 'DecisionTreeClassifier_Titanic_local-2022-04-17 21:40:36.114550.pkl'

Model(workspace=Workspace.create(name='automlbook', subscription_id='4d278f3d-b4fd-4fa2-86b6-d34b96bc888f', resource_group='Foxy_Resources'), name=DecisionTreeClassifier_Titanic_local-2022-04-26_20-31-08.456750.pkl, id=DecisionTreeClassifier_Titanic_local-2022-04-26_20-31-08.456750.pkl:1, version=1, tags={'project': 'Local Training AutoML', 'creator': 'fox', 'task': 'classification', 'dataset': 'automlbook Titanic Training Data A', 'metric': 'normalized_root_mean_squared_error'}, properties={})

In [25]:
# Explain the Model with downloaded Explanation
from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(AutoML_run)

# get model explanation data
global_explanation = client.download_model_explanation()
# or only get the top k (e.g., 4) most important features with their importance values
# explanation = client.download_model_explanation(top_k=4)

global_importance_values = global_explanation.get_ranked_global_values()
global_importance_names = global_explanation.get_ranked_global_names()
print('global importance values: {}'.format(global_importance_values))
print('global importance names: {}'.format(global_importance_names))

global importance values: [0.21716696962018667, 0.1122582402577581, 0.10402100123798161, 0.08865683377188135, 0.05252211410713198, 0.0368943305313187, 0.017877014818090897, 0.008488581448398595, 0.005253256140524354, 0.004046200679958525, 0.0030354489514662747, 0.0029555637147700396, 0.002769917734764271, 0.002769516816238238, 0.0021218373738156513, 0.0020144018783986267, 0.0018403819456108638, 0.0016807329267279347, 0.0015897761847816186, 0.001546584427046002, 0.0015122514882468185, 0.0014419374242391017, 0.0014322760631419076, 0.001412008703938046, 0.001401106052814717, 0.0013778725421999022, 0.001333707894215486, 0.0013192590724541058, 0.0012654295115522125, 0.001247837198498998, 0.0012158210732305928, 0.0011723582449620117, 0.0011274257449647882, 0.001087879514207509, 0.0010769205142123935, 0.0010510138139361498, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [29]:
# Visualize explanations
from raiwidgets import ExplanationDashboard
from azureml.core.model import Model

model = Model(ws, out_model_file_name)
print('Name:', model.name)
print('Version:', model.version)

# TODO do something with model so that it has a predict method
# See this for creating a wrapper that would be used if you were using Apache Spark...
# 
# class wrapper(object):
#   def __init__(self, model):
#     self.model = model
  
#   def predict(self, data):
#     sparkdata = spark.createDataFrame(data)
#     return model.transform(sparkdata).select('prediction').toPandas().values.flatten().tolist()
  
#   def predict_proba(self, data):
#     sparkdata = spark.createDataFrame(data)
#     prediction = model.transform(sparkdata).select('probability').toPandas().values.flatten().tolist()
#     proba_list = [vector.values.tolist() for vector in prediction]
#     return proba_list

ExplanationDashboard(global_explanation, model, dataset=X_test)

Name: DecisionTreeClassifier_Titanic_local-2022-04-26_20-31-08.456750.pkl
Version: 1


ValueError: Model does not support predict method for given