In [None]:
import azureml.core
from azureml.core import Workspace, Dataset, Datastore, Environment
from azureml.core import Experiment
import azureml.interpret
from azureml.widgets import RunDetails

import pandas as pd
import numpy as np

# Report which Azure ML SDK version this kernel imported.
print(f"SDK version: {azureml.core.VERSION}")

In [None]:
# Build the (system-managed) environment that the local training run executes in.
from azureml.core.conda_dependencies import CondaDependencies

# Packages requested from conda.
conda_packages = ['pip', 'pyspark', 'scikit-learn']

# Packages requested from pip.
pip_packages = [
    'azureml.interpret',
    'azureml-dataset-runtime',
    'jinja2',
    'MarkupSafe',
    'raiwidgets',
]

# Combine both package lists into a single conda specification.
condaDependencies = CondaDependencies.create(
    conda_packages=conda_packages,
    pip_packages=pip_packages,
)

system_managed_env = Environment("system-managed-env")
# Not user-managed: the dependencies come from the conda specification above.
system_managed_env.python.user_managed_dependencies = False
system_managed_env.python.conda_dependencies = condaDependencies

In [None]:
# Sign in interactively and fetch the Azure ML Workspace handle.
from azureml.core.authentication import InteractiveLoginAuthentication

# The tenant id is listed under Azure Active Directory -> Properties.
tenant_id = '198c7d8c-e010-45ce-a018-ec2d9a33f58f'
subscription_id = '4d278f3d-b4fd-4fa2-86b6-d34b96bc888f'
resource_group = 'Foxy_Resources'
ws_name = 'automlbook'

ia = InteractiveLoginAuthentication(tenant_id=tenant_id)
workspace_lookup = {
    'name': ws_name,
    'subscription_id': subscription_id,
    'resource_group': resource_group,
    'auth': ia,
}
ws = Workspace.get(**workspace_lookup)

# Echo the identifying properties of the workspace, one per line.
for workspace_property in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_property)

In [None]:
# Fetch the workspace datastore by its registered name.
# An earlier Datastore.get_default(ws) call was dropped here: its result was
# overwritten by the very next assignment, so it only cost a service round-trip.
datastore_name = 'workspaceblobstore'
datastore = Datastore.get(ws, datastore_name)

In [None]:
# Load the registered Titanic dataset and materialise it as a pandas DataFrame.
dataset_name = 'automlbook Titanic Training Data A'
dataset = Dataset.get_by_name(ws, dataset_name, version='latest')
dataset_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Turn Dataset into Pandas Dataframe, it is to be preprocessed
df = dataset.to_pandas_dataframe()

# Show a sample of the data. It must be the last expression of the cell to be
# rendered; taking it from `df` also avoids materialising the dataset twice.
df.head(10)

In [None]:
# Superseded first attempt (kept commented out for reference only): manual preprocessing of the numeric columns

# from sklearn.preprocessing import StandardScaler

# df_column_names = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
# df_numeric_column_names = ['Age', 'Fare']

# # BEGIN Create dataframe with numeric columns so that it contains all numbers that are preprocessed...

# # For int column Age, Impute NaN numeric values, and Remove outliers
# print('Before Removing outliers or Imputing null values, df[Age]: ', df['Age'])
# ageMedian = np.nanmedian(df['Age'])
# print('ageMedian: ', ageMedian)
# df['Age'] = np.where(np.isnan(df['Age']), ageMedian, df['Age'])
# print('Before Removing outliers and after Imputing null values, df[Age]: ', df['Age'])

# # Calculate 3STD and Mean for Age
# ageThreeSD = np.std(df['Age']) * 3
# ageMean = np.mean(df['Age'])
# ageOutlierThreshold = round(ageThreeSD + ageMean)
# print('Age Outlier Threshold: ', ageOutlierThreshold)

# # Remove Outliers by replacing all values above Threshold (3STD + Mean) with Threshold Value
# df['Age'] = df['Age'].mask(df['Age'] > ageOutlierThreshold, ageOutlierThreshold)
# print('After Removing outliers and Imputing null values, df[Age]: ', df['Age'])

# # Copy df, keeping only Age column, set type of this df copy to float
# df_age_column = pd.DataFrame(df['Age'], columns=['Age'])


# # Copy df, keeping only float numeric columns, set type of this df copy to float
# df_float_column_names = ['Fare']
# print('df_float_column_names: ', df_float_column_names)
# df_float_columns = pd.DataFrame(df[df_float_column_names], dtype=np.float, columns=df_float_column_names)

# # Concatenate the numeric Data Frames to scale them
# print('Before concatenation to df_numeric_columns, df[Age]: ', df['Age'])
# print('Before concatenation to df_numeric_columns, df_age_column: ', df_age_column)
# df_numeric_columns = pd.concat([df_age_column, df_float_columns], keys=df_numeric_column_names, axis=1)
# print('concatenated df_numeric_columns: ', df_numeric_columns)


# # Use StandardScaler or MinMaxScaler on Numeric/Non-Categorical columns split
# scaler = StandardScaler().fit(df_numeric_columns)
# print('scaler.mean_: ', scaler.mean_)
# print('scaler.scale: ', scaler.scale_)

# df_scaled_numeric_columns =  pd.DataFrame(scaler.transform(df_numeric_columns), columns=df_numeric_column_names)
# print('df_scaled_numeric_columns: ', df_scaled_numeric_columns)
# # Scaled data should have zero mean and unit variance, check with these prints:
# print('df_scaled_numeric_columns.mean(axis=0): ', df_scaled_numeric_columns.mean(axis=0))
# print('df_scaled_numeric_columns.std(axis=0)', df_scaled_numeric_columns.std(axis=0))

In [None]:
# Superseded first attempt (kept commented out for reference only): manual preprocessing of the categorical columns

# from sklearn.preprocessing import OneHotEncoder

# # BEGIN Create dataframe with categorical columns so that it contains all categorical data that is preprocessed...

# # Copy df, keeping only categorical columns, and one-hot encode them
# df_categorical_column_names_raw = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']
# # print('df_categorical_column_names_raw: ', df_categorical_column_names_raw)
# df_categorical_columns = pd.DataFrame(df[df_categorical_column_names_raw], dtype=np.str, columns=df_categorical_column_names_raw)
# # print('df_categorical_columns: ', df_categorical_columns)
# encoder = OneHotEncoder(drop='first', handle_unknown='error', sparse=False, ).fit(df_categorical_columns)
# # print('encoder.categories: ', encoder.categories)
# df_encoded_categorical_columns = pd.DataFrame(encoder.transform(df_categorical_columns))
# df_encoded_categorical_columns.columns = encoder.get_feature_names(df_categorical_column_names_raw)
# # print('df_encoded_categorical_columns: ', df_encoded_categorical_columns)
# #   By default, the values each feature can take is inferred automatically from the dataset and can be found in the categories_ attribute:


# # Combine the numeric DF with the categorical DF
# dfs = [df['Survived'], df_scaled_numeric_columns, df_encoded_categorical_columns]
# # print('Before concatenation to dfTyped, df[\'Survived\']: ', df['Survived'])
# # print('Before concatenation to dfTyped, df[Age]: ', df['Age'])
# # print('Before concatenation to dfTyped, df_numeric_columns: ', df_numeric_columns)
# # print('Before concatenation to dfTyped, df_scaled_numeric_columns: ', df_scaled_numeric_columns)
# TODO maybe, shouldn't column names be passed in to the keys list here?
# dfTyped = pd.concat(dfs, axis=1, keys=['Survived', *df_scaled_numeric_columns, *df_encoded_categorical_columns])
# # print('dfTyped: ', dfTyped)
# # print('dfTyped[Age]: ', dfTyped['Age'])

#         # - With sklearn.preprocessing, preprocess your Dataframes before training model in the Python Script
#         #     - [Guide at SciKit Learn site](https://scikit-learn.org/stable/modules/preprocessing.html)
#         #     - Use OneHotEncoder
#         #     - Use StandardScaler or  MinMaxScaler while you're at it
#         #     - Don't worry about any other preprocessing to just get the training working
#         #     - Strategy:
#         #         - d Split dataframe into Numeric/Non-Categorial and Non-Numeric/Categorial columns
#         #             - ! Use StandardScaler or MinMaxScaler on Numeric/Non-Categorical columns split
#         #             - d Use OneHotEncoder on Non-Numeric/Categorical columns split

# # Initial Data Frame is now preprocessed in dfPreprocessed
# dfPreprocessed = dfTyped
# # print('dfPreprocessed: ', dfPreprocessed)

# # Split DataFrame for training now that it is pre-processed
# target_column_name = 'Survived'
# df_x = dfPreprocessed.drop([target_column_name], axis=1)
# df_y = dfPreprocessed[target_column_name]
# # print("See df_x", df_x)
# print("See df_y", df_y)

In [None]:
# Partition the raw DataFrame into a numeric frame and a categorical frame.

df_column_names = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

##### Numeric columns
df_numeric_column_names = ['Age', 'Fare']
# Copy df, keeping only float numeric columns, set type of this df copy to float.
# The builtin `float` replaces `np.float`: that alias was removed from NumPy
# and was only ever another name for the builtin.
df_float_column_names = ['Fare']
df_float_columns = pd.DataFrame(df[df_float_column_names], dtype=float, columns=df_float_column_names)
# Copy df, keeping only the Age column with whatever dtype it already has.
df_integer_column = pd.DataFrame(df['Age'], columns=['Age'])
# Concatenate the numeric DataFrames side by side.
df_numeric_columns = pd.concat([df_integer_column, df_float_columns], axis=1)


##### Categorical columns
df_categorical_column_names = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']
# Copy df, keeping only categorical columns, cast to strings.
# The builtin `str` replaces the removed `np.str` alias for the same reason as above.
df_categorical_columns = pd.DataFrame(df[df_categorical_column_names], dtype=str, columns=df_categorical_column_names)

print('concatenated df_numeric_columns: ', df_numeric_columns)
print('df_categorical_columns: ', df_categorical_columns)

In [None]:
# TODO reorder things to get Data Frame as it is after transformation with the ColumnTransformer preprocessor

# Build the ColumnTransformer that preprocesses the numeric and categorical columns.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: median imputation, then standard scaling.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: constant 'missing' imputation, then one-hot encoding.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', one_hot_encoder),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own sub-pipeline.
column_routes = [
    ('num', numeric_transformer, df_numeric_column_names),
    ('cat', categorical_transformer, df_categorical_column_names),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Reassemble a single frame: target column first, then the numeric columns,
# then the categorical columns.
dfs = [df['Survived'], df_numeric_columns, df_categorical_columns]
dfTyped = pd.concat(objs=dfs, axis='columns')
print('dfTyped: ', dfTyped)

In [None]:
# Split the pre-transformation Data Frame into feature columns and target values.
target_column_name = 'Survived'
df_x_pre_transformation = dfTyped.drop(columns=[target_column_name])
# A single column is already 1-D; .to_numpy() yields the same array as the
# deprecated Series.ravel().
df_y_pre_transformation = dfTyped[target_column_name].to_numpy()
print("See df_x_pre_transformation", df_x_pre_transformation)
print("See df_y_pre_transformation", df_y_pre_transformation)

In [None]:
# Transform the feature frame and rebuild it as a labelled DataFrame that still
# carries the target column, which is what the feature/target split further
# down expects (`.drop([...])` and `['Survived']` only exist on a DataFrame).

# Fit the preprocessor and transform the raw feature columns in one pass.
transformed_features = preprocessor.fit_transform(df_x_pre_transformation, df_y_pre_transformation)
# The transformer may hand back a scipy sparse matrix; densify it for pandas.
if hasattr(transformed_features, 'toarray'):
    transformed_features = transformed_features.toarray()

# Read the one-hot column names from the encoder the preprocessor actually
# fitted, so the names are guaranteed to line up with the transformed columns.
fitted_one_hot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
if hasattr(fitted_one_hot_encoder, 'get_feature_names_out'):
    # Newer scikit-learn releases (get_feature_names was removed there).
    df_encoded_categorical_column_names = fitted_one_hot_encoder.get_feature_names_out(df_categorical_column_names)
else:
    # Older scikit-learn releases.
    df_encoded_categorical_column_names = fitted_one_hot_encoder.get_feature_names(df_categorical_column_names)

# The ColumnTransformer emits the 'num' block first and the 'cat' block second.
dfPreprocessed = pd.DataFrame(
    transformed_features,
    columns=[*df_numeric_column_names, *df_encoded_categorical_column_names],
    index=df_x_pre_transformation.index,
)
# Put the (untransformed) target back as the first column.
dfPreprocessed.insert(0, target_column_name, df_y_pre_transformation)

dfPreprocessed.head()

In [None]:
# Chain the preprocessor and a classifier into one full prediction pipeline.
from sklearn.linear_model import LogisticRegression

# Despite the variable name, LogisticRegression is used here as the classifier step.
regressor = LogisticRegression(solver='lbfgs')
prediction_steps = [
    ('preprocessor', preprocessor),
    ('classifier', regressor),
]
classifier_pipeline = Pipeline(steps=prediction_steps)

In [None]:
# Split the preprocessed Data Frame into feature columns and the target column.
target_column_name = 'Survived'
df_x = dfPreprocessed.drop(columns=[target_column_name])
# Double brackets keep the target as a one-column DataFrame rather than a Series:
# later cells register it with register_pandas_dataframe and read
# y_test.columns / y_test.values.ravel(), which need a DataFrame.
df_y = dfPreprocessed[[target_column_name]]
print("See df_y", df_y)

In [None]:
# Register the un-split feature and target frames as tabular datasets.
base_frames_to_register = (
    (df_x, "Titanic Feature Column Data for train_test_split usage (LocalConda notebook)"),
    (df_y, "Titanic Target Column Data for train_test_split usage (LocalConda notebook)"),
)
for base_frame, registered_name in base_frames_to_register:
    Dataset.Tabular.register_pandas_dataframe(base_frame, datastore, registered_name)

In [None]:
# Split the data into training and test partitions and register each partition with Azure.
from sklearn.model_selection import train_test_split

# df_x holds every column except the target; df_y holds only the target.
# The fixed random_state makes the 80/20 split repeatable.
split_frames = train_test_split(df_x, df_y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_frames

# NOTE(review): `.columns` only exists when y_test is a DataFrame — confirm
# that df_y is not a Series at this point.
print("See y_test.columns.tolist()", str(y_test.columns.tolist()))

# Register the splits, in the order train/test features, then train/test target.
splits_to_register = [
    (X_train, "Titanic Feature Column Data for training (LocalConda notebook)"),
    (X_test, "Titanic Feature Column Data for testing (LocalConda notebook)"),
    (y_train, "Titanic Target Column Data for training (LocalConda notebook)"),
    (y_test, "Titanic Target Column Data for testing (LocalConda notebook)"),
]
for split_frame, split_name in splits_to_register:
    Dataset.Tabular.register_pandas_dataframe(split_frame, datastore, split_name)

In [None]:
# Fit the classifier on the training split and wrap it in a TabularExplainer.
# Imported here because this cell is the first to use it; without the import a
# fresh "Restart & Run All" stops with a NameError.
from interpret.ext.blackbox import TabularExplainer

# Save feature names: numeric names followed by the one-hot encoded names.
features = [*df_numeric_column_names, *df_encoded_categorical_column_names]

# classifier_pipeline.steps[-1][1] is the classifier step of the pipeline.
# Nothing earlier in the notebook trains it, so fit it here; np.ravel flattens
# the target whether it arrives as a Series or as a one-column DataFrame.
classifier = classifier_pipeline.steps[-1][1]
classifier.fit(X_train, np.ravel(y_train))

# X_train comes from the already-preprocessed frame, so no `transformations`
# argument is passed: handing over the preprocessor would make the explainer
# apply it a second time, to columns that no longer exist.
# NOTE(review): if X_train is ever switched back to raw columns, restore
# `transformations=preprocessor` and pass the raw feature names instead.
explainer = TabularExplainer(classifier,
                             initialization_examples=X_train,
                             features=features)

In [None]:
# Compute the global explanation (feature importance over the whole evaluation set).

# Test data is used so the result can be explored against held-out rows.
# Only the evaluation examples are passed: TabularExplainer.explain_global
# takes a sampling policy as its second positional argument, not labels.
# The true labels are attached later, when the explanation is uploaded.
global_explanation = explainer.explain_global(X_test)
# If a PFIExplainer is used instead, labels are required:
# global_explanation = explainer.explain_global(X_train, true_labels=y_train)

# Feature names and importance values, both sorted by importance.
sorted_global_importance_values = global_explanation.get_ranked_global_values()
sorted_global_importance_names = global_explanation.get_ranked_global_names()
globalFeatureExplanations = dict(zip(sorted_global_importance_names, sorted_global_importance_values))
print('globalFeatureExplanations: ', globalFeatureExplanations)
# Alternatively, print the dictionary the explanation object builds itself.
print('global_explanation.get_feature_importance_dict(): ', global_explanation.get_feature_importance_dict())

# Local explanations (per-prediction) are computed with explainer.explain_local
# in the dashboard cell at the end of the notebook.
# NOTE: uploading ranked local values through client.upload_model_explanation
# did not work the way it does for the global explanation object.

In [None]:

# Encode features (names) list into a string like '["a","b"]'.
# The quoted names are built on the fly so `features` itself keeps its plain
# names: later cells pass `features` to TabularExplainer, and re-running this
# cell must not stack extra quotes on every run.
featuresEncoded = "[{}]".format(",".join('"{}"'.format(feature_name) for feature_name in features))
print(featuresEncoded)

In [None]:
# Run Training Experiment locally
# TODO use classifier_pipeline, perhaps move that code to create and use the Pipeline into the python script
# TODO (Consider what minimally needs to be in notebook vs script)
import datetime
import os

from azureml.core import ScriptRunConfig

# Experiment
experiment_name = 'LocalConda_Training_AutoML'
experiment = Experiment(workspace=ws, name=experiment_name)

# Compute target and training script location
compute_target = 'local'
source_directory = './scripts'
script_name = 'localCondaTrainingAutoML.py'
dataset_name = 'automlbook Titanic Training Data A'

# Output file name like
# 'DecisionTreeClassifier_Titanic_LocalConda_local-2022-04-17_21-40-36.114550.pkl':
# spaces and colons of the timestamp are replaced with '_' and '-'.
suffix = 'local-' + str(datetime.datetime.now())
suffix = suffix.replace(' ', '_').replace(':', '-')
out_model_file_name = 'DecisionTreeClassifier_Titanic_LocalConda_{}.pkl'.format(suffix)

# Command-line arguments handed to the training script.
script_arguments = [
    "--tenant-id", tenant_id,
    "--ws-name", ws_name,
    "--subscription-id", subscription_id,
    "--resource-group", resource_group,
    "--datastore-name", datastore_name,
    "--out-model-file-name", out_model_file_name,
    "--features", featuresEncoded,
]
scriptRunConfig = ScriptRunConfig(
    source_directory=source_directory,
    script=script_name,
    arguments=script_arguments,
    environment=system_managed_env,
    compute_target=compute_target)

# Put the conda binaries first on PATH before submitting the run.
# The directory can be overridden with the CONDA_BIN_DIR environment variable;
# the default is the original author's install location. It is only prepended
# when it is not already the first entry, so re-running this cell does not keep
# growing PATH.
conda_bin_dir = os.environ.get("CONDA_BIN_DIR", "/home/johna/miniconda3/bin")
path_entries = os.environ["PATH"].split(os.pathsep)
if path_entries[0] != conda_bin_dir:
    os.environ["PATH"] = os.pathsep.join([conda_bin_dir, *path_entries])

AutoML_run = experiment.submit(scriptRunConfig)
RunDetails(AutoML_run).show()

In [None]:
# Upload global model explanation data...
# The explanation can then be downloaded on any compute
# Multiple explanations can be uploaded
# Imported here because this cell is the first to use ExplanationClient; the
# only other import of it sits in a later cell.
from azureml.interpret import ExplanationClient

# Flatten the true labels into a 1-D array for the upload.
true_labels = y_test.values.ravel()
print("y_test value the line before client.upload_model_explanation(): \n" + str(y_test))
print("y_test.values.ravel() value passed as true_ys to client.upload_model_explanation(): \n" + str(true_labels))

client = ExplanationClient.from_run(AutoML_run)
client.upload_model_explanation(global_explanation, true_ys=true_labels, comment='global explanation: all features')

# Or you can only upload the explanation object with the top k feature info with this...
# client.upload_model_explanation(global_explanation, top_k=2, comment='global explanation: Only top 2 features')
# END Upload global model explanation data...

In [None]:
# Register Model from the AutoML_run
import time

import sklearn
from azureml.core import Model

# NOTE(review): description and "metric" talk about regression / RMSE while
# "task" says classification — confirm which is intended before relying on them.
description = "Best LocalConda AutoML Regression Run using Titanic Sample Data."
tags = {
    "project": "Local Training AutoML",
    "creator": "fox",
    "task": "classification",
    "dataset": "automlbook Titanic Training Data A",
    "metric": "normalized_root_mean_squared_error"
}

# Run states after which the outputs can no longer change (see Run.get_status).
TERMINAL_RUN_STATES = ("Completed", "Failed", "Canceled")
RETRY_DELAY_SECONDS = 60

# Attempt to register the model once the output model file is available.
# While the run is still in progress a failure most likely means the file is
# not there yet, so wait and retry. Once the run has reached a terminal state
# one final attempt is made; if that fails too, the error is raised instead of
# looping forever. Catching Exception (not a bare except) keeps Ctrl-C working.
run_has_finished = False
while True:
    try:
        # NOTE(review): model_path points at the whole outputs folder, not at
        # out_model_file_name inside it — confirm this is what should be registered.
        AutoML_run.register_model(
            model_path='./outputs',
            model_name=out_model_file_name,
            description=description,
            tags=tags,
            model_framework=Model.Framework.SCIKITLEARN,   # Framework used to create the model.
            model_framework_version=sklearn.__version__)   # Version of scikit-learn used to create the model.
        break
    except Exception as registration_error:
        if run_has_finished:
            raise
        run_has_finished = AutoML_run.get_status() in TERMINAL_RUN_STATES
        if run_has_finished:
            print("run has finished, making one final attempt to register the model...", registration_error)
            continue
        print("encountered exception registering model output file, waiting and trying again...", registration_error)
        time.sleep(RETRY_DELAY_SECONDS)

In [None]:
# Fetch the global explanation back from the Experiment Run it was uploaded to.
from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(AutoML_run)

# Download the model explanation data.
# To fetch only the top k (e.g., 4) most important features instead:
# explanation = client.download_model_explanation(top_k=4)
global_explanation = client.download_model_explanation()

# Importance values and feature names, both ranked by importance.
global_importance_values = global_explanation.get_ranked_global_values()
global_importance_names = global_explanation.get_ranked_global_names()
print(f'global importance values: {global_importance_values}')
print(f'global importance names: {global_importance_names}')

In [None]:
# Download the Model from AzureML and Visualize Explanations with it
import jinja2  # not referenced below; presumably kept to verify the package is installed
import joblib
from azureml.core.model import Model
from interpret.ext.blackbox import TabularExplainer
from raiwidgets import ExplanationDashboard

# Download the Model from Azure.
# Earlier attempts (passing the Model object straight to joblib.load, and
# Model.get_model_path) were replaced by Model.download + joblib.load.
remote_model_obj = Model(ws, out_model_file_name)
print('Name:', remote_model_obj.name)
print('Version:', remote_model_obj.version)
remote_model_path = remote_model_obj.download(exist_ok=True)
# NOTE(review): joblib.load needs a file path — confirm download() returns the
# .pkl file and not a folder for this registration.
# joblib.load unpickles the file, so only load models this workspace produced.
downloaded_model = joblib.load(remote_model_path)

# BEGIN Access "Local Explanations"
# (Local Explanation meaning "of individual predictions")
# "features" and "classes" fields are optional
explainer = TabularExplainer(downloaded_model,
                             X_train,
                             features=features)

# Get explanation for the first few data points in the test set
local_explanation = explainer.explain_local(X_test[0:5])
# Sorted feature importance values and feature names
sorted_local_importance_names = local_explanation.get_ranked_local_names()
print('len(sorted_local_importance_names): ', len(sorted_local_importance_names))
sorted_local_importance_values = local_explanation.get_ranked_local_values()
print('len(sorted_local_importance_values): ', len(sorted_local_importance_values))
# END Access "Local Explanations"

# TODO get local explanation uploaded, downloaded, and visualized as well...

# Visualize explanations.
# dataset is the test feature frame; true_y is the matching true labels,
# flattened to 1-D exactly as they are for upload_model_explanation.
ExplanationDashboard(global_explanation, downloaded_model, dataset=X_test, true_y=y_test.values.ravel())