In [None]:
import azureml.core
from azureml.core import Workspace, Dataset, Datastore
from azureml.core import Experiment
from azureml.widgets import RunDetails

import pandas as pd
import numpy as np

# Report which Azure ML SDK build this kernel is using.
print(f"SDK version: {azureml.core.VERSION}")

In [None]:
# Local run environment whose Python dependencies are flagged as user-managed.
from azureml.core import Environment

user_managed_env = Environment(name="user-managed-env")

# Flip the flag on the environment's Python section after construction.
user_managed_env.python.user_managed_dependencies = True

# Optional: pin the run to a specific interpreter by uncommenting and editing the path below.
#user_managed_env.python.interpreter_path = '/home/johndoe/miniconda3/envs/myenv/bin/python'

In [None]:
import os

from azureml.core.authentication import InteractiveLoginAuthentication

# Workspace coordinates. Each value can be overridden through an environment
# variable so the notebook is not tied to a single account; the defaults are
# the values this notebook was originally written against.
# You can find the tenant id under Azure Active Directory -> Properties.
tenant_id = os.environ.get('AZURE_TENANT_ID', '198c7d8c-e010-45ce-a018-ec2d9a33f58f')
subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID', '4d278f3d-b4fd-4fa2-86b6-d34b96bc888f')
resource_group = os.environ.get('AZUREML_RESOURCE_GROUP', 'Foxy_Resources')
ws_name = os.environ.get('AZUREML_WORKSPACE_NAME', 'automlbook')

# Get the Workspace object from Azure using an interactive login for the tenant.
ia = InteractiveLoginAuthentication(tenant_id=tenant_id)
ws = Workspace.get(name=ws_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group,
                   auth=ia)
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

In [None]:
# Look the datastore up by name through the Workspace object.
# datastore_name is also forwarded to the training script later on.
# (The earlier Datastore.get_default(ws) call was dropped: its result was
# overwritten by the named lookup below before it was ever used.)
datastore_name = 'workspaceblobstore'
datastore = Datastore.get(ws, datastore_name)

In [None]:
# Fetch the latest version of the registered dataset from the Workspace
dataset_name = 'automlbook Titanic Training Data A'
dataset = Dataset.get_by_name(ws, dataset_name, version = 'latest')
dataset_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Show a sample of the data in the dataset.
# The sample has to be printed explicitly: a bare expression is only rendered
# by the notebook when it is the last statement of the cell.
print(dataset.take(10).to_pandas_dataframe())

# Turn Dataset into Pandas Dataframe, it is to be preprocessed
df = dataset.to_pandas_dataframe()

In [None]:
# Preprocess numeric columns: impute and cap Age, then standard-scale Age and Fare.
from sklearn.preprocessing import StandardScaler

df_column_names = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
df_numeric_column_names = ['Age', 'Fare']

# BEGIN Create dataframe with numeric columns so that it contains all numbers that are preprocessed...

# NOTE: df['Age'] is overwritten in place below. Re-running this cell without
# reloading df recomputes the outlier threshold from already-capped values.

# For column Age, impute NaN values with the median of the non-NaN values
print('Before Removing outliers or Imputing null values, df[Age]: ', df['Age'])
ageMedian = np.nanmedian(df['Age'])
print('ageMedian: ', ageMedian)
df['Age'] = np.where(np.isnan(df['Age']), ageMedian, df['Age'])
print('Before Removing outliers and after Imputing null values, df[Age]: ', df['Age'])

# Outlier threshold for Age: mean + 3 standard deviations, rounded to an integer
ageThreeSD = np.std(df['Age']) * 3
ageMean = np.mean(df['Age'])
ageOutlierThreshold = round(ageThreeSD + ageMean)
print('Age Outlier Threshold: ', ageOutlierThreshold)

# Cap outliers: every value above the threshold is replaced with the threshold itself
# (only the upper tail is handled)
df['Age'] = df['Age'].mask(df['Age'] > ageOutlierThreshold, ageOutlierThreshold)
print('After Removing outliers and Imputing null values, df[Age]: ', df['Age'])

# Copy df, keeping only the Age column, as float
df_age_column = df[['Age']].astype(float)


# Copy df, keeping only float numeric columns, as float.
# The builtin float is used because the np.float alias was removed in NumPy 1.24.
df_float_column_names = ['Fare']
print('df_float_column_names: ', df_float_column_names)
df_float_columns = df[df_float_column_names].astype(float)

# Concatenate the numeric Data Frames side by side to scale them together.
# No `keys=` here: with axis=1 it would wrap the columns in a MultiIndex
# (('Age', 'Age'), ('Fare', 'Fare')) instead of plain 'Age' / 'Fare'.
print('Before concatenation to df_numeric_columns, df[Age]: ', df['Age'])
print('Before concatenation to df_numeric_columns, df_age_column: ', df_age_column)
df_numeric_columns = pd.concat([df_age_column, df_float_columns], axis=1)
print('concatenated df_numeric_columns: ', df_numeric_columns)


# Fit a StandardScaler on the numeric columns
scaler = StandardScaler().fit(df_numeric_columns)
print('scaler.mean_: ', scaler.mean_)
print('scaler.scale: ', scaler.scale_)

# transform() returns a plain array, so the column names are re-attached here
df_scaled_numeric_columns =  pd.DataFrame(scaler.transform(df_numeric_columns), columns=df_numeric_column_names)
print('df_scaled_numeric_columns: ', df_scaled_numeric_columns)
# Scaled data should have zero mean and unit variance, check with these prints:
print('df_scaled_numeric_columns.mean(axis=0): ', df_scaled_numeric_columns.mean(axis=0))
print('df_scaled_numeric_columns.std(axis=0)', df_scaled_numeric_columns.std(axis=0))

In [None]:
# Preprocess categorical columns, merge them with the scaled numeric columns,
# then split into feature/target frames and register both.
from sklearn.preprocessing import OneHotEncoder

# BEGIN Create dataframe with categorical columns so that it contains all categorical data that is preprocessed...

# Copy df, keeping only categorical columns, cast to strings and one-hot encode them.
# The builtin str is used because the np.str alias was removed in NumPy 1.24.
df_categorical_column_names_raw = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']
print('df_categorical_column_names_raw: ', df_categorical_column_names_raw)
df_categorical_columns = df[df_categorical_column_names_raw].astype(str)
print('df_categorical_columns: ', df_categorical_columns)

# The dense-output keyword differs between scikit-learn releases (`sparse` vs
# `sparse_output`), so the encoder is left at its default and the result is
# densified explicitly below.
encoder = OneHotEncoder(drop='first', handle_unknown='error').fit(df_categorical_columns)
# The values each feature can take are inferred from the data during fit and
# are exposed through the fitted `categories_` attribute.
print('encoder.categories: ', encoder.categories_)

encoded_values = encoder.transform(df_categorical_columns)
if hasattr(encoded_values, 'toarray'):
    # Sparse matrix -> dense ndarray
    encoded_values = encoded_values.toarray()
df_encoded_categorical_columns = pd.DataFrame(encoded_values)

# get_feature_names was replaced by get_feature_names_out in newer scikit-learn;
# use whichever one the installed release provides.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_column_names = encoder.get_feature_names_out(df_categorical_column_names_raw)
else:
    encoded_column_names = encoder.get_feature_names(df_categorical_column_names_raw)
df_encoded_categorical_columns.columns = encoded_column_names
print('df_encoded_categorical_columns: ', df_encoded_categorical_columns)


# Combine the target column, the scaled numeric DF and the encoded categorical DF.
# NOTE(review): concat(axis=1) aligns rows on the index; the two derived frames
# carry a fresh 0..n-1 index, so this assumes df also has one - confirm.
dfs = [df['Survived'], df_scaled_numeric_columns, df_encoded_categorical_columns]
print('Before concatenation to dfTyped, df[\'Survived\']: ', df['Survived'])
print('Before concatenation to dfTyped, df[Age]: ', df['Age'])
print('Before concatenation to dfTyped, df_numeric_columns: ', df_numeric_columns)
print('Before concatenation to dfTyped, df_scaled_numeric_columns: ', df_scaled_numeric_columns)
dfTyped = pd.concat(dfs, axis=1)
print('dfTyped: ', dfTyped)
print('dfTyped[Age]: ', dfTyped['Age'])

# Preprocessing strategy used above
# (guide: https://scikit-learn.org/stable/modules/preprocessing.html):
#   - Split the dataframe into numeric and categorical columns
#   - StandardScaler on the numeric columns
#   - OneHotEncoder on the categorical columns
#   - No other preprocessing; just enough to get training working

# Initial Data Frame is now preprocessed in dfPreprocessed
dfPreprocessed = dfTyped
print('dfPreprocessed: ', dfPreprocessed)

# Split DataFrame for training now that it is pre-processed:
# df_x holds every column except the target, df_y holds only the target column
target_column_name = 'Survived'
df_x = dfPreprocessed.drop([target_column_name], axis=1)
df_y = dfPreprocessed.filter([target_column_name], axis=1)
print("See df_x", df_x)
print("See df_y", df_y)
# Register Pandas Dataframe of base df_x and df_y
Dataset.Tabular.register_pandas_dataframe(df_x, datastore, "Titanic Feature Column Data for train_test_split usage")
Dataset.Tabular.register_pandas_dataframe(df_y, datastore, "Titanic Target Column Data for train_test_split usage")

In [None]:
# Save feature names to pass to Experiment runner
import json

# Plain feature names: scaled numeric columns first, then the one-hot columns
features = [str(name) for name in [*df_numeric_column_names, *df_encoded_categorical_columns.columns]]

# Encode the names into a compact JSON array string like '["a","b"]'.
# json.dumps escapes quotes/backslashes inside names (the previous hand-built
# string did not) and leaves `features` holding the unquoted names.
# ensure_ascii=False keeps non-ASCII names as-is instead of \uXXXX escapes.
featuresEncoded = json.dumps(features, separators=(',', ':'), ensure_ascii=False)
print(featuresEncoded)

In [None]:
# Hold out 20% of the preprocessed rows for testing and register all four splits.
from sklearn.model_selection import train_test_split

# df_x carries every column except the target; df_y carries only the target.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=0,
)

# Register the splits (same order as before: features first, then targets)
split_registrations = [
    (X_train, "Titanic Feature Column Data for training"),
    (X_test, "Titanic Feature Column Data for testing"),
    (y_train, "Titanic Target Column Data for training"),
    (y_test, "Titanic Target Column Data for testing"),
]
for split_frame, registered_name in split_registrations:
    Dataset.Tabular.register_pandas_dataframe(split_frame, datastore, registered_name)

In [None]:
# Submit the training script as a local Azure ML experiment run
from azureml.core import ScriptRunConfig
import datetime

# Experiment the run is recorded under
experiment_name = 'Local_Training_AzureML'
experiment = Experiment(workspace=ws, name=experiment_name)

# Where and what to run
compute_target = 'local'
source_directory = './scripts'
script_name = 'localTrainingAzureML.py'
dataset_name = 'automlbook Titanic Training Data A'

# Output file name, e.g. 'DecisionTreeClassifier_Titanic_local-2022-04-17_21-40-36.114550.pkl':
# the current timestamp with spaces turned into '_' and colons into '-'.
timestamp = str(datetime.datetime.now()).replace(' ', '_').replace(':', '-')
suffix = 'local-' + timestamp
out_model_file_name = 'DecisionTreeClassifier_Titanic_' + suffix + '.pkl'

# Command-line options for the training script, flattened into
# [flag, value, flag, value, ...]
argument_pairs = [
    ("--tenant-id", tenant_id),
    ("--ws-name", ws_name),
    ("--subscription-id", subscription_id),
    ("--resource-group", resource_group),
    ("--datastore-name", datastore_name),
    ("--out-model-file-name", out_model_file_name),
    ("--features", featuresEncoded),
]
script_arguments = [token for pair in argument_pairs for token in pair]

scriptRunConfig = ScriptRunConfig(
    source_directory=source_directory,
    script=script_name,
    arguments=script_arguments,
    environment=user_managed_env,
    compute_target=compute_target,
)

# Kick the run off and show the progress widget
AzureML_run = experiment.submit(scriptRunConfig)
RunDetails(AzureML_run).show()

In [None]:
# Register Model from the AzureML_run
from azureml.core import Model
import sklearn
import time

# The registered model is a classifier (see the "task" tag and the model file
# name), so the description says "Classification" rather than "Regression".
description = "Best Local AzureML Classification Run using Titanic Sample Data."
# NOTE(review): the "metric" tag names a regression metric although the task is
# classification - confirm which metric the training script actually logs.
tags = {
  "project" : "Local Training AzureML", 
  "creator": "fox", 
  "task": "classification", 
  "dataset": "automlbook Titanic Training Data A", 
  "metric": "normalized_root_mean_squared_error"
}

# Registration fails until the run has produced its output file, so retry with
# a pause between attempts. The retry is bounded and catches only Exception:
# a permanent failure eventually surfaces with its real error, and interrupting
# the kernel stops the loop instead of being swallowed.
MAX_REGISTER_ATTEMPTS = 60
REGISTER_RETRY_SECONDS = 60

# NOTE(review): model_path points at the whole './outputs' folder rather than
# the single model file - confirm this is what the later download/load expects.
for register_attempt in range(1, MAX_REGISTER_ATTEMPTS + 1):
    try:
        AzureML_run.register_model(
            model_path='./outputs',
            model_name=out_model_file_name,
            description=description,
            tags=tags,
            model_framework=Model.Framework.SCIKITLEARN,   # Framework used to create the model.
            model_framework_version=sklearn.__version__)   # Version of scikit-learn used to create the model.
        break
    except Exception as register_error:
        if register_attempt == MAX_REGISTER_ATTEMPTS:
            # Out of attempts: let the underlying error propagate
            raise
        print ("encountered exception registering model output file, waiting and trying again...")
        print ("attempt {} of {} failed with: {!r}".format(
            register_attempt, MAX_REGISTER_ATTEMPTS, register_error))
        time.sleep(REGISTER_RETRY_SECONDS)

In [None]:
# Pull the model explanation that belongs to the training run
from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(AzureML_run)

# Download the full explanation; to fetch only the k most important features
# (e.g. 4), call client.download_model_explanation(top_k=4) instead.
global_explanation = client.download_model_explanation()

# Ranked feature names and their ranked importance values
global_importance_names = global_explanation.get_ranked_global_names()
global_importance_values = global_explanation.get_ranked_global_values()
print(f'global importance values: {global_importance_values}')
print(f'global importance names: {global_importance_names}')

In [None]:
# Visualize explanations in the interactive dashboard
from raiwidgets import ExplanationDashboard
from azureml.core.model import Model
import joblib

# Earlier attempts (passing the Model object straight to joblib.load, and
# Model.get_model_path) were abandoned in favour of the approach below:
# look the registered model up by name, download it, then load it with joblib.
remote_model_obj = Model(ws, out_model_file_name)
print('Name:', remote_model_obj.name)
print('Version:', remote_model_obj.version)

# exist_ok=True lets the download reuse/overwrite a previous local copy
# NOTE(review): joblib.load needs a file path; if the model was registered as a
# folder, the download path may be a directory - confirm.
remote_model_path = remote_model_obj.download(exist_ok=True)
downloaded_model = joblib.load(remote_model_path)

# TODO? get local explanation uploaded, downloaded, and visualized as well,
#       for individual feature performance, etc...

# The dashboard gets the held-out feature frame as `dataset` and the matching
# held-out target frame as `true_y`.
ExplanationDashboard(
    global_explanation,
    downloaded_model,
    dataset=X_test,
    true_y=y_test,
)