In [None]:
import azureml.core
from azureml.core import Workspace, Dataset, Datastore, Environment
from azureml.core import Experiment
import azureml.interpret
from azureml.widgets import RunDetails

import pandas as pd
import numpy as np

print("SDK version:", azureml.core.VERSION)

In [None]:
# Authenticate interactively and fetch the Azure ML Workspace object
from azureml.core.authentication import InteractiveLoginAuthentication

# Workspace coordinates
# (the tenant id is listed under Azure Active Directory -> Properties)
tenant_id = '198c7d8c-e010-45ce-a018-ec2d9a33f58f'
subscription_id = '4d278f3d-b4fd-4fa2-86b6-d34b96bc888f'
resource_group = 'Foxy_Resources'
ws_name = 'automlbook'

# Authenticate against the tenant, then look the Workspace up by name
ia = InteractiveLoginAuthentication(tenant_id=tenant_id)
ws = Workspace.get(name=ws_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group,
                   auth=ia)

# One line per workspace attribute, mainly to confirm the login worked
workspace_summary = [
    "After authenticating to the workspace with Interactive Authentication:",
    "ws.name: " + ws.name,
    "ws.resource_group: " + ws.resource_group,
    "ws.location: " + ws.location,
    "ws.subscription_id: " + ws.subscription_id,
]
print("\n".join(workspace_summary))

In [None]:
# (Commented-out example code to authenticate with a Service Principal)
# (Purely to validate that this will work inside the script...)

# TODO? maybe? I could build a custom image with BuildKit, with "Secret mount type", and then have Azure get it out of Docker Repository
# Store authentication strings
# Store username and password to Service Principal in order to authenticate within the python script
# CRITICAL: You must have a file like this at resources/custom_env_vars_for_script_inside_docker_container (from project root)...
#       # Set AzureML Service Principal ID and Password
#       AML_PRINCIPAL_ID="<Principal ID, AKA clientId>"
#       AML_PRINCIPAL_PASS="<Principal Password, AKA clientSecret>"

# Authenticate with the Service Principal in order to get the Workspace object
# from azureml.core.authentication import ServicePrincipalAuthentication
# sp = ServicePrincipalAuthentication(tenant_id=tenant_id,
#                                     service_principal_id=kv.get_secret(name="localDockerAmlPrincipalId"), # clientId of service principal
#                                     service_principal_password=kv.get_secret(name="localDockerAmlPrincipalPass")) # clientSecret of service principal
# ws = Workspace.get(name=ws_name,
#                    subscription_id=subscription_id,
#                    resource_group=resource_group,
#                    auth=sp)
# print("After re-authenticating to the workspace with Service Principal", 
#         "ws.name: " + ws.name, 
#         "ws.resource_group: " + ws.resource_group, 
#         "ws.location: " + ws.location, 
#         "ws.subscription_id: " + ws.subscription_id, 
# sep='\n')

In [None]:
# (Print statements to list available environments)
# envs = Environment.list(workspace=ws)

# for env in envs:
    # if env.startswith("AzureML"):
        # print("Name",env)
        # if None != envs[env].python.conda_dependencies:
            # print("packages", envs[env].python.conda_dependencies.serialize_to_string())


In [None]:
# Fetch the workspace blob datastore by name.
# It is looked up directly by name; fetching the workspace default first is
# unnecessary because that result would be overwritten straight away.
datastore_name = 'workspaceblobstore'
datastore = Datastore.get(ws, datastore_name)

In [None]:
# Load the registered dataset from the Workspace
dataset_name = 'Diabetes Sample Full Transform'
# TODO? perhaps use instead (and perform following two transformations): 'automlbook Diabetes Sample A'
#       dfRaw['AGE'].mask(dfRaw.AGE > AgeMean, AgeMean)
#       dfRaw['BMI'] = np.where(dfRaw['BMI'] > 30, 1, 0)
dataset = Dataset.get_by_name(ws, dataset_name, version='latest')
dataset_columns = ['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'Y']

# Materialize the Dataset once as a pandas DataFrame; it is preprocessed in the following cells
df = dataset.to_pandas_dataframe()

# Show a sample of the data. A notebook only renders a cell's LAST expression,
# so the sample is taken from the already-loaded frame at the end of the cell
# (instead of a second, discarded download in the middle of it).
df.head(10)

In [None]:
# Partition the DataFrame into one frame of numeric columns and one of categorical columns

df_column_names = ['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']

##### Numeric columns
df_numeric_column_names = ['AGE', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
# Float features are cast explicitly. The builtin `float` is used because the
# `np.float` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
df_float_column_names = ['BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
df_float_columns = df[df_float_column_names].astype(float)
# AGE keeps whatever dtype it was loaded with (left as an integer column)
df_integer_column = df[['AGE']].copy()
# Concatenate the numeric DataFrames (AGE first, then the float columns)
df_numeric_columns = pd.concat([df_integer_column, df_float_columns], axis=1)


##### Categorical columns
df_categorical_column_names = ['SEX']
# Categorical features are cast to strings (builtin `str`; `np.str` was removed together with `np.float`)
df_categorical_columns = df[df_categorical_column_names].astype(str)

print('concatenated df_numeric_columns: ', df_numeric_columns)
print('df_categorical_columns: ', df_categorical_columns)

# Raw (pre-encoding) feature names: numeric columns first, then categorical columns
feature_column_names = [*df_numeric_column_names, *df_categorical_column_names]
print(feature_column_names)

In [None]:
# Assemble the typed DataFrame of all columns to submit to the training script.
# Column order: target ('Y') first, then the numeric features, then the categorical features.
dfs = [df['Y'], df_numeric_columns, df_categorical_columns]
dfTyped = pd.concat(objs=dfs, axis='columns')
print('dfTyped: ', dfTyped)

In [None]:
# Separate the (not yet transformed) typed frame into target and feature frames
target_column_name = 'Y'
# Target: a single-column DataFrame rather than a Series
df_y = dfTyped[target_column_name].to_frame()
# Features: everything except the target column
df_x = dfTyped.drop(columns=[target_column_name])
print("See df_y", df_y)

In [None]:
# Create preprocessor to preprocess numeric and categorical columns (with Transformer API via ColumnTransformer)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Numeric features: fill missing values with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Dense one-hot output is required so the result can be wrapped in a DataFrame.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` (the old
# keyword was removed in 1.4); older releases reject the new keyword with a
# TypeError, so fall back to the legacy spelling there.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical features: fill missing values with the literal 'missing', then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', one_hot_encoder)])

# Route each column group to its transformer; output columns follow this order (num, then cat)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, df_numeric_column_names),
        ('cat', categorical_transformer, df_categorical_column_names)])

In [None]:
# (Validate the Pipeline will function once passed to the script)

# Fit the encoder directly on the categorical columns so the encoded column names can be read from it
one_hot_encoder.fit(df_categorical_columns)
# Get new One Hot Encoded column names.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use whichever the installed version offers.
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    encoded_categorical_column_names = one_hot_encoder.get_feature_names_out(df_categorical_column_names)
else:
    encoded_categorical_column_names = one_hot_encoder.get_feature_names(df_categorical_column_names)
# Same ordering as the ColumnTransformer output: numeric columns, then encoded categorical columns
encoded_feature_names = [*df_numeric_column_names, *encoded_categorical_column_names]
print(str(encoded_feature_names))

preprocessor.fit(df_x)

In [None]:
# Apply the fitted preprocessor to the feature columns.
# The raw features are re-derived from dfTyped on every execution so that this
# cell can safely be re-run: transforming `df_x` in place would feed already
# encoded columns back into the preprocessor on the second run.
raw_feature_frame = dfTyped.drop(columns=[target_column_name])
print(raw_feature_frame)
# Keep the original row index so the features stay aligned with df_y when the
# two frames are later concatenated column-wise.
df_x = pd.DataFrame(preprocessor.transform(raw_feature_frame),
                    columns=encoded_feature_names,
                    index=raw_feature_frame.index)
print(df_x)

In [None]:
# Register the complete (unsplit) feature and target DataFrames as tabular datasets
full_feature_data_name = "Diabetes Feature Column Data for train_test_split usage (Docker Environment)"
full_target_data_name = "Diabetes Target Column Data for train_test_split usage (Docker Environment)"
Dataset.Tabular.register_pandas_dataframe(df_x, datastore, full_feature_data_name)
Dataset.Tabular.register_pandas_dataframe(df_y, datastore, full_target_data_name)

In [None]:
# TODO remove me # Wrap a regressor in a scikit-learn Pipeline
from sklearn.tree import DecisionTreeRegressor

# The pipeline built here has a single step named 'regressor'
regressor = DecisionTreeRegressor()
regressor_pipeline = Pipeline(steps=[('regressor', regressor)])
# The regressor_pipeline is later de-pickled and used in the Experiment Run's python script

# Dump the pipeline's attributes for inspection
print(vars(regressor_pipeline))
# Note: for a pipeline that also contains a 'preprocessor' step, its ColumnTransformers could be reached like this...
# print(regressor_pipeline['preprocessor'].transformers[1][1][1].get_feature_names(df_categorical_column_names))

In [None]:
# TODO remove me # Serialize the scikit-learn Pipeline so the script can load it
# Background reading on pickling:
# https://codefather.tech/blog/python-pickle/#:~:text=%20Python%20Pickle%3A%20Serialize%20Your%20Objects%20%20,The%20pickle%20module%20also%20allows%20to...%20More%20
import os
import pickle

# In-memory pickle bytes of the pipeline
pickled_pipeline = pickle.dumps(regressor_pipeline)
# print(pickle.loads(pickled_pipeline))

# Persist the same bytes under the script's resources directory
os.makedirs('./scripts/resources', exist_ok=True)
with open('./scripts/resources/regressor_pipeline.pickle', 'wb') as pickle_file:
    pickle_file.write(pickled_pipeline)

# Snippet to copy into the script to unpickle the scikit-learn Pipeline:
# with open('regressor_pipeline.pickle', 'rb') as file:
#     unpickled_pipeline = pickle.load(file)
#     print(unpickled_pipeline)

In [None]:
# Split data into training and test data, register the resulting Datasets with Azure
from sklearn.model_selection import train_test_split

# train_test_split needs two frames: X (every column except the target) and y (only the target column)
X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.2, random_state=0)

# Recombine features and target per partition
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)
print(test_data)

# print("See y_test", y_test)
# print("See y_test.columns.tolist()", str(y_test.columns.tolist()))
# print("See y_test.values.tolist() to pass as true_ys to ExplanationClient.upload_model_explanation()", y_test.values.tolist())
# print("See y_test.values.ravel() to pass as true_ys to ExplanationClient.upload_model_explanation()", y_test.values.ravel())
# print("See y_test.values.tolist().flatten() to pass as true_ys to ExplanationClient.upload_model_explanation()", y_test.values.flatten())

# Registered names of the split (feature-only / target-only) datasets
X_train_registered_name = "Diabetes Feature Column Data for training (Docker Environment)"
X_test_registered_name = "Diabetes Feature Column Data for testing (Docker Environment)"
y_train_registered_name = "Diabetes Target Column Data for training (Docker Environment)"
y_test_registered_name = "Diabetes Target Column Data for testing (Docker Environment)"

# Registered names of the combined (feature and target columns) datasets
train_data_registered_name = "Diabetes Training Data (Docker Environment)"
test_data_registered_name = "Diabetes Training Test Data (Docker Environment)"

# Register the datasets. Registration of X_train and y_train is currently
# disabled, although their names are still listed in trainTestDataSetNames below.
for frame_to_register, registered_name in (
        # (X_train, X_train_registered_name),
        (X_test, X_test_registered_name),
        # (y_train, y_train_registered_name),
        (y_test, y_test_registered_name),
        (train_data, train_data_registered_name),
        (test_data, test_data_registered_name)):
    Dataset.Tabular.register_pandas_dataframe(frame_to_register, datastore, registered_name)

trainTestDataSetNames = [
    X_train_registered_name,
    X_test_registered_name,
    y_train_registered_name,
    y_test_registered_name,
    train_data_registered_name,
    test_data_registered_name,
]

In [None]:
# Keep the post-encoding feature names (numeric first, then one-hot columns) for the TabularExplainer
features = list(df_numeric_column_names) + list(encoded_categorical_column_names)

In [None]:
# Encode Experiment script arguments lists into strings like '["a","b"]'
#
# json.dumps with compact separators yields exactly that shape, and also
# escapes quotes/backslashes inside names. The source lists are never modified,
# so re-running this cell gives the same result (quoting the list elements in
# place would add another layer of quotes on every run).
import json

# Encode numeric column names list
numericFeatureNamesEncoded = json.dumps(
    [str(name) for name in df_numeric_column_names],
    separators=(',', ':'), ensure_ascii=False)
# print("numericFeatureNamesEncoded:", numericFeatureNamesEncoded)

# Encode categoric column names list (str() also normalizes any numpy string scalars)
categoricFeatureNamesEncoded = json.dumps(
    [str(name) for name in encoded_categorical_column_names],
    separators=(',', ':'), ensure_ascii=False)
# print("categoricFeatureNamesEncoded:", categoricFeatureNamesEncoded)

# Encode split dataset names list
trainTestDataSetNamesEncoded = json.dumps(
    [str(name) for name in trainTestDataSetNames],
    separators=(',', ':'), ensure_ascii=False)
print("splitDatasetNamesEncoded:", trainTestDataSetNamesEncoded)

In [None]:
# Set up the local Docker Environment (with system-managed dependencies, via Conda)
#
# Background on Environments and Docker-based Environments:
#       https://docs.microsoft.com/en-us/azure/machine-learning/concept-environments
#       https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.environment(class)?view=azure-ml-py
#       ! IMPORTANT: https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/environment/
#
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DockerConfiguration

# Run configuration that requests Docker via the 'use_docker' param, see:
#           https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfig.dockerconfiguration?view=azure-ml-py
docker_config = DockerConfiguration(use_docker=True)

# New Environment whose Python dependencies are NOT user managed
docker_env = Environment("docker-env")
docker_env.python.user_managed_dependencies = False
print("initial base image from base docker-env Environment: ", docker_env.docker.base_image)

# Dockerfile (as a raw string) for the custom image used by the Environment.
# As written it: starts from an AzureML OpenMPI / Ubuntu 18.04 base image,
# installs build tooling and development libraries with apt-get, pip-installs
# the AzureML / explanation packages, and finally runs `conda env create`
# against "conda.yml" while tailing a pip log file in the background.
# NOTE(review): "conda.yml" is presumably placed in the build context by AzureML - confirm.
dockerfile = r"""
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20220314.v1

ARG DEBIAN_FRONTEND=noninteractive
ARG ACCEPT_EULA=Y
RUN ls -altr
RUN apt-get update -y && apt-get upgrade -y &&\
    apt-get install -y build-essential \
                       cmake \
                       curl \
                       gfortran \
                       git \
                       jupyter \
                       libatlas-base-dev \
                       libblas-dev \
                       libbz2-dev \
                       libffi-dev \
                       libgdbm-dev \
                       liblapack-dev \
                       liblzma-dev \
                       libncurses5-dev \
                       libncursesw5-dev \
                       libreadline-dev \
                       libsqlite3-dev \
                       libssl-dev \
                       libxml2-dev \
                       libxmlsec1-dev \
                       llvm \
                       lzma \
                       lzma-dev \
                       make \
                       tcl-dev \
                       tk-dev \
                       wget \
                       xz-utils \
                       zlib1g-dev

RUN conda -V
RUN echo "Hello from custom container!" > ~/hello.txt
RUN pip install azureml.interpret azureml-dataset-runtime azureml.train azureml-train-automl jinja2 MarkupSafe raiwidgets python-dotenv pybridge 
RUN export PIP_LOG="/tmp/pip_log.txt" && touch ${PIP_LOG} && tail -f ${PIP_LOG} & conda env create -f "conda.yml" && killall tail && rm ${PIP_LOG}
"""

# NOTE: a Dockerfile string can also be passed to the docker build command via stdin, like this:
#
# sudo docker build -t myimage:latest -<<EOF
# FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20220314.v1
# RUN echo "hello world"
# EOF


# Set base image to None, because the image is defined by the Dockerfile string.
docker_env.docker.base_image = None
# Have the Environment build its image from the Dockerfile string defined above.
docker_env.docker.base_dockerfile = dockerfile

#       For help, try reading this
#                    - https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/environment/#custom-docker-image--dockerfile
#       May also try using other base Docker images from these container registries
#                - https://github.com/microsoft/containerregistry
#                - https://github.com/Azure/AzureML-Containers
# env.docker.base_image = '<image-name>'
# env.docker.base_image_registry.address = '<container-registry-address>'
# env.docker.base_image_registry.username = '<acr-username>'
# env.docker.base_image_registry.password = os.environ.get("CONTAINER_PASSWORD")
#
# TODO? use this link to get username and password from Azure KeyVault:
#           https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/environment/#use-keyvault-to-pass-secrets
# TODO? use this code to set username and password:
# # Retrieve username and password from the workspace key vault
#       env.docker.base_image_registry.username = ws.get_default_keyvault().get_secret("username")  
#       env.docker.base_image_registry.password = ws.get_default_keyvault().get_secret("password")

In [None]:
# Declare the conda and pip packages for the Environment (scikit-learn comes from conda)
conda_packages = ['pip', 'pyspark', 'scikit-learn']

# TODO get 'azureml-train-automl' installed, the docker build times out for some reason when I added that to pip_packages
#           Help: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-troubleshoot-environments
# TODO try removing packages until maybe the docker image can build,
#  (wow, getting stopped at installed azureml.train.automl in environment...)
pip_packages = [
    'azureml.interpret',
    'azureml-dataset-runtime',
    'azureml.train',
    'azureml.train.automl',
    'jinja2',
    'MarkupSafe',
    'raiwidgets',
    'python-dotenv',
    'pybridge',
]

# Attach the combined dependency specification to the Docker Environment
condaDependencies = CondaDependencies.create(pip_packages=pip_packages,
                                             conda_packages=conda_packages)
docker_env.python.conda_dependencies = condaDependencies

In [None]:
# Register the Docker Environment with the Workspace, then build its image locally
registered_docker_env = docker_env.register(ws)
print(registered_docker_env)

# Local builds need the current (non-root) user to be allowed to talk to docker, see:
#           https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user
#           (Actually, this was not enough it seems, I am getting no feedback here, hm.)
#           (Then I tried: sudo chmod 777 /var/run/docker.sock, I think 770 is enough because of the docker group owning /var/run/docker.sock at 660 initially)

# Write the Environment definition to disk
registered_docker_env.save_to_directory('environment_out', overwrite=True)

# If this fails, make sure docker service is running
local_build_options = {'useDocker': True, 'pushImageToWorkspaceAcr': True}
registered_docker_env.build_local(ws, **local_build_options)

In [None]:
# TODO Have the scriptRunConfig be used to run an AutoML performing script
# Prepare to run AutoML Training Experiment in Docker Environment (With Docker running on local device)
# TODO? maybe turn off this featurization to leave previous featurization or to enhance featurization yourself

from azureml.core import ScriptRunConfig
import datetime

# Where and what to run
compute_target = 'local'
source_directory = './scripts'
script_name = 'diabetesDockerRegressionTrainingAutoML.py'
dataset_name = 'Diabetes Sample Full Transform'

# Output model file name, e.g.
# 'AutoMLRegression_BestModel_Diabetes_Docker_local-2022-04-17_21-40-36.114550.pkl'
# (timestamp with '_' between date and time, and '-' instead of ':')
suffix = 'local-' + datetime.datetime.now().isoformat(sep='_').replace(':', '-')
out_model_file_name = 'AutoMLRegression_BestModel_Diabetes_Docker_{}.pkl'.format(suffix)

# Flag -> value pairs, flattened below into the argv-style list ScriptRunConfig expects.
# The three list-valued arguments are passed as encoded strings like '["a","b"]':
#   --numeric-feature-names / --categoric-feature-names: feature name lists
#   --x-train-test-y-train-test-combined-train-test: the name of each registered dataset
script_argument_pairs = {
    "--tenant-id": tenant_id,
    "--ws-name": ws_name,
    "--subscription-id": subscription_id,
    "--resource-group": resource_group,
    "--datastore-name": datastore_name,
    "--out-model-file-name": out_model_file_name,
    "--numeric-feature-names": numericFeatureNamesEncoded,
    "--categoric-feature-names": categoricFeatureNamesEncoded,
    "--x-train-test-y-train-test-combined-train-test": trainTestDataSetNamesEncoded,
}
script_arguments = [token
                    for flag, value in script_argument_pairs.items()
                    for token in (flag, value)]
print("ScriptRunConfig arguments: ", script_arguments)

scriptRunConfig = ScriptRunConfig(
    source_directory=source_directory,
    script=script_name,
    arguments=script_arguments,
    environment=registered_docker_env,
    docker_runtime_config=docker_config)

In [None]:
# Submit Experiment Run to Docker environment
#
# (see more on use of Docker environment:
#   https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/train-on-local/train-on-local.ipynb
# )
import subprocess
# TODO? Add minconda bin to path within docker container? "/home/johna/miniconda3/bin:"

# import getpass
# If you need, you can get a password from user input (Notebook pauses to show a prompt here)
# password = getpass.getpass()

# ---------------------------------------------------------------------------
# Running the docker diagnostics with sudo (if that is necessary for you):
# https://askubuntu.com/questions/155791/how-do-i-sudo-a-command-in-a-script-without-being-asked-for-a-password
#
#   1. Create a script with the command to run with sudo, for example:
#          docker system info
#   2. Run sudo chown and chmod commands to grant access of the file to root
#          sudo chown root:root ~/docker_system_info.sh
#          sudo chmod 700 ~/docker_system_info.sh
#   3. Run sudo visudo and insert a line below the line `%sudo   ALL=(ALL:ALL) ALL`
#          [username]  ALL=(ALL) NOPASSWD: /home/[username]/docker_system_info.sh
#   4. Then call the script with your python subprocess command
#          p = subprocess.run('sudo ~/docker_system_info.sh', shell=True)
#
# Docker troubleshooting notes:
#   [Install Ubuntu](https://docs.docker.com/engine/install/ubuntu/)
#   [Uninstall Docker Engine](https://docs.docker.com/engine/install/ubuntu/#uninstall-docker-engine)
#   WARNING! When I ran this command there is a failure to uninstall and purge all docker engine
#            apt-get packages `sudo apt-get purge docker-ce docker-ce-cli containerd.io docker-compose-plugin`
#
#   Get /usr/bin/docker: Permission denied? (was not enough for me)
#       See (https://adamtheautomator.com/docker-permission-denied/)
#           (perhaps I need apt dependencies added to Dockerfile string I passed(??))
#       Along with  (https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/environment/)
#   TODO! update for my laptop: still not getting any logs past
#         "nvidia-docker is installed on the target. Using nvidia-docker for docker operations"
#
#   NOTE - NEXT... Try using custom image from your own Docker image repository
#                  (perhaps allow docker to run by johna user normally, without any black magic)
#
#   LAST DITCH EFFORTS - If you are stuck, uninstall and install docker, then install different
#                        Linux version for WSL, then try reinstall WSL
#
#   NOTE - How to get past error: (perhaps uninstall nvidia-docker)
#          nvidia-docker is installed on the target. Using nvidia-docker for docker operations.
#
#   You may want to follow this guide to install the Docker engine into Ubuntu:
#          (https://docs.docker.com/engine/install/ubuntu/)
#   Post install steps:
#          (https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user)
#   Alternative way to install:
#          https://github.com/docker/docker-install
#
# Experiment troubleshooting notes:
#   If you get any errors, in AMLS go to Jobs -> Click Experiment then Run from list -> Look for "Environment",
#       There should be a hyperlink to a page for the Environment used for that run!
#       There should be a Docker Build Log you can access
#       You should be able to trigger a build of the Docker image from the Environment's main page
#   I previously got an error message including "GPU", because of a --gpu flag; used the instructions
#   at this link to get past that:
#       (https://docs.nvidia.com/cuda/wsl-user-guide/index.html)
#   If script is failing at authentication, follow this link for help:
#       (https://docs.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication)
# ---------------------------------------------------------------------------

# Check if Docker is installed and Linux containers are enabled
docker_is_installed = subprocess.run("docker -v", shell=True).returncode == 0
if docker_is_installed:
    subprocess.run("service docker status", shell=True)

    # Purely diagnostic output: a missing or failing helper script must not
    # prevent the Experiment Run from being submitted, and the output is decoded
    # leniently because it is not guaranteed to be pure ASCII.
    # out_docker_system_info = subprocess.run("docker system info", shell=True)
    # out_docker_system_info = subprocess.check_output('sudo su && ~/docker_system_info.sh', shell=True).decode('ascii')
    try:
        out_docker_system_info = subprocess.check_output(
            '~/docker_system_info.sh', shell=True).decode('utf-8', errors='replace')
        print(out_docker_system_info)
    except subprocess.CalledProcessError as diagnostics_error:
        print("Could not collect docker system info (diagnostic only, continuing): ", diagnostics_error)

    # Run Experiment in Docker environment
    experiment_name = 'Diabetes_Docker_Regression_Training_AutoMLScriptRun'
    experiment = Experiment(workspace=ws, name=experiment_name)

    # ScriptRunConfig usage to create a Run
    ScriptRunConfig_run = experiment.submit(scriptRunConfig)
    RunDetails(ScriptRunConfig_run).show()
else:
    # Make the skipped submission visible instead of silently doing nothing
    print("`docker -v` failed: Docker does not appear to be installed, the Experiment Run was NOT submitted.")

In [None]:
# (While Experiment script runs, Validate the Engineered Feature Explanation will function inside the script)

# Commented-out manual walk-through of fitting each transformer on the test split:
# Split training features into numeric and categoric dataframes
# numeric_X_test = pd.DataFrame(X_test[df_numeric_column_names], dtype=np.str, columns=df_numeric_column_names)
# categoric_X_test = pd.DataFrame(X_test[df_categorical_column_names], dtype=np.str, columns=df_categorical_column_names)
# # Fit and Run the numeric and categoric ColumnTransformers on the split dataframes to perform feature engineering
# preprocessor.transformers[0][1].fit(numeric_X_test)
# preprocessor.transformers[1][1].steps[0][1].fit(categoric_X_test)
# numeric_X_test_preprocessed = preprocessor.transformers[0][1].transform(numeric_X_test)
# numeric_X_test_preprocessed = pd.DataFrame(numeric_X_test_preprocessed, dtype=np.float, columns=df_numeric_column_names)
# categoric_X_test_preprocessed = preprocessor.transformers[1][1].steps[0][1].transform(categoric_X_test)
# # Fit OneHotEncoder
# preprocessor.transformers[1][1].steps[1][1].fit(categoric_X_test_preprocessed)
# # Get new One Hot Encoded column names
# print(categoric_X_test_preprocessed)
# print(df_categorical_column_names)

# Read the encoded column names from the encoder that was fitted as part of the
# ColumnTransformer (`named_transformers_` holds the fitted transformers;
# `transformers` only holds the templates that were passed to the constructor).
fitted_one_hot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever the installed version offers.
if hasattr(fitted_one_hot_encoder, 'get_feature_names_out'):
    df_encoded_categorical_column_names = fitted_one_hot_encoder.get_feature_names_out(df_categorical_column_names)
else:
    df_encoded_categorical_column_names = fitted_one_hot_encoder.get_feature_names(df_categorical_column_names)
# print("df_encoded_categorical_column_names", df_encoded_categorical_column_names)
# # Transform categoric, null-imputed features with fitted OneHotEncoder
# categoric_X_test_preprocessed = preprocessor.transformers[1][1].steps[1][1].transform(categoric_X_test_preprocessed)
# # Turn preprocessed categoric features into a DataFrame
# categoric_X_test_preprocessed = pd.DataFrame(categoric_X_test_preprocessed, dtype=np.float64, columns=df_encoded_categorical_column_names)

# # Combine the numeric DF with the categorical DF to submit to the AutoML training experiment
# X_test_preprocessed_list = [numeric_X_test_preprocessed, categoric_X_test_preprocessed]
# X_test_preprocessed = pd.concat(X_test_preprocessed_list, axis=1)


# Save engineered features' names to create TabularExplainer with them
engineeredFeatures = [*df_numeric_column_names, *df_encoded_categorical_column_names]
print(engineeredFeatures)

In [None]:
# TODO Register best Model from the AutoML run
description = "Best AutoML Regression Run using Docker Environment and Diabetes Sample Data. This model limits Age values to 3 std from mean. This model sets BMI > 30 to 1, BMI <= 30 to 0."
tags = {"project" : "AutoML Book Diabetes (Docker)", "creator": "fox", "task": "regression", "dataset": "Diabetes Sample Full Transform", "metric": "normalized_root_mean_squared_error"}


from azureml.core import Model
import sklearn
import time

# Block until the run has finished. This is deliberately outside the retry loop:
# if the run itself failed, the error should surface immediately rather than
# being retried forever.
ScriptRunConfig_run.wait_for_completion()

# Attempt to register the model once the output model file is available.
# Retries are bounded and only catch `Exception`, so a persistent failure is
# eventually reported and the cell can still be interrupted from the keyboard.
max_registration_attempts = 5
retry_delay_seconds = 60
for registration_attempt in range(1, max_registration_attempts + 1):
    try:
        # Register Model from the ScriptRunConfig_run
        ScriptRunConfig_run.register_model(model_path='./outputs', model_name=out_model_file_name, description=description, tags=tags)
        break
    except Exception as registration_error:
        if registration_attempt == max_registration_attempts:
            # Out of attempts: let the real error propagate
            raise
        print("encountered exception registering model output file, waiting and trying again...", registration_error)
        time.sleep(retry_delay_seconds)

In [None]:
# Get and plot output Best Model metrics

# Get all metris logged in the run
metrics = ScriptRunConfig_run.get_metrics()
print("metrics: ", metrics)

# Get the metrics that were logged from the run of the training script
print("metrics['r2']: " + str(metrics['r2']))
print("metrics['rsme']: " + str(metrics['rsme']))

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.plot(metrics['r2'], metrics['rsme'], marker='o')
plt.ylabel("rsme")
plt.xlabel("r2")
#
# You can also list all the files that are associated with this run record
#
print("Here are the files associated with the Azure AutoML Run: ", ScriptRunConfig_run.get_file_names())

In [None]:
# For the Experiment Run that was run, get Global Explanations (downloaded through the Experiment Run object!)
from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(ScriptRunConfig_run)

# List the explanations available for the run
# TODO? There should be a way to download_model_explanation by comment instead of id.
print(client.list_model_explanations())

# IMPORTANT: this step will fail until you copy the explanation_id value from the printed list of explanations
engineered_test_explanation_id = '7e81718e-4481-4dcb-a269-0db977df3436'
engineered_global_explanation_test = client.download_model_explanation(
    explanation_id=engineered_test_explanation_id)
# engineered_global_explanation_train = client.download_model_explanation(explanation_id='ab8b34cd-6f16-4718-b148-08e2635110e4')
# global_explanation = client.download_model_explanation(explanation_id='86861485-00ae-42f0-8bab-3ae41556c6a9')

# Or only get the top k (e.g., 4) most important features with their importance values
# explanation = client.download_model_explanation(top_k=4)

# Ranked global importance values and the matching feature names
global_importance_values = engineered_global_explanation_test.get_ranked_global_values()
global_importance_names = engineered_global_explanation_test.get_ranked_global_names()
print(f'global importance values: {global_importance_values}')
print(f'global importance names: {global_importance_names}')

In [None]:
# Download the Model from AzureML and Visualize Explanations with it # TODO? Get this cell working
from raiwidgets import ExplanationDashboard
from azureml.core.model import Model
from interpret.ext.blackbox import TabularExplainer
import joblib
import jinja2

# print('Model.list(ws)', Model.list(ws))

# Look the registered model up by name, download it and load it with joblib
remote_model_obj = Model(ws, out_model_file_name)
print('Name:', remote_model_obj.name)
print('Version:', remote_model_obj.version)
remote_model_path = remote_model_obj.download(exist_ok=True)
downloaded_model = joblib.load(remote_model_path)

# BEGIN Access "Local Explanations"
# (Local Explanation meaning "of individual predictions")
# "features" and "classes" fields are optional
explainer = TabularExplainer(downloaded_model, X_test, features=features)

# Explain the first few data points in the test set
first_test_rows = X_test[0:5]
local_explanation = explainer.explain_local(first_test_rows)

# Sorted feature importance names and values
sorted_local_importance_names = local_explanation.get_ranked_local_names()
sorted_local_importance_values = local_explanation.get_ranked_local_values()
print('sorted_local_importance_names: ', sorted_local_importance_names)
print('len(sorted_local_importance_names): ', len(sorted_local_importance_names))
print('sorted_local_importance_values: ', sorted_local_importance_values)
print('len(sorted_local_importance_values): ', len(sorted_local_importance_values))
# COOL THING TO DO: Sometime could get local explanation of specific data points uploaded, downloaded, and visualized as well...
# END Access "Local Explanations"

# Visualize explanations
# Be sure to pass dataset=(test feature columns Dataframe) and true_y=(test target column Dataframe)
#       1) getting the raiwidgets thing working
#       2) see README at https://github.com/interpretml/interpret
ExplanationDashboard(engineered_global_explanation_test, downloaded_model, dataset=X_test, true_y=y_test)