# Walgreens Boots AML & Pipeline

In [None]:
import azureml.core
# Show the installed Azure ML SDK version as this cell's output.
azureml.core.VERSION

# Setup

These cells should be run in advance and should not be visible during the demo.

In [None]:
# Global constants
# Azure identifiers passed to Workspace(...) when no saved workspace config can be loaded.
subscription_id = ''
resource_group  = ''
workspace_name  = ''
# Name handed to Experiment(ws, ...).
experiment_name = 'walgreens-boots-propensity'
# Name of the compute cluster that is looked up, or created if missing.
cluster_name = 'cpucluster'
# Local directory that receives the generated get_data.py / register.py scripts.
project_folder = 'scripts'

# Number of product categories; one data script and one pair of pipeline steps is generated per category.
PRODUCT_CATEGORIES = 10

# AML

In [None]:
# Key open source data analysis packages
from IPython.display import display, HTML

import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

%matplotlib inline
# color_codes is an on/off flag: pass a real boolean instead of the (merely truthy) string 'True'.
sns.set(color_codes=True)

In [None]:
# Working directories
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(project_folder, exist_ok=True)

In [None]:
%%writefile $project_folder/get_data.py

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def get_data():
    """Load the training CSV and split it into features and an encoded label.

    Reads the CSV from its fixed location on the compute node, label-encodes
    the target column, and returns a dict with the remaining columns under
    "X" and the encoded target under "y".
    """
    target = 'BOUGHT_CATEGORY_FNN'
    frame = pd.read_csv('/tmp/azureml_runs/boots/data-latest.csv')

    # Fit the encoder on the target values and encode them in a single step.
    labels = LabelEncoder().fit_transform(frame[target].values)

    # Everything except the target column is a feature.
    features = frame.drop([target], axis=1)

    return { "X" : features, "y" : labels }

In [None]:
%%writefile $project_folder/register.py

from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core import Run
import argparse
import json

if __name__ == '__main__':
    # Command-line interface: the name to register under and where the model lives.
    cli = argparse.ArgumentParser()
    cli.add_argument('--model_name', type=str, default='',
                     help='Variant name you want to give to the model.')
    cli.add_argument('--model_path', type=str, default='outputs',
                     help='Location of trained model.')

    # parse_known_args ignores any extra arguments instead of failing on them.
    args, _ = cli.parse_known_args()
    for option_value in (args.model_name, args.model_path):
        print(option_value)

    # The workspace is resolved from the run this script executes in.
    run = Run.get_context()
    workspace = run.experiment.workspace

    # Tag the model with the id of the run that registered it.
    tags = {"runId": str(run.id)}
    print(json.dumps(tags))

    model = Model.register(workspace,
                           model_name=args.model_name,
                           model_path=args.model_path,
                           tags=tags)

    summary = 'Model registered: {} \nModel Description: {} \nModel Version: {}'
    print(summary.format(model.name, model.description, model.version))

In [None]:
# Read the data in to analyze
df = pd.read_csv('./data-latest.csv')

# Re-order columns for demo: general properties first, then columns named
# CATEGORY_<digits>..., and the FNN purchase label last.
props = [c for c in df.columns
         if not c.startswith('BOUGHT') and re.match(r'CATEGORY_\d+', c) is None]
value = [c for c in df.columns if re.match(r'CATEGORY_\d+', c) is not None]
df = df[props + value + ['BOUGHT_CATEGORY_FNN']]
# Drop rows whose label is 'U'.
# NOTE(review): 'U' presumably means "unknown" — confirm against the data dictionary.
df = df[df['BOUGHT_CATEGORY_FNN'] != 'U']

In [None]:
# Setup to Azure Machine Learning
from azureml.core import Run
from azureml.core.compute import AksCompute, ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.container_registry import ContainerRegistry
from azureml.core.experiment import Experiment
from azureml.core.runconfig import DataReferenceConfiguration, RunConfiguration
from azureml.core.webservice import AciWebservice
from azureml.core.workspace import Workspace
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData, PublishedPipeline, PipelineRun, Schedule, TrainingOutput
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.steps import PythonScriptStep
from azureml.train.automl import AutoMLConfig, AutoMLStep
from azureml.train.automl.runtime.automlexplainer import retrieve_model_explanation
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails

import azureml

# Connect to Azure Machine Learning
try:
    # Prefer a previously saved workspace config file.
    ws = Workspace.from_config()
except Exception:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate. Fall back to the explicit identifiers from the
    # global constants and save a config file for later runs.
    ws = Workspace(subscription_id = subscription_id,
                   resource_group = resource_group,
                   workspace_name = workspace_name)
    ws.write_config()

    print('Workspace config file written')

# Summarize the connection as a small table for the cell output.
output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
# NOTE(review): newer pandas releases deprecate -1 here in favour of None —
# confirm against the pandas version this notebook is pinned to before changing it.
pd.set_option('display.max_colwidth', -1)
pd.DataFrame(data=output, index=['']).T

In [None]:
# Disable every schedule currently listed for the workspace so earlier demo
# runs do not keep triggering.
schedules = Schedule.list(ws)
for existing_schedule in schedules:
    existing_schedule.disable(wait_for_provisioning=True)

# Data

In [None]:
# Preview available columns (first 10 rows)
df.head(10)

In [None]:
# Analyze distribution of ages in the dataset, using bin edges at every decade from 10 to 100
sns.distplot(df[['AGE']], bins=list(range(10, 101, 10)))

In [None]:
# Analyze distribution of spend in category #1 (the FNN spend column)
sns.distplot(df['CATEGORY_FNN_SPEND'])

In [None]:
# Analyze how age influences whether customers have responded to category #1 campaigns
# One facet (column) per distinct BOUGHT_CATEGORY_FNN value, each showing the AGE distribution.
g = sns.FacetGrid(df, col='BOUGHT_CATEGORY_FNN')
g.map(sns.distplot, 'AGE')

In [None]:
# Analyze how gender influences whether customers have responded to category #1 campaigns
# One facet (column) per distinct BOUGHT_CATEGORY_FNN value, each showing GENDER counts.
g = sns.FacetGrid(df, col='BOUGHT_CATEGORY_FNN')
g.map(sns.countplot, 'GENDER')

In [None]:
# Analyze how age and category #1 & #2 spend influences responding to category #1 campaigns
# Pairwise plots of AGE, FNN spend and WLN spend, coloured by the BOUGHT_CATEGORY_FNN label.
sns.pairplot(df[['AGE', 'CATEGORY_FNN_SPEND', 'CATEGORY_WLN_SPEND', 'BOUGHT_CATEGORY_FNN']], hue='BOUGHT_CATEGORY_FNN')

# Setup our AML environment

In [None]:
# Provision a compute target: reuse the named cluster when it already exists,
# otherwise create it and wait until provisioning completes.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        min_nodes=1,
        max_nodes=12,
    )

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target')

# Show the cluster status as the cell output.
compute_target.status.serialize()

In [None]:
# Upload our data
# Push the local CSV to the workspace's default datastore under 'boots',
# overwriting any copy already there.
ds = ws.get_default_datastore()
ds.upload_files(['./data-latest.csv'], target_path = 'boots', overwrite=True)

# Experiment

In [None]:
# Experiment handle in the workspace, identified by experiment_name.
experiment = Experiment(ws, experiment_name)

In [None]:
# Data reference for the 'boots' datastore folder in download mode.
# NOTE(review): get_data.py reads /tmp/azureml_runs/boots/data-latest.csv, so the
# folder presumably lands beneath path_on_compute — confirm.
dr = DataReferenceConfiguration(
    datastore_name=ds.name,
    path_on_compute='/tmp/azureml_runs',
    path_on_datastore='boots',
    mode='download',
    overwrite=False,
)

# Create the RunConfiguration object, responsible for the configuration of the execution environment
run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.data_references = {ds.name: dr}
run_config.environment.docker.enabled = True

# AutoML classification settings; the training data is supplied by get_data.py.
automl_config = AutoMLConfig(
    task='classification',
    iterations=25,
    iteration_timeout_minutes=5,
    max_cores_per_iteration=4,
    max_concurrent_iterations=12,
    primary_metric='accuracy',
    data_script='{}/get_data.py'.format(project_folder),
    run_configuration=run_config,
    path=project_folder,
    model_explainability=True,
    n_cross_validations=2,
    preprocess=True,
)

In [None]:
# Submit the AutoML configuration without streaming output, then show the run object as the cell output.
remote_run = experiment.submit(automl_config, show_output=False)
remote_run

In [None]:
# Wait for the submitted run to complete, showing its output in the notebook.
remote_run.wait_for_completion(show_output=True)

# Review

In [None]:
# Easily explore results using interactive widgets (RunDetails widget for the AutoML run)
RunDetails(remote_run).show()

In [None]:
# Programmatically find the best model based on different metrics
# get_output returns a pair: the best child run and its fitted model for the chosen metric.
lookup_metric = 'accuracy'
best_run, fitted_model = remote_run.get_output(metric = lookup_metric)
print(best_run)
print(fitted_model)

In [None]:
# Get best run explanation data (six values, unpacked in the order returned)
(shape_values, expected_values, overall_summary,
 overall_imp, per_class_summary, per_class_imp) = retrieve_model_explanation(best_run)

In [None]:
# Build a one-row frame: values from per_class_summary[2], column labels from per_class_imp[2].
# NOTE(review): index 2 presumably selects one class of the classifier — confirm which.
feat = pd.DataFrame([per_class_summary[2]], columns = per_class_imp[2], index = ['Importance'])
with sns.plotting_context('notebook', font_scale=1.4):
    plt.subplots(figsize=(13,9))
    # Plot at most the first 10 columns as horizontal bars.
    sns.barplot(data=feat.iloc[:10,:10], orient='h').set_title('Key factors for purchase')

In [None]:
# Store the preferred model for your team to use
# Registers outputs/model.pkl from the best run as 'category_fnn_model.pkl' with descriptive tags,
# then prints the registered name and version.
model = best_run.register_model(model_name = 'category_fnn_model.pkl',
                                model_path = 'outputs/model.pkl',
                                tags = {'area': 'CATEGORY FNN', 'type': 'classification'})
print(model.name, model.version)

In [None]:
# Captures training code, dataset, and run when stored
# Show the registered model's serialized metadata as the cell output.
model.serialize()

# Pipelines

In [None]:
# Key open source data analysis packages
from IPython.display import display, HTML

import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

%matplotlib inline
# color_codes is an on/off flag: pass a real boolean instead of the (merely truthy) string 'True'.
sns.set(color_codes=True)

In [None]:
# Working directories
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(project_folder, exist_ok=True)

# Data functions
def get_cat(index, lower=False):
    """Return the category identifier for a zero-based category index.

    Index 0 maps to 'FNN' and index 1 to 'WLN', lower-cased when ``lower`` is
    true. Every other index yields the integer ``index + 1``; ``lower`` has no
    effect on that numeric result.
    """
    named_categories = {0: 'FNN', 1: 'WLN'}
    label = named_categories.get(index)
    if label is None:
        # Categories without a code are identified by their one-based number.
        return index + 1
    return label.lower() if lower else label

def prep_data_file(index):
    """Write a per-category copy of get_data.py under the project folder.

    Reads ``<project_folder>/get_data.py``, replaces every occurrence of
    'BOUGHT_CATEGORY_FNN' with the label column name for the category at
    ``index`` (as given by ``get_cat``), and writes the result to
    ``<project_folder>/c_<category>/get_data.py``, creating that folder if
    it does not exist.
    """
    with open('{}/get_data.py'.format(project_folder)) as f:
        content = f.read()

    # Point the template at this category's label column.
    cat = get_cat(index)
    content = content.replace('BOUGHT_CATEGORY_FNN', 'BOUGHT_CATEGORY_{}'.format(cat))

    # Folder names use the lower-cased category identifier.
    cat = get_cat(index, lower=True)
    cat_folder = '{}/c_{}'.format(project_folder, cat)
    # exist_ok=True replaces the exists-then-create check, so re-running is race-free.
    os.makedirs(cat_folder, exist_ok=True)

    with open('{}/get_data.py'.format(cat_folder), 'w') as f:
        f.write(content)

# Generate experiment scripts
# One get_data.py variant per category index 0 .. PRODUCT_CATEGORIES - 1.
for i in range(PRODUCT_CATEGORIES):
    prep_data_file(i)

In [None]:
# Read the data in to analyze
df = pd.read_csv('./data-latest.csv')

# Re-order columns for demo: general properties first, then columns named
# CATEGORY_<digits>..., and the FNN purchase label last.
props = [c for c in df.columns
         if not c.startswith('BOUGHT') and re.match(r'CATEGORY_\d+', c) is None]
value = [c for c in df.columns if re.match(r'CATEGORY_\d+', c) is not None]
df = df[props + value + ['BOUGHT_CATEGORY_FNN']]

In [None]:
# Disable every schedule currently listed for the workspace so earlier demo
# runs do not keep triggering.
schedules = Schedule.list(ws)
for existing_schedule in schedules:
    existing_schedule.disable(wait_for_provisioning=True)

In [None]:
# Experiment handle in the workspace, identified by experiment_name.
experiment = Experiment(ws, experiment_name)

## Setup Pipeline

<img src="https://wbademobuild0187399067.blob.core.windows.net/images/Pipelines.gif">

In [None]:
# Re-use our experiment configuration
# The 'boots' datastore folder is exposed as a named pipeline input in download mode;
# it is handed to each AutoML step through its inputs list.
input_data = DataReference(datastore=ds, 
                           data_reference_name='training_data',
                           path_on_datastore='boots',
                           mode='download',
                           path_on_compute='/tmp/azureml_runs',
                           overwrite=True)

# Fresh run configuration for the pipeline steps; no data_references are attached here.
run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.environment.docker.enabled = True

In [None]:
# Build a pipeline
# steps collects every step in creation order; current holds the previous
# category's register step so the next category can be chained after it.
steps = []
current = None

# Build a model for every category
for i in range(PRODUCT_CATEGORIES):
    # Lower-cased category identifier, used in all step and output names below.
    cat = get_cat(i, lower=True)
    
    # These are the two outputs from AutoML
    metrics_data = PipelineData(name='metrics_data_category_{}'.format(cat),
                                datastore=ds,
                                pipeline_output_name='metrics_output_category_{}'.format(cat),
                                training_output=TrainingOutput(type='Metrics'))

    model_data = PipelineData(name='model_data_category_{}'.format(cat),
                              datastore=ds,
                              pipeline_output_name='best_model_output_category_{}'.format(cat),
                              training_output=TrainingOutput(type='Model'))

    # AutoML config (note different data files for each model so it's not shared)
    automl_config = AutoMLConfig(task = 'classification',
                                 iterations = 25,
                                 iteration_timeout_minutes = 5, 
                                 max_cores_per_iteration = 2,
                                 max_concurrent_iterations = 8,
                                 primary_metric = 'accuracy',
                                 data_script = '{}/c_{}/get_data.py'.format(project_folder, cat),
                                 run_configuration = run_config,
                                 compute_target = compute_target,
                                 path = project_folder,
                                 n_cross_validations = 2,
                                 preprocess = True)
    
    # AutoML action
    automl_step = AutoMLStep(name='automl_module_category_{}'.format(cat),
                             automl_config=automl_config,
                             inputs=[input_data],
                             outputs=[metrics_data, model_data],
                             allow_reuse=False)
    
    # Custom script action to register the model afterwards
    # model_data is passed both as the --model_path argument and as a step input,
    # which ties this step to the AutoML step that produces it.
    register_step = PythonScriptStep(name='register_category_{}'.format(cat),
                                     script_name='register.py',
                                     compute_target=compute_target,
                                     source_directory=project_folder,
                                     arguments=['--model_name', 'category_{}_model.pkl'.format(cat), '--model_path', model_data],
                                     inputs=[model_data],
                                     allow_reuse=False)
    
    # And chain them together so they run sequentially
    # (this category's AutoML step is ordered after the previous category's register step;
    # the first category has no predecessor)
    if current:
        automl_step.run_after(current)

    current = register_step

    steps.append(automl_step)
    steps.append(register_step)

pipeline = Pipeline(description='Generate recommendation models',
                    workspace=ws,
                    steps=steps)

# The return value of validate() is not inspected here.
pipeline.validate()

# Once published, we can invoke on demand via the SDK or via a REST endpoint
published_pipeline = pipeline.publish(name='category-based-propensity-pipeline')

In [None]:
# Automatically run our pipeline when the data changes
# The schedule is tied to the published pipeline and watches the 'boots' path on the datastore.
# NOTE(review): polling_interval=1 is presumably in minutes — confirm against the Schedule.create documentation.
schedule = Schedule.create(workspace=ws,
                           name='category-based-propensity-schedule',
                           pipeline_id=published_pipeline.id, 
                           experiment_name='category-based-propensity-schedule',
                           datastore=ds,
                           path_on_datastore='boots',
                           wait_for_provisioning=True,
                           polling_interval=1,
                           description='Scheduled run of category-based-propensity')

In [None]:
# Or, run it on demand
# The published pipeline's own name is passed as the second argument to submit.
published_pipeline.submit(ws, published_pipeline.name)