## Step 1: Data preprocessing

Run this code to preprocess the dataset.

In [None]:
import pandas as pd
import boto3
import io

def preprocess_data(df):
    """
    Preprocess the raw store dataset into a model-ready feature table.

    The input frame is never modified: every step builds a new DataFrame,
    so the cell that calls this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the columns "custid", "created",
        "firstorder", "lastorder", "favday" and "city".

    Returns
    -------
    pandas.DataFrame
        Rows without any null value, with the id/date columns replaced by
        the day-difference features "first_last_days_diff" and
        "created_first_days_diff", and "favday"/"city" one-hot encoded.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Convert to datetime columns. Values that cannot be parsed become NaT
    # and are removed together with the other nulls below. assign() returns
    # a new frame, which keeps the caller's DataFrame untouched.
    parsed = df.assign(
        firstorder=pd.to_datetime(df["firstorder"], errors="coerce"),
        lastorder=pd.to_datetime(df["lastorder"], errors="coerce"),
    )
    # Drop rows with null values
    complete = parsed.dropna()
    # "created" is parsed without coercion, so an invalid value raises here.
    created = pd.to_datetime(complete["created"])
    features = complete.assign(
        # Days between the first and last orders
        first_last_days_diff=(complete["lastorder"] - complete["firstorder"]).dt.days,
        created=created,
        # Days between creation and first order (created minus firstorder)
        created_first_days_diff=(created - complete["firstorder"]).dt.days,
    )
    # Drop unused columns
    unused_columns = ["custid", "created", "firstorder", "lastorder"]
    features = features.drop(columns=unused_columns)
    # Apply one hot encoding on categorical columns
    cat_columns = ["favday", "city"]
    return pd.get_dummies(features, prefix=cat_columns, columns=cat_columns, dtype=int)


# Define the S3 bucket and file key
# `bucket` is reused by later cells for the split uploads and the training output paths.
bucket = "churn-prediction-sagemaker-demo"
file_key = "data/storedata_total.csv"
# Create an S3 client
s3_client = boto3.client("s3")
# Get the object from S3
obj = s3_client.get_object(Bucket=bucket, Key=file_key)
# Read the object content and load it into a pandas DataFrame
# (the whole object body is read into memory before it is parsed)
df = pd.read_csv(io.BytesIO(obj["Body"].read()))
# Display the first few rows of the DataFrame
print(df.head())
print(df.shape)

# Preprocess the dataset
storedata = preprocess_data(df)
# Bare last expression: the notebook renders the preprocessed frame as the cell output.
storedata

## Step 2: Data loading

Run this code to split the data and upload the splits to S3.

In [None]:
import numpy as np
from io import StringIO

def split_dataset(df, seed=None):
    """
    Shuffle the rows and split them into train / validation / test arrays.

    The label column "retained" is moved to column 0 of every returned
    array; the remaining columns follow in their original order. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that contains the label column "retained".
    seed : int, optional
        Seed for a dedicated random generator so the split is reproducible.
        When omitted, NumPy's global random state is used.

    Returns
    -------
    tuple
        (feature_names, train, validation, test), where feature_names lists
        the non-label columns and the arrays hold the first 70%, the next
        15% and the remaining rows of the shuffled data.

    Raises
    ------
    KeyError
        If the "retained" column is missing.
    """
    labels = df["retained"].to_numpy().reshape(-1, 1)
    # drop() returns a new frame, so the caller keeps its "retained" column.
    features = df.drop(columns="retained")
    feature_names = list(features.columns)
    # Label first, features after it
    X = np.concatenate((labels, features.to_numpy()), axis=1)
    if seed is None:
        np.random.shuffle(X)
    else:
        np.random.default_rng(seed).shuffle(X)
    n_rows = len(X)
    train, validation, test = np.split(X, [int(.7 * n_rows), int(.85 * n_rows)])
    return feature_names, train, validation, test


# Work on a copy so `storedata` stays intact for later inspection
df = storedata.copy()
# Split dataset
feature_names, train, validation, test = split_dataset(df)


def upload_split_to_s3(split, key, s3_resource, bucket_name=bucket):
    """
    Serialize one split as CSV (no header, no index) and store it in S3.

    Parameters
    ----------
    split : array-like
        Two-dimensional data of the split.
    key : str
        Object key inside the bucket.
    s3_resource : boto3 S3 resource
        Resource used for the upload.
    bucket_name : str
        Target bucket; defaults to the notebook-level `bucket`.
    """
    csv_buffer = StringIO()
    pd.DataFrame(split).to_csv(csv_buffer, header=False, index=False)
    s3_resource.Object(bucket_name, key).put(Body=csv_buffer.getvalue())


# Save datasets in Amazon S3 (one resource shared by all three uploads)
s3_resource = boto3.resource("s3")
for split, key in (
    (train, "data/train/train.csv"),
    (validation, "data/validation/validation.csv"),
    (test, "data/test/test.csv"),
):
    upload_split_to_s3(split, key, s3_resource)

## Step 3: Hyperparameter optimization

Run this code to train, tune, and find the best candidate model.

In [None]:
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

# Session, IAM role and region shared by the estimator and tuner defined below.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name


# Training and Validation Input for SageMaker Training job
# Both point at S3 prefixes under `bucket` and declare the content as CSV.
s3_input_train = TrainingInput(
    s3_data=f"s3://{bucket}/data/train/", content_type="csv")
s3_input_validation = TrainingInput(
    s3_data=f"s3://{bucket}/data/validation/", content_type="csv")

# Hyperparameters that stay fixed for every training job (not tuned)
# NOTE(review): in XGBoost, "rate_drop" belongs to the dart booster and
# "tweedie_variance_power" to the reg:tweedie objective; with
# objective "binary:logistic" they presumably have no effect — confirm.
fixed_hyperparameters = {
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "num_round": "100",
    "rate_drop": "0.3",
    "tweedie_variance_power": "1.4"
}

# Use the built-in SageMaker XGBoost algorithm

# Reuse the session created above instead of constructing a second one.
# The name `sess` is kept as an alias because the Step 4 cell passes it
# to its own estimator.
sess = sagemaker_session
container = sagemaker.image_uris.retrieve("xgboost", region, "0.90-2")

# Base estimator for the tuning job: one ml.m4.xlarge instance, started with
# the fixed hyperparameters, writing model artifacts under s3://<bucket>/output.
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    hyperparameters=fixed_hyperparameters,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/output",
    sagemaker_session=sagemaker_session
)

# Search space explored by the tuning job; every other hyperparameter keeps
# the value set on the estimator.
hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}
# Metric name handed to the tuner as its objective.
objective_metric_name = "validation:auc"
# At most 10 training jobs in total, 2 of them running at the same time.
tuner = HyperparameterTuner(
    estimator, objective_metric_name,
    hyperparameter_ranges, max_jobs=10, max_parallel_jobs=2)

# Tune
# Launches the tuning job on the train/validation channels defined earlier.
tuner.fit({
    "train": s3_input_train,
    "validation": s3_input_validation
    }, include_cls_metadata=False)

# Explore the best model generated
# Fetch the description of the tuning job that was just launched;
# `tuning_job_result` is read again when the best model is refitted.
tuning_job_result = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("%d training jobs have completed" %job_count)
# Expected output once every job has finished (max_jobs=10):
# 10 training jobs have completed

# Get the best training job

from pprint import pprint
# "BestTrainingJob" may be absent or empty, hence the .get() guard.
if tuning_job_result.get("BestTrainingJob", None):
    print("Best Model found so far:")
    pprint(tuning_job_result["BestTrainingJob"])
else:
    print("No training jobs have reported results yet.")

## Step 4: Fit best model with SageMaker Debugger

This step retrains the model with the best hyperparameters found in Step 3 and attaches a SageMaker Debugger hook that saves metrics, feature importance, and SHAP values for feature explainability.

In [None]:
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig
from sagemaker.debugger import rule_configs, Rule

# The tuning job description only carries "BestTrainingJob" once a job has
# reported results; fail with a clear message instead of a bare KeyError.
best_training_job = tuning_job_result.get("BestTrainingJob")
if not best_training_job:
    raise RuntimeError(
        "The tuning job has not reported a best training job yet; "
        "re-run the tuning result cell once training jobs have completed."
    )
best_hyperparameters = best_training_job["TunedHyperParameters"]
# Tuned values override the fixed ones when both define the same key.
hyperparameters = {**fixed_hyperparameters, **best_hyperparameters}
# Number of steps between two Debugger saves; shared by every collection below.
save_interval = 5
base_job_name = "demo-smdebug-xgboost-churn-classification"

container = sagemaker.image_uris.retrieve("xgboost", region, "0.90-2")

# Collections the Debugger hook saves during training.
debugger_collections = ["metrics", "feature_importance", "full_shap", "average_shap"]

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    base_job_name=base_job_name,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/output",
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparameters,
    max_run=1800,
    debugger_hook_config=DebuggerHookConfig(
        s3_output_path=f"s3://{bucket}/debugger/",
        collection_configs=[
            CollectionConfig(
                name=collection_name,
                parameters={"save_interval": str(save_interval)},
            )
            for collection_name in debugger_collections
        ],
    ),
    rules=[
        # Built-in rule applied to the "metrics" collection.
        Rule.sagemaker(
            rule_configs.loss_not_decreasing(),
            rule_parameters={
                "collection_names": "metrics",
                "num_steps": "10",
            },
        ),
    ],
)

estimator.fit({"train": s3_input_train, "validation": s3_input_validation})

## Step 5: Analyze Debugger output

Here we analyze debugger output with some visualizations.

First, install [sagemaker-debugger](https://github.com/awslabs/sagemaker-debugger) (smdebug).

In [None]:
!pip install smdebug

Then we have to collect the debugger output.

In [None]:
from smdebug.trials import create_trial

# Location of the Debugger artifacts written by the latest job of `estimator`.
s3_output_path = estimator.latest_job_debugger_artifacts_path()
# `trial` is the handle every plotting cell below reads tensors from.
trial = create_trial(s3_output_path)

### AUC for training and validation sets during training

In [None]:
import re
from itertools import islice
import matplotlib.pyplot as plt

def get_data(trial, tname):
    """
    Collect every saved value of one tensor.

    Looks up the tensor called `tname` in `trial`, asks it for the steps
    it has data for, and reads the value at each of those steps.

    Returns a pair (steps, values) with values in the same order as steps.
    """
    series = trial.tensor(tname)
    recorded_steps = series.steps()
    return recorded_steps, [series.value(step) for step in recorded_steps]


def match_tensor_name_with_feature_name(tensor_name, feature_names=feature_names):
    """
    Map a tensor name to the feature name it refers to.

    The last "/"-separated component of `tensor_name` is compared with
    "f0", "f1", ... and, on a match, the feature name at that position of
    `feature_names` is returned. Names without such a match (for example
    metric tensors) are returned unchanged.

    Note that the default for `feature_names` is bound when this cell is
    executed, so the cell must be re-run after `feature_names` changes.
    """
    feature_tag = tensor_name.split("/")[-1]
    for ifeat, feature_name in enumerate(feature_names):
        if feature_tag == f"f{ifeat}":
            return feature_name
    return tensor_name


def plot_collection(trial, collection_name, regex=".*", max_tensors = 100, figsize=(8, 6)):
    """
    Plot the tensors of one collection of `trial`.

    Only tensors whose name matches `regex` are considered, and at most
    `max_tensors` of them are drawn, one line per tensor, against the
    steps they were saved at. The matched names are printed first.
    """
    _, ax = plt.subplots(figsize=figsize)
    available = trial.collection(collection_name).tensor_names
    matched_tensors = list(filter(lambda name: re.match(regex, name), available))
    print(matched_tensors)

    for name in islice(matched_tensors, max_tensors):
        steps, data = get_data(trial, name)
        # Feature tensors are labelled with the feature name, others with their own name
        line_label = match_tensor_name_with_feature_name(name)
        ax.plot(steps, data, label=line_label)

    # Legend is placed outside the axes, to the right
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    ax.set_xlabel("Iteration")

# Plot every tensor of the "metrics" collection saved during training.
plot_collection(trial, "metrics")

In [None]:
### Feature importance

In [None]:
def plot_collection_all_features(trial, collection_name, regex=".*", max_tensors = 100, figsize=(8, 6)):
    """
    Variant of plot_collection() that searches all tensors of the trial.

    The candidate names come from trial.tensor_names() rather than from a
    single collection, so `collection_name` is accepted but not used;
    `regex` alone decides which tensors are plotted (at most `max_tensors`).
    The matched names are printed first.
    """
    _, ax = plt.subplots(figsize=figsize)
    # Difference to plot_collection(): every tensor name of the trial is a candidate
    available = trial.tensor_names()
    matched_tensors = list(filter(lambda name: re.match(regex, name), available))
    print(matched_tensors)

    for name in islice(matched_tensors, max_tensors):
        steps, data = get_data(trial, name)
        line_label = match_tensor_name_with_feature_name(name)
        ax.plot(steps, data, label=line_label)

    # Legend is placed outside the axes, to the right
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    ax.set_xlabel("Iteration")

def plot_feature_importance(trial, importance_type="weight"):
    """
    Plot the saved feature-importance tensors of the given type.

    `importance_type` must be one of "weight", "gain", "cover",
    "total_gain" or "total_cover"; any other value raises ValueError.
    """
    supported = ("weight", "gain", "cover", "total_gain", "total_cover")
    if importance_type in supported:
        # Tensor names are selected by their "feature_importance/<type>/" prefix
        pattern = f"feature_importance/{importance_type}/.*"
        plot_collection_all_features(trial, "feature_importance", regex=pattern)
        return
    raise ValueError(f"{importance_type} is not one of the supported importance types.")


# Plot the "cover" feature importance saved by the Debugger hook.
plot_feature_importance(trial, importance_type="cover")

### SHAP

In [None]:
!pip install shap

import shap

shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)
shap_no_base = shap_values[:, :-1]
train_shap = pd.DataFrame(train[:, 1:], columns=feature_names)
shap.summary_plot(shap_no_base, train_shap)

In [None]:
# Exploration: list every tensor name available in the trial.
trial.tensor_names()

In [None]:
# NOTE(review): MAX_PLOTS is not defined in any visible cell, so this looks like
# leftover debugging that raises NameError on a fresh kernel — confirm or remove.
print(MAX_PLOTS)

In [None]:
# NOTE(review): `t` is not defined in any visible cell, so this looks like
# leftover debugging that raises NameError on a fresh kernel — confirm or remove.
t

In [None]:
# Exploration: tensor names registered in the "feature_importance" collection.
trial.collection("feature_importance").tensor_names

In [None]:
# NOTE(review): interactive help lookup left over from development; consider removing it from the finished notebook.
?trial