## Data preprocessing

Run this code to preprocess the dataset.

In [None]:
import pandas as pd
import boto3
import io

def preprocess_data(df):
    """Clean the raw store dataset and derive model-ready features.

    Steps:
      1. Parse ``created``, ``firstorder`` and ``lastorder`` as datetimes;
         values that cannot be parsed become ``NaT``.
      2. Drop every row containing a null value (including unparseable dates).
      3. Add ``first_last_days_diff`` (``lastorder - firstorder`` in days) and
         ``created_first_days_diff`` (``created - firstorder`` in days).
      4. Drop ``custid`` and the three raw date columns.
      5. One-hot encode ``favday`` and ``city``.

    Args:
        df: Raw DataFrame. It must contain the columns ``custid``,
            ``created``, ``firstorder``, ``lastorder``, ``favday`` and
            ``city``. The caller's frame is not modified.

    Returns:
        A new DataFrame with the derived and encoded columns.
    """
    date_columns = ["created", "firstorder", "lastorder"]
    cat_columns = ["favday", "city"]

    # Work on a copy so the caller's frame keeps its original values.
    df = df.copy()

    # Parse all date columns *before* dropping nulls so that unparseable
    # dates in any of them are removed consistently.
    for column in date_columns:
        df[column] = pd.to_datetime(df[column], errors="coerce")

    # Drop rows with null values
    df = df.dropna()

    df = df.assign(
        # Days between the first and last orders
        first_last_days_diff=(df["lastorder"] - df["firstorder"]).dt.days,
        # Days between account creation and the first order
        created_first_days_diff=(df["created"] - df["firstorder"]).dt.days,
    )

    # Drop unused columns
    df = df.drop(columns=["custid"] + date_columns)

    # One-hot encode the categorical columns. The dtype is pinned to an
    # integer type because newer pandas versions default to bool, which would
    # make a mixed numeric matrix fall back to object dtype downstream.
    return pd.get_dummies(
        df, prefix=cat_columns, columns=cat_columns, dtype="uint8"
    )


# S3 location of the raw dataset
bucket = "churn-prediction-sagemaker-demo"
file_key = "data/storedata_total.csv"

# Download the raw CSV object through an S3 client
s3_client = boto3.client("s3")
obj = s3_client.get_object(Bucket=bucket, Key=file_key)

# Wrap the downloaded bytes in a file-like buffer and parse them as CSV
raw_csv = io.BytesIO(obj["Body"].read())
df = pd.read_csv(raw_csv)

# Quick sanity check: first rows and overall (rows, columns) shape
print(df.head())
print(df.shape)

# Run the preprocessing step; the bare expression on the last line renders
# the resulting frame as the notebook cell output
storedata = preprocess_data(df)
storedata

## Data loading

Run this code to split the data and upload the splits to S3.

In [None]:
import numpy as np
from io import StringIO

def split_dataset(df, seed=None):
    """Shuffle the rows of ``df`` and split them into train/validation/test.

    The ``retained`` column is used as the label and is placed in column 0 of
    every returned array, followed by the remaining columns in their original
    order. Rows are shuffled, then split 70% / 15% / 15%.

    Args:
        df: DataFrame with a ``retained`` column plus the feature columns.
            The caller's frame is not modified.
        seed: Optional seed for a reproducible shuffle. When ``None`` (the
            default) the global NumPy random state is used.

    Returns:
        A tuple ``(feature_names, train, validation, test)`` where
        ``feature_names`` lists the feature columns (label excluded) and the
        other three items are 2-D NumPy arrays.
    """
    # Select instead of ``pop`` so the caller's frame keeps its label column.
    features = df.drop(columns=["retained"])
    feature_names = list(features.columns)
    labels = df["retained"].to_numpy().reshape(len(df), 1)

    # Label first, then the features.
    rows = np.concatenate((labels, features.to_numpy()), axis=1)

    if seed is None:
        np.random.shuffle(rows)
    else:
        np.random.default_rng(seed).shuffle(rows)

    train_end = int(.7 * len(rows))
    validation_end = int(.85 * len(rows))
    train, validation, test = np.split(rows, [train_end, validation_end])
    return feature_names, train, validation, test


# Copy the dataset for easier debugging
df = storedata.copy()
# Split dataset
feature_names, train, validation, test = split_dataset(df)

# Save datasets in Amazon S3
s3_resource = boto3.resource("s3")
split_uploads = {
    "data/train/train.csv": train,
    "data/validation/validation.csv": validation,
    "data/test/test.csv": test,
}
for split_key, split_rows in split_uploads.items():
    csv_buffer = StringIO()
    # The SageMaker built-in XGBoost CSV format expects no header row and the
    # label in the first column, so neither the header nor the DataFrame
    # index may be written.
    pd.DataFrame(split_rows).to_csv(csv_buffer, header=False, index=False)
    s3_resource.Object(bucket, split_key).put(Body=csv_buffer.getvalue())

## Model training

Run this code to train, tune, and find the best candidate model.

In [None]:
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

# Session, execution role and region used by every SageMaker call below
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name


# Input channels for the training jobs: the CSV splits stored under the
# train/ and validation/ prefixes of the bucket
s3_input_train = TrainingInput(
    s3_data=f"s3://{bucket}/data/train/",
    content_type="csv",
)
s3_input_validation = TrainingInput(
    s3_data=f"s3://{bucket}/data/validation/",
    content_type="csv",
)

# Hyperparameters held constant across all tuning jobs
# NOTE(review): rate_drop and tweedie_variance_power look unrelated to a
# binary:logistic objective — confirm whether they are still needed.
fixed_hyperparameters = {
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "num_round": "100",
    "rate_drop": "0.3",
    "tweedie_variance_power": "1.4",
}

# Built-in SageMaker XGBoost container image for the current region
sess = sagemaker.Session()
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="0.90-2",
)

# Estimator describing a single training job
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    hyperparameters=fixed_hyperparameters,
    output_path="s3://{}/output".format(bucket),
    sagemaker_session=sagemaker_session,
)

# Search space explored by the tuner
hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}

# Tuner: up to 10 jobs in total, 2 at a time, scored on the validation AUC
objective_metric_name = "validation:auc"
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=10,
    max_parallel_jobs=2,
)

# Launch the tuning job on the train and validation channels
training_channels = {
    "train": s3_input_train,
    "validation": s3_input_validation,
}
tuner.fit(inputs=training_channels, include_cls_metadata=False)

## Explore the best model generated
# Look up the tuning job that was just launched by its name
sagemaker_client = boto3.client("sagemaker")
tuning_job_name = tuner.latest_tuning_job.job_name
tuning_job_result = sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

# Report how many of the training jobs have completed
status_counters = tuning_job_result["TrainingJobStatusCounters"]
job_count = status_counters["Completed"]
print("%d training jobs have completed" % job_count)
# Example output: 10 training jobs have completed

## Get the best training job

from pprint import pprint

# The "BestTrainingJob" entry is treated as absent when missing or empty
best_training_job = tuning_job_result.get("BestTrainingJob")
if not best_training_job:
    print("No training jobs have reported results yet.")
else:
    print("Best Model found so far:")
    pprint(best_training_job)