## Step 1: Import packages and declare constants

In [None]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd
import io

# S3 bucket created previously -- replace this value with your own bucket name
bucket = "churn-prediction-sagemaker-demo"

# Fixed names and versions handed to the pipeline builder in Step 5
sklearn_processor_version = "0.23-1"
model_package_group_name = "ChurnModelPackageGroup"
pipeline_name = "ChurnModelSMPipeline"

# AWS context: region of the default boto3 session, the notebook's
# execution role, and a SageMaker session
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

# Image URI looked up for the scikit-learn framework at the version above;
# it is passed to get_pipeline as custom_image_uri in Step 5
clarify_image = sagemaker.image_uris.retrieve(
    framework="sklearn",
    version=sklearn_processor_version,
    region=region,
)

## Step 2: Generate baseline dataset

The baseline data is used by the SageMaker Clarify step to generate SHAP values.

In [None]:
def preprocess_data(df):
    """
    Preprocess the dataset

    The input frame is left unmodified; all work happens on copies.

    Steps:
      1. Parse "firstorder" and "lastorder" as datetimes (unparseable
         values become NaT).
      2. Drop every row that contains a null value.
      3. Add "first_last_days_diff" (days from first to last order) and
         "created_first_days_diff" (days from first order to creation).
      4. Drop "custid", "created", "firstorder" and "lastorder".
      5. One-hot encode "favday" and "city" as integer columns.

    Args:
        df: DataFrame containing at least the columns "custid", "created",
            "firstorder", "lastorder", "favday" and "city".

    Returns:
        A new DataFrame with the engineered and one-hot encoded columns.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if a remaining "created" value cannot be parsed as a
            datetime (it is parsed without errors="coerce").
    """
    # Copy first so the datetime conversion below does not overwrite the
    # caller's columns (the same raw frame is preprocessed more than once)
    df = df.copy()
    # Convert to datetime columns
    df["firstorder"] = pd.to_datetime(df["firstorder"], errors="coerce")
    df["lastorder"] = pd.to_datetime(df["lastorder"], errors="coerce")
    # Drop rows with null values; the explicit copy makes the result an
    # independent frame so the column assignments below are unambiguous
    df = df.dropna().copy()
    # Create column which gives the days between the first and last orders
    df["first_last_days_diff"] = (df["lastorder"] - df["firstorder"]).dt.days
    # Create column which gives the days between creation and first order
    df["created"] = pd.to_datetime(df["created"])
    df["created_first_days_diff"] = (df["created"] - df["firstorder"]).dt.days
    # Drop unused columns
    unused_columns = ["custid", "created", "firstorder", "lastorder"]
    df = df.drop(columns=unused_columns)
    # Apply one hot encoding on categorical columns
    cat_columns = ["favday", "city"]
    return pd.get_dummies(df, prefix=cat_columns, columns=cat_columns, dtype=int)


# Key of the raw dataset inside the bucket
file_key = "data/storedata_total.csv"
# S3 client used for the download
s3_client = boto3.client("s3")
# Fetch the object, then parse its bytes as CSV into a DataFrame
obj = s3_client.get_object(Bucket=bucket, Key=file_key)
df = pd.read_csv(io.BytesIO(obj["Body"].read()))

# Preprocess and remove the "retained" column from the baseline features
baseline_data = preprocess_data(df)
baseline_data = baseline_data.drop(columns=["retained"])
# Random sample of 0.02% of the rows
baseline_sample = baseline_data.sample(frac=0.0002)

# Save locally without header row or index column
baseline_sample.to_csv("data/baseline.csv", header=False, index=False)

## Step 3: Generate batch dataset

In [None]:
# Preprocess the raw frame again and remove the "retained" column
batch_data = preprocess_data(df)
batch_data = batch_data.drop(columns=["retained"])
# Random sample of 20% of the rows, saved without header row or index column
batch_sample = batch_data.sample(frac=0.2)
batch_sample.to_csv("data/batch.csv", header=False, index=False)

## Step 4: Copy data to S3 bucket

In [None]:
# NOTE: despite its name, s3_client is rebound here to the boto3 S3 resource
s3_client = boto3.resource("s3")
# Upload both local CSV files through a single Bucket handle
target_bucket = s3_client.Bucket(bucket)
target_bucket.upload_file("data/batch.csv", "data/batch/batch.csv")
target_bucket.upload_file("data/baseline.csv", "input/baseline/baseline.csv")

## Step 5: Get the pipeline instance

In [None]:
# get_pipeline comes from the project's local pipeline module
from pipeline import get_pipeline

# Build the pipeline object from the constants declared in Step 1
pipeline = get_pipeline(
    region=region,
    role=role,
    default_bucket=bucket,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
    custom_image_uri=clarify_image,
    sklearn_processor_version=sklearn_processor_version,
)

In [None]:
# Show the pipeline definition; as the cell's last expression its value is displayed
pipeline.definition()

## Step 6: Submit the pipeline to SageMaker and start execution

In [None]:
# Submit the pipeline to SageMaker under the execution role from Step 1
# (upsert presumably creates the pipeline or updates an existing one -- confirm)
pipeline.upsert(role_arn=role)

In [None]:
# Start pipeline execution and keep the returned handle; the following
# cells call describe() and list_steps() on it
execution = pipeline.start()

In [None]:
# Describe execution instance; the result is displayed as the cell output
execution.describe()

In [None]:
# List the execution steps to check out the status and artifacts
# (re-run this cell to refresh the listing while the execution is in progress)
execution.list_steps()