Step 1: Import Packages and Declare Constants

In [1]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd
from pprint import pprint

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
#Replace this value with the S3 Bucket Created
default_bucket = "customer-churn-sm-pipeline-djw"

In [3]:
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
sklearn_processor_version="0.23-1"
model_package_group_name="ChurnModelPackageGroup"
pipeline_name= "ChurnModelSMPipeline"
clarify_image = sagemaker.image_uris.retrieve(framework='sklearn',version=sklearn_processor_version,region=region)



Step 2: Generate Baseline Dataset
Baseline Data will be used as part of SageMaker Clarify Step to generate SHAP Values

In [4]:
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    ## Convert to datetime columns
    df["firstorder"] = pd.to_datetime(df["firstorder"], errors='coerce')
    df["lastorder"] = pd.to_datetime(df["lastorder"], errors='coerce')
    ## Drop Rows with null values
    df = df.dropna()
    ## Create Column which gives the days between the last order and the first order
    df["first_last_days_diff"] = (df['lastorder']-df['firstorder']).dt.days
    ## Create Column which gives the days between when the customer record was created and the first order
    df['created'] = pd.to_datetime(df['created'])
    df['created_first_days_diff'] = (df['created'] - df['firstorder']).dt.days
    ## Drop Columns
    df.drop(['custid', 'created', 'firstorder', 'lastorder'], axis=1, inplace=True)
    ## Apply one hot encoding on favday and city columns
    df = pd.get_dummies(df,prefix=['favday', 'city'], columns=['favday', 'city'], dtype=int) # dtype=int sets output to 0/1 instead of bool
    return df

In [5]:
baseline_data = preprocess_data("data/storedata_total.csv")
baseline_data.pop("retained")
baseline_sample = baseline_data.sample(frac=0.0002)

pd.DataFrame(baseline_sample).to_csv("data/baseline.csv", header=False, index=False)
baseline_sample

Unnamed: 0,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,first_last_days_diff,created_first_days_diff,...,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BLR,city_BOM,city_DEL,city_MAA
25216,49,2.040816,0.0,48.0,0.0,0,0,0,0,-7,...,0,1,0,0,0,0,1,0,0,0
11270,0,0.0,0.0,50.21,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
30276,22,50.0,13.636364,29.0,0.0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
19077,3,33.333333,0.0,40.02,0.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
12824,35,62.857143,8.571429,60.09,0.018519,1,0,0,108,-42,...,0,0,0,0,1,0,0,0,0,1
21415,22,31.818182,9.090909,40.2,0.0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


Step 3: Generate Batch Dataset

In [6]:
batch_data = preprocess_data("data/storedata_total.csv")
batch_data.pop("retained")
batch_sample = batch_data.sample(frac=0.2)

pd.DataFrame(batch_sample).to_csv("data/batch.csv", header=False, index=False)


Step 4: Copy Data and Scripts to S3 Bucket

In [7]:
s3_client = boto3.resource('s3')
s3_client.Bucket(default_bucket).upload_file("data/storedata_total.csv", "data/storedata_total.csv")
s3_client.Bucket(default_bucket).upload_file("data/batch.csv", "data/batch/batch.csv")
s3_client.Bucket(default_bucket).upload_file("data/baseline.csv", "input/baseline/baseline.csv")
s3_client.Bucket(default_bucket).upload_file("pipelines/preprocess.py", "input/code/preprocess.py")
s3_client.Bucket(default_bucket).upload_file("pipelines/evaluate.py", "input/code/evaluate.py")
s3_client.Bucket(default_bucket).upload_file("pipelines/generate_config.py", "input/code/generate_config.py")

Step 5: Get the Pipeline Instance

In [8]:
from pipelines.pipeline import get_pipeline

pipeline = get_pipeline(
    region = region,
    role=role,
    default_bucket=default_bucket,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
    custom_image_uri=clarify_image,
    sklearn_processor_version=sklearn_processor_version
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [9]:
pprint(pipeline.definition())



('{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": '
 '"ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, '
 '{"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, '
 '{"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": '
 '"ml.m5.large"}, {"Name": "InputData", "Type": "String", "DefaultValue": '
 '"s3://customer-churn-sm-pipeline-djw/data/storedata_total.csv"}, {"Name": '
 '"BatchData", "Type": "String", "DefaultValue": '
 '"s3://customer-churn-sm-pipeline-djw/data/batch/batch.csv"}], '
 '"PipelineExperimentConfig": {"ExperimentName": {"Get": '
 '"Execution.PipelineName"}, "TrialName": {"Get": '
 '"Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", '
 '"Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": '
 '{"InstanceType": {"Get": "Parameters.ProcessingInstanceType"}, '
 '"InstanceCount": {"Get": "Parameters.ProcessingInstanceCount"}, '
 '"VolumeSizeInGB": 30

Step 5: Submit the pipeline to SageMaker and start execution

In [10]:
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-2:412381776958:pipeline/ChurnModelSMPipeline',
 'ResponseMetadata': {'RequestId': '84b85e95-8242-45d4-8872-9110f0c1879a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '84b85e95-8242-45d4-8872-9110f0c1879a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Sun, 01 Dec 2024 02:52:31 GMT'},
  'RetryAttempts': 0}}

In [11]:
# Start Pipeline Execution
execution = pipeline.start()

In [12]:
# Now we describe execution instance and list the steps in the execution to find out more about the execution.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:412381776958:pipeline/ChurnModelSMPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-2:412381776958:pipeline/ChurnModelSMPipeline/execution/igb8cke9cg3n',
 'PipelineExecutionDisplayName': 'execution-1733021552520',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 12, 1, 2, 52, 32, 426000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 12, 1, 2, 52, 32, 426000, tzinfo=tzlocal()),
 'CreatedBy': {'IamIdentity': {'Arn': 'arn:aws:sts::412381776958:assumed-role/AmazonSageMaker-ExecutionRole-20241125T140701/SageMaker',
   'PrincipalId': 'AROAWAA66RQ7LYP5UQQYQ:SageMaker'}},
 'LastModifiedBy': {'IamIdentity': {'Arn': 'arn:aws:sts::412381776958:assumed-role/AmazonSageMaker-ExecutionRole-20241125T140701/SageMaker',
   'PrincipalId': 'AROAWAA66RQ7LYP5UQQYQ:SageMaker'}},
 'ResponseMetadata': {'RequestId': 'e8fbd693-5482-47a2-a250-b3de7c776038',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz

In [14]:
execution.list_steps()

[{'StepName': 'ClarifyProcessingStep',
  'StartTime': datetime.datetime(2024, 12, 1, 3, 3, 12, 970000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 12, 1, 3, 20, 48, 70000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-2:412381776958:processing-job/pipelines-igb8cke9cg3n-ClarifyProcessingSte-KqbLqLfxBq'}},
  'AttemptCount': 1},
 {'StepName': 'ChurnModelConfigFile',
  'StartTime': datetime.datetime(2024, 12, 1, 3, 0, 39, 465000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 12, 1, 3, 3, 12, 330000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-2:412381776958:processing-job/pipelines-igb8cke9cg3n-ChurnModelConfigFile-ofrdM8Nzmf'}},
  'AttemptCount': 1},
 {'StepName': 'RegisterChurnModel-RegisterModel',
  'StartTime': datetime.datetime(2024, 12, 1, 3, 0, 37, 490000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 12, 1, 3, 0