## Step 1: Import Packages and Declare Constants

In [1]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd
from sagemaker.image_uris import retrieve
from sagemaker.sklearn.processing import SKLearnProcessor  
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role

sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [2]:
# --- Static names used throughout the notebook ---
default_bucket = "amazon-sagemaker-871652478984-us-east-2-bg1613sftbwgix"
sklearn_processor_version = "0.23-1"
model_package_group_name = "ChurnModelPackageGroup"
pipeline_name = "ChurnModelSMPipeline"

# --- Values resolved from the current AWS / SageMaker environment ---
region = boto3.Session().region_name
role = get_execution_role()
sagemaker_session = sagemaker.Session()

# Container image URI for the scikit-learn framework at the version above.
# NOTE(review): the variable is called `clarify_image` but the lookup uses
# framework="sklearn" — confirm this is the image intended for that purpose.
clarify_image = retrieve(
    framework="sklearn",
    version=sklearn_processor_version,
    region=region,
)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


## Step 2: Generate Baseline Dataset

In [3]:
def preprocess_data(file_path):
    """Load the raw store CSV and turn it into a numeric feature table.

    Steps, in order:
      1. Read ``file_path`` with ``pd.read_csv``.
      2. Parse ``firstorder`` and ``lastorder`` as datetimes; values that
         cannot be parsed become ``NaT``.
      3. Drop every row containing a null (including the ``NaT`` values
         produced in step 2).
      4. Add ``first_last_days_diff`` (``lastorder - firstorder`` in days) and
         ``created_first_days_diff`` (``created - firstorder`` in days).
      5. Drop ``custid``, ``created``, ``firstorder`` and ``lastorder``.
      6. One-hot encode ``favday`` and ``city`` as integer 0/1 columns.

    Args:
        file_path: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        pandas.DataFrame: a new frame with the remaining original columns,
        the two day-difference columns, and ``favday_*`` / ``city_*``
        indicator columns.

    Raises:
        KeyError: if one of the columns named above is missing from the file.

    Note:
        ``created`` is parsed strictly (no ``errors='coerce'``), so an
        unparseable ``created`` value raises instead of dropping the row.
    """
    raw = pd.read_csv(file_path)

    # Coerce bad order dates to NaT first so that dropna() removes those rows
    # together with any other incomplete rows.
    orders = raw.assign(
        firstorder=pd.to_datetime(raw["firstorder"], errors="coerce"),
        lastorder=pd.to_datetime(raw["lastorder"], errors="coerce"),
    ).dropna()

    created = pd.to_datetime(orders["created"])

    # assign()/drop() return new frames, so nothing is mutated in place and no
    # SettingWithCopyWarning can be triggered by writing to the dropna() result.
    features = orders.assign(
        first_last_days_diff=(orders["lastorder"] - orders["firstorder"]).dt.days,
        created_first_days_diff=(created - orders["firstorder"]).dt.days,
    ).drop(columns=["custid", "created", "firstorder", "lastorder"])

    # dtype=int keeps the indicator columns numeric on every pandas version.
    # pandas >= 2.0 defaults to bool, which to_csv would write as True/False
    # into the header-less CSV files produced from this frame.
    return pd.get_dummies(
        features,
        prefix=["favday", "city"],
        columns=["favday", "city"],
        dtype=int,
    )

In [4]:
# Build the baseline file: preprocessed features without the "retained"
# label column, written without header or index.
baseline_data = preprocess_data("data/storedata_total.csv")
baseline_data.pop("retained")
# frac=0.0002 keeps 0.02% of the rows. The fixed seed makes a fresh
# "Restart & Run All" write the same baseline rows every time.
baseline_sample = baseline_data.sample(frac=0.0002, random_state=42)
# sample() already returns a DataFrame, so it is written directly.
baseline_sample.to_csv("data/baseline.csv", header=False, index=False)

## Step 3: Generate Batch Dataset

In [5]:
# Build the batch file: preprocessed features without the "retained"
# label column, written without header or index.
batch_data = preprocess_data("data/storedata_total.csv")
batch_data.pop("retained")
# frac=0.2 keeps 20% of the rows. The fixed seed makes a fresh
# "Restart & Run All" write the same batch rows every time.
batch_sample = batch_data.sample(frac=0.2, random_state=42)
# sample() already returns a DataFrame, so it is written directly.
batch_sample.to_csv("data/batch.csv", header=False, index=False)

## Step 4: Copy Data and Scripts to S3 Bucket

In [6]:
# Note: this is a boto3 S3 *resource* handle; the variable keeps its original name.
s3_client = boto3.resource('s3')
upload_bucket = s3_client.Bucket(default_bucket)

# (local file, destination object key) pairs, uploaded in this order.
upload_plan = [
    ("data/storedata_total.csv", "data/storedata_total.csv"),
    ("data/batch.csv", "data/batch/batch.csv"),
    ("data/baseline.csv", "input/baseline/baseline.csv"),
    ("pipelines/customerchurn/preprocess.py", "input/code/preprocess.py"),
    ("pipelines/customerchurn/evaluate.py", "input/code/evaluate.py"),
    ("pipelines/customerchurn/generate_config.py", "input/code/generate_config.py"),
]
for local_path, object_key in upload_plan:
    upload_bucket.upload_file(local_path, object_key)

## Step 5: Get the Pipeline Instance

In [7]:
# Configuration used by the pipeline definition cells that follow.
bucket_name = "amazon-sagemaker-871652478984-us-east-2-bg1613sftbwgix"
default_bucket = bucket_name

# Resolve region and execution role from the running environment instead of
# pinning an account-specific ARN and region in the notebook; these are the
# same lookups the constants cell at the top of the notebook uses.
region = boto3.Session().region_name
role = get_execution_role()

sklearn_processor_version = "0.23-1"
# NOTE: this replaces the "ChurnModelSMPipeline" value assigned in the
# constants cell; the pipeline is created under the name below.
pipeline_name = "ChurnModelPipeline"
model_package_group_name = "ChurnModelPackageGroup"

In [8]:
s3 = boto3.client("s3")

# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page; otherwise the listing is silently truncated for larger buckets.
object_keys = []
for response in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket_name):
    # "Contents" is absent from a page that holds no objects.
    object_keys.extend(obj["Key"] for obj in response.get("Contents", []))

if object_keys:
    print(f"Contents of bucket '{bucket_name}':\n")
    for key in object_keys:
        print("•", key)
else:
    print(f"Bucket '{bucket_name}' is empty.")

Contents of bucket 'amazon-sagemaker-871652478984-us-east-2-bg1613sftbwgix':

• Customer_Churn_Modeling (1).ipynb
• SageMaker_Pipelines_project.ipynb
• data/
• data/batch/batch.csv
• data/storedata_total.csv
• data/test/test.csv
• data/train/train.csv
• data/validation/validation.csv
• debugger/demo-smdebug-xgboost-churn-classificati-2026-01-25-02-44-54-400/debug-output/collections/000000000/worker_0_collections.json
• debugger/demo-smdebug-xgboost-churn-classificati-2026-01-25-02-44-54-400/debug-output/events/000000000000/000000000000_worker_0.tfevents
• debugger/demo-smdebug-xgboost-churn-classificati-2026-01-25-02-44-54-400/debug-output/events/000000000005/000000000005_worker_0.tfevents
• debugger/demo-smdebug-xgboost-churn-classificati-2026-01-25-02-44-54-400/debug-output/events/000000000010/000000000010_worker_0.tfevents
• debugger/demo-smdebug-xgboost-churn-classificati-2026-01-25-02-44-54-400/debug-output/events/000000000015/000000000015_worker_0.tfevents
• debugger/demo-smdebug

In [9]:
# Open a SageMaker session and report the SDK version and region it uses.
sagemaker_session = sagemaker.Session()
sdk_version = sagemaker.__version__
session_region = sagemaker_session.boto_region_name
print(f"SageMaker version: {sdk_version}")
print(f"Region: {session_region}")

# Execution role of the current environment, used for the processor and pipeline.
role = get_execution_role()
print(f"IAM Role: {role}")

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
SageMaker version: 2.254.1
Region: us-east-2
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
IAM Role: arn:aws:iam::871652478984:role/service-role/AmazonSageMakerAdminIAMExecutionRole_1


In [10]:
# Look up the scikit-learn container image for this region and framework version.
image_uri = retrieve(framework="sklearn", version=sklearn_processor_version, region=region)
print(f"Using image URI: {image_uri}")

Using image URI: 257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3


In [11]:
# Processor that runs a script in the scikit-learn container on a single
# ml.t3.medium instance, using the session and role set up earlier.
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version=sklearn_processor_version,
    instance_count=1,
    instance_type="ml.t3.medium",
)

In [12]:
# "InputData" pipeline parameter: S3 location of the raw dataset, defaulting
# to the copy uploaded to the bucket.
input_data = ParameterString(
    default_value=f"s3://{default_bucket}/data/storedata_total.csv",
    name="InputData",
)


In [13]:
# One ProcessingOutput per dataset split: each split is read from
# /opt/ml/processing/<split> in the container and stored under
# output/<split> in the bucket.
split_outputs = [
    ProcessingOutput(
        output_name=split,
        source=f"/opt/ml/processing/{split}",
        destination=f"s3://{default_bucket}/output/{split}",
    )
    for split in ("train", "validation", "test")
]

# Processing step that runs preprocess.py (from the bucket) on the dataset
# referenced by the InputData parameter.
step_process = ProcessingStep(
    name="ChurnModelProcess",
    processor=sklearn_processor,
    inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    outputs=split_outputs,
    code=f"s3://{default_bucket}/input/code/preprocess.py",
)


In [14]:
# Assemble the pipeline: a single processing step, parameterized by InputData.
pipeline = Pipeline(
    name=pipeline_name,
    sagemaker_session=sagemaker_session,
    steps=[step_process],
    parameters=[input_data],
)

In [15]:
# Print the pipeline definition (a JSON document) for inspection before submitting it.
print(pipeline.definition())

{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "InputData", "Type": "String", "DefaultValue": "s3://amazon-sagemaker-871652478984-us-east-2-bg1613sftbwgix/data/storedata_total.csv"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": "ml.t3.medium", "InstanceCount": 1, "VolumeSizeInGB": 30}}, "AppSpecification": {"ImageUri": "257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3", "ContainerEntrypoint": ["python3", "/opt/ml/processing/input/code/preprocess.py"]}, "RoleArn": "arn:aws:iam::871652478984:role/service-role/AmazonSageMakerAdminIAMExecutionRole_1", "ProcessingInputs": [{"InputName": "input-1", "AppManaged": false, "S3Input": {"S3Uri": {"Get": "Parameters.InputData"}, "LocalPath": "/opt/ml/processing/input", "S3

## Step 6: Submit the pipeline to SageMaker and start execution

In [16]:
# Submit the pipeline definition to SageMaker with `role` as its role ARN;
# the response (pipeline ARN and version id) is shown as the cell output.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:871652478984:pipeline/ChurnModelPipeline',
 'PipelineVersionId': 1,
 'ResponseMetadata': {'RequestId': '30ee4d71-0c21-42d7-b056-38822c128c77',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '30ee4d71-0c21-42d7-b056-38822c128c77',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '108',
   'date': 'Sun, 25 Jan 2026 22:31:39 GMT'},
  'RetryAttempts': 0}}

Start Pipeline Execution

In [17]:
# Start a pipeline execution and keep the returned handle for later status queries.
execution=pipeline.start()

Now we describe the execution instance and list its steps to find out more about the run.

In [18]:
# Show the execution's metadata (ARNs, status, timestamps, creator) as the cell output.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:871652478984:pipeline/ChurnModelPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-2:871652478984:pipeline/ChurnModelPipeline/execution/c28x873h74yj',
 'PipelineExecutionDisplayName': 'execution-1769380303577',
 'PipelineExecutionStatus': 'Executing',
 'PipelineExperimentConfig': {'ExperimentName': 'ChurnModelPipeline',
  'TrialName': 'c28x873h74yj'},
 'CreationTime': datetime.datetime(2026, 1, 25, 22, 31, 43, 465000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2026, 1, 25, 22, 31, 43, 465000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-2:871652478984:user-profile/d-cjsqoflq1fg3/ea03f8b6-f3cf-4727-9e72-9f38077cc3ce',
  'UserProfileName': 'ea03f8b6-f3cf-4727-9e72-9f38077cc3ce',
  'DomainId': 'd-cjsqoflq1fg3',
  'IamIdentity': {'Arn': 'arn:aws:sts::871652478984:assumed-role/AmazonSageMakerAdminIAMExecutionRole_1/SageMaker',
   'PrincipalId': 'AROA4V4UJKAEF3K5U7LZS:SageMaker',
   'Sou

We can list the execution steps to check out the status and artifacts:

In [19]:
# List each step of this execution with its status and metadata as the cell output.
execution.list_steps()

[{'StepName': 'ChurnModelProcess',
  'StartTime': datetime.datetime(2026, 1, 25, 22, 31, 44, 515000, tzinfo=tzlocal()),
  'StepStatus': 'Executing',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-2:871652478984:processing-job/pipelines-c28x873h74yj-ChurnModelProcess-Wk3ToVZ8qE'}},
  'AttemptCount': 1}]