In [1]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd


sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [2]:
# Replace this value with the name of the S3 bucket you created.
# The preprocessing and upload cells below read this name.
default_bucket = "rosa-churn-data-2025"

In [3]:
# Resolve the AWS region, the execution role and a SageMaker session.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# Identifiers for the scikit-learn processor version, the model package group
# and the pipeline.
sklearn_processor_version = "0.23-1"
model_package_group_name = "ChurnModelPackageGroup"
pipeline_name = "ChurnModelSMPipeline"

# NOTE(review): despite the variable name, this looks up the scikit-learn
# image URI for the version above — confirm that is the intended image.
clarify_image = sagemaker.image_uris.retrieve(
    framework="sklearn",
    version=sklearn_processor_version,
    region=region,
)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [4]:
def preprocess_data(file_path):
    """Load the churn CSV and return the engineered feature frame.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (local path, URL,
            or file-like object).

    Returns:
        A DataFrame in which:
        * rows with a null in any column are removed, including rows whose
          ``firstorder`` / ``lastorder`` could not be parsed as dates;
        * ``first_last_days_diff`` holds the whole days from ``firstorder``
          to ``lastorder``;
        * ``created_first_days_diff`` holds the whole days from
          ``firstorder`` to ``created``;
        * ``custid``, ``created``, ``firstorder`` and ``lastorder`` are
          dropped;
        * ``favday`` and ``city`` are replaced by one-hot columns holding
          integer 0/1 values (``uint8``), appended after the other columns.

    Raises:
        Whatever ``pandas.to_datetime`` raises when a surviving ``created``
        value cannot be parsed; that column is parsed strictly.
    """
    df = pd.read_csv(file_path)

    ## Convert to datetime columns; unparseable values become NaT so the
    ## dropna below removes those rows.
    df["firstorder"] = pd.to_datetime(df["firstorder"], errors="coerce")
    df["lastorder"] = pd.to_datetime(df["lastorder"], errors="coerce")

    ## Drop rows with null values. Copy so the column assignments below
    ## write to an independent frame.
    df = df.dropna().copy()

    ## Days between the first order and the last order
    df["first_last_days_diff"] = (df["lastorder"] - df["firstorder"]).dt.days

    ## Days between the first order and the customer record creation date
    df["created"] = pd.to_datetime(df["created"])
    df["created_first_days_diff"] = (df["created"] - df["firstorder"]).dt.days

    ## Drop the identifier and the raw date columns
    df = df.drop(columns=["custid", "created", "firstorder", "lastorder"])

    ## One-hot encode favday and city. Pin the dtype: pandas >= 2.0 defaults
    ## to bool, which a headerless CSV would serialize as True/False instead
    ## of 0/1.
    return pd.get_dummies(
        df,
        prefix=["favday", "city"],
        columns=["favday", "city"],
        dtype="uint8",
    )

In [5]:

# Build the baseline from the local copy of the raw data. This is the same
# file a later cell uploads to churn/churndata.csv, so reading it locally
# keeps this cell from depending on that S3 object already existing.
baseline_data = preprocess_data("churndata.csv")

# Remove the `retained` column (presumably the label — confirm) so that only
# the remaining columns are written out.
baseline_data.pop("retained")

# Fixed seed so the sampled baseline is the same on every run.
baseline_sample = baseline_data.sample(frac=0.0002, random_state=42)

In [6]:
# Write the sampled baseline rows as a bare CSV (no header row, no index column).
baseline_sample.to_csv("baseline.csv", index=False, header=False)

In [7]:
# Run the same preprocessing over the local raw data for the batch input.
batch_data = preprocess_data("churndata.csv")

# Remove the `retained` column (presumably the label — confirm) so that only
# the remaining columns are written out.
batch_data.pop("retained")

# Fixed seed so the 20% batch sample is the same on every run.
batch_sample = batch_data.sample(frac=0.2, random_state=42)

In [8]:
# Write the sampled batch rows as a bare CSV (no header row, no index column).
batch_sample.to_csv("batch.csv", index=False, header=False)

In [22]:
# boto3 S3 *resource* handle. It keeps the name `s3_client` because the
# following cell reuses it.
s3_client = boto3.resource('s3')
churn_bucket = s3_client.Bucket(default_bucket)

# (local file, destination object key) pairs, uploaded in this order.
for local_path, object_key in (
    ("churndata.csv", "churn/churndata.csv"),
    ("batch.csv", "data/batch/batch.csv"),
    ("baseline.csv", "data/baseline.csv"),
):
    churn_bucket.upload_file(local_path, object_key)

In [23]:
# Upload the pipeline scripts; each one keeps its local relative path as its
# S3 object key.
for script_path in (
    "pipelines/customerchurn/Preprocess.py",
    "pipelines/customerchurn/Evaluate.py",
    "pipelines/customerchurn/Generate_config.py",
):
    s3_client.Bucket(default_bucket).upload_file(script_path, script_path)

In [24]:
import boto3

# Set your bucket name
bucket_name = "rosa-churn-data-2025"

# Create an S3 client
s3 = boto3.client("s3")

# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page with the paginator instead of reading one response.
paginator = s3.get_paginator("list_objects_v2")

# Print the header once, just before the first key, so that an empty bucket
# still gets the "is empty" message instead.
found_objects = False
for page in paginator.paginate(Bucket=bucket_name):
    # A page with no objects has no "Contents" entry.
    for obj in page.get("Contents", []):
        if not found_objects:
            print(f"Contents of bucket '{bucket_name}':\n")
            found_objects = True
        print("•", obj["Key"])

if not found_objects:
    print(f"Bucket '{bucket_name}' is empty.")

Contents of bucket 'rosa-churn-data-2025':

• churn/churndata.csv
• clarify-output/bias/analysis_config.json
• clarify/bias-report/analysis_config.json
• clarify/clarify_bias_config.json
• data/baseline.csv
• data/batch.csv
• data/batch/batch.csv
• data/test/test.csv
• data/train/train.csv
• data/validation/validation.csv
• debugger/demo-smdebug-xgboost-churn-classificati-2025-04-03-18-41-47-373/debug-output/collections/000000000/worker_0_collections.json
• debugger/demo-smdebug-xgboost-churn-classificati-2025-04-03-18-41-47-373/debug-output/events/000000000000/000000000000_worker_0.tfevents
• debugger/demo-smdebug-xgboost-churn-classificati-2025-04-03-18-41-47-373/debug-output/events/000000000005/000000000005_worker_0.tfevents
• debugger/demo-smdebug-xgboost-churn-classificati-2025-04-03-18-41-47-373/debug-output/events/000000000010/000000000010_worker_0.tfevents
• debugger/demo-smdebug-xgboost-churn-classificati-2025-04-03-18-41-47-373/debug-output/events/000000000015/000000000015_wo

# Get the Pipeline Instance

In [12]:
import sagemaker
# Show the version of the installed SageMaker Python SDK.
print(sagemaker.__version__)

2.227.0


In [27]:
import sagemaker

# Initialize a SageMaker session.
# NOTE(review): this rebinds `sagemaker_session`, which the configuration
# cell near the top of the notebook already sets.
sagemaker_session = sagemaker.Session()

# Get the current AWS region from that session and print it.
# This also rebinds `region`.
region = sagemaker_session.boto_region_name
print(f"Region: {region}")

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
Region: us-east-2


In [26]:
import sagemaker

# Get the current AWS region from a newly created SageMaker session.
# NOTE(review): the session and region lookup repeats the cell directly above.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
print(f"Region: {region}")

# Get the IAM role associated with your SageMaker execution and print its ARN.
role = sagemaker.get_execution_role()
print(f"IAM Role: {role}")

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
Region: us-east-2
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
IAM Role: arn:aws:iam::779846812208:role/datazone_usr_role_3uvk2n13teqizt_58a7eoris5xezt


In [1]:
import sagemaker

# Import the get_pipeline function from your Pipeline.py
from pipelines.customerchurn.Pipeline import get_pipeline

# Resolve the region and execution role from the running environment rather
# than hard-coding them. This keeps the account-specific role ARN out of the
# notebook source and lets the cell run in another account or region.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

# Define your parameters
default_bucket = "rosa-churn-data-2025"
model_package_group_name = "ChurnModelPackageGroup"
pipeline_name = "ChurnModelPipeline"
sklearn_processor_version = "0.23-1"

# Get the pipeline
pipeline = get_pipeline(
    region=region,
    role=role,
    default_bucket=default_bucket,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
    sklearn_processor_version=sklearn_processor_version,
)

# Print the pipeline definition to verify
print(pipeline.definition())

sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.SecurityGroupIds
{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.t3.medium

In [2]:
# Print the pipeline definition (a JSON document) for inspection.
# NOTE(review): the cell that builds the pipeline already prints this.
print(pipeline.definition())



{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.t3.medium"}, {"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.t3.medium"}, {"Name": "InputData", "Type": "String", "DefaultValue": "s3://rosa-churn-data-2025/churn/churndata.csv"}, {"Name": "BatchData", "Type": "String", "DefaultValue": "s3://rosa-churn-data-2025/data/batch/batch.csv"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnPreprocessing", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": "ml.t3.medium", "InstanceCount": 1, "VolumeSizeInGB": 30}}, "AppSpecification": {"ImageUri": "257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3", "ContainerEntrypoint": ["python3

# Step 5: Submit the pipeline to SageMaker and start execution

In [3]:
# Submit the pipeline definition to SageMaker using the execution role; the
# response below includes the PipelineArn.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:779846812208:pipeline/ChurnModelPipeline',
 'ResponseMetadata': {'RequestId': 'decb1e6b-1dff-4c63-b6ea-0fbd13d02c8d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'decb1e6b-1dff-4c63-b6ea-0fbd13d02c8d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '86',
   'date': 'Fri, 04 Apr 2025 20:29:34 GMT'},
  'RetryAttempts': 0}}

In [4]:
# Start an execution of the pipeline and keep the handle for the status
# cells that follow.
execution = pipeline.start()

In [5]:
# Show the execution's current details (status, ARNs, timestamps, creator).
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:779846812208:pipeline/ChurnModelPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-2:779846812208:pipeline/ChurnModelPipeline/execution/q0e3s7qfs8kj',
 'PipelineExecutionDisplayName': 'execution-1743798592391',
 'PipelineExecutionStatus': 'Executing',
 'PipelineExperimentConfig': {'ExperimentName': 'churnmodelpipeline',
  'TrialName': 'q0e3s7qfs8kj'},
 'CreationTime': datetime.datetime(2025, 4, 4, 20, 29, 52, 301000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 4, 4, 20, 29, 52, 301000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-2:779846812208:user-profile/d-lqa751s9mjw5/27be00e0-cbf8-48dd-81dc-f72c9d586d61',
  'UserProfileName': '27be00e0-cbf8-48dd-81dc-f72c9d586d61',
  'DomainId': 'd-lqa751s9mjw5',
  'IamIdentity': {'Arn': 'arn:aws:sts::779846812208:assumed-role/datazone_usr_role_3uvk2n13teqizt_58a7eoris5xezt/SageMaker',
   'PrincipalId': 'AROA3LET6AIYPT3VGWGQ4:SageMaker',


In [6]:
# List the steps of this execution along with each step's status.
execution.list_steps()

[{'StepName': 'ChurnPreprocessing',
  'StartTime': datetime.datetime(2025, 4, 4, 20, 29, 53, 614000, tzinfo=tzlocal()),
  'StepStatus': 'Executing',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-2:779846812208:processing-job/pipelines-q0e3s7qfs8kj-ChurnPreprocessing-CmNChvr6vI'}},
  'AttemptCount': 1}]