In [1]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import TuningStep, ProcessingStep, CreateModelStep
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.model import Model
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.model_metrics import MetricsSource, ModelMetrics

# --- IAM role and sessions ---------------------------------------------------
# Execution role passed to the processors, the estimator, the model and to the
# pipeline itself further down in this cell.
role = "arn:aws:iam::339712893183:role/SagemakerNotebookRole"
# Plain session: used by the preprocessing processor and the estimator.
sagemaker_session = sagemaker.Session()
# Pipeline session: used by the evaluation processor, the Model and the Pipeline.
pipeline_session = PipelineSession()

# --- S3 locations ------------------------------------------------------------
# `bucket` holds the input data, model artifacts and evaluation reports;
# `prefix` is the key prefix handed to the first top-model lookup in the loop.
bucket = "s3-churn-predictor"
prefix = "output"

# Preprocessing: a single processing job that runs scripts/preprocessing.py on
# the raw CSV and publishes one train and one test output per window size.
preprocessing_script_processor = ScriptProcessor(
    role=role,
    image_uri=sagemaker.image_uris.retrieve(
        framework="sklearn",
        region=sagemaker_session.boto_region_name,
        version="0.23-1",
    ),
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.large",
    sagemaker_session=sagemaker_session,
)

preprocessing_step = ProcessingStep(
    name="PreprocessingStep",
    processor=preprocessing_script_processor,
    code="scripts/preprocessing.py",
    inputs=[
        ProcessingInput(
            source="s3://s3-churn-predictor/data/data.csv",
            destination="/opt/ml/processing/input",
        )
    ],
    # Six outputs, in the order train_15D, test_15D, train_30D, test_30D,
    # train_45D, test_45D. Each local directory /opt/ml/processing/<name> is
    # uploaded to s3://<bucket>/data/<name>; the output names are what the
    # downstream steps use to look the locations up.
    outputs=[
        ProcessingOutput(
            output_name=f"{split}_{window}D",
            source=f"/opt/ml/processing/{split}_{window}D",
            destination=f"s3://{bucket}/data/{split}_{window}D",
        )
        for window in (15, 30, 45)
        for split in ("train", "test")
    ],
)

# Search space for the tuner. The keys are the same names as the 'lr' and
# 'batch-size' defaults set on the estimator inside the loop below.
hyperparameter_ranges = {
    "lr": ContinuousParameter(min_value=0.0001, max_value=0.1),
    "batch-size": IntegerParameter(min_value=32, max_value=128),
}

# Model Registry group that every window size registers its model into.
# Loop-invariant, so it is defined once here and passed to RegisterModel below.
model_package_group_name = "ChurnModelPackageGroup"

# For each window size build: tuning -> evaluation -> (AUC gate) -> create + register model.
steps = []
for window_size in [15, 30, 45]:
    # Key prefix under which this window's training jobs write their artifacts.
    # It is reused for the top-model lookup so that lookup always points at the
    # same location as the estimator's output_path.
    output_prefix = f"output_{window_size}D"
    model_path = f"s3://{bucket}/{output_prefix}"

    # Where the evaluation job uploads its report. Shared by the processing
    # output and the model metrics so the two locations cannot drift apart.
    evaluation_s3_uri = f"s3://{bucket}/evaluation_{window_size}D"

    pytorch_estimator = PyTorch(
        entry_point='train.py',
        source_dir='scripts',
        role=role,
        instance_count=1,
        instance_type='ml.m5.large',
        framework_version='1.8.1',
        py_version='py3',
        sagemaker_session=sagemaker_session,
        hyperparameters={
            'epochs': 10,
            'bucket-name': bucket,
            'window-size': window_size,
            'train-data-path': f's3://{bucket}/data/train_{window_size}D/train.csv',
            'test-data-path': f's3://{bucket}/data/test_{window_size}D/test.csv',
            'lr': 0.001,  # Default learning rate (also listed in hyperparameter_ranges)
            'batch-size': 64  # Default batch size (also listed in hyperparameter_ranges)
        },
        output_path=model_path
    )

    # Tuner maximizing validation AUC. The metric value is extracted with the
    # regex below, so train.py must emit a line of the form
    # "Validation AUC: <number>".
    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name='validation:auc',
        objective_type='Maximize',
        metric_definitions=[
            {'Name': 'validation:auc', 'Regex': 'Validation AUC: ([0-9\\.]+)'}
        ],
        hyperparameter_ranges=hyperparameter_ranges,
        max_jobs=10,
        max_parallel_jobs=5
    )

    # Tuning step; both channels are wired to the preprocessing outputs, which
    # also makes this step depend on the preprocessing step.
    tuning_step = TuningStep(
        name=f'ChurnPredictorTuning_{window_size}D',
        tuner=tuner,
        inputs={
                "train": TrainingInput(
                    s3_data=preprocessing_step.properties.ProcessingOutputConfig.Outputs[
                        f"train_{window_size}D"
                    ].S3Output.S3Uri,
                    content_type="text/csv",
                ),
                "test": TrainingInput(
                    s3_data=preprocessing_step.properties.ProcessingOutputConfig.Outputs[
                        f"test_{window_size}D"
                    ].S3Output.S3Uri,
                    content_type="text/csv",
                )
        }
    )

    # S3 URI of the best training job's model artifact (top_k=0 is the top
    # ranked job). The prefix must match the estimator's output_path; the
    # earlier duplicate lookup that used the unrelated 'output' prefix was dead
    # code (its result was overwritten before use) and has been removed.
    top_model_s3_uri = tuning_step.get_top_model_s3_uri(
        top_k=0, s3_bucket=bucket, prefix=output_prefix
    )

    # Processor that runs the evaluation script
    evaluation_script_processor = ScriptProcessor(
        image_uri=sagemaker.image_uris.retrieve(framework='sklearn', region=sagemaker_session.boto_region_name, version='0.23-1'),
        command=['python3'],
        instance_type='ml.m5.large',
        instance_count=1,
        role=role,
        sagemaker_session=pipeline_session
    )

    # Exposes evaluation.json from the "evaluation" output to the pipeline so
    # the condition step can read the AUC out of it.
    evaluation_report = PropertyFile(
        name=f"ChurnEvaluationReport_{window_size}D",
        output_name="evaluation",
        path="evaluation.json",
    )

    # Evaluation step: receives the best model and this window's test set
    evaluation_step = ProcessingStep(
        name=f'EvaluationStep_{window_size}D',
        processor=evaluation_script_processor,
        inputs=[
            ProcessingInput(source=top_model_s3_uri, destination='/opt/ml/processing/models'),
            ProcessingInput(source=preprocessing_step.properties.ProcessingOutputConfig.Outputs[
                        f"test_{window_size}D"
                    ].S3Output.S3Uri, destination='/opt/ml/processing/test')
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source='/opt/ml/processing/evaluation', destination=evaluation_s3_uri)
        ],
        code='scripts/evaluate.py',
        job_arguments=[
            '--window-size', str(window_size)  # Pass the window size to the evaluation script
        ],
        property_files=[evaluation_report]
    )

    # Model object built from the best tuning job's artifact
    model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=top_model_s3_uri,
        role=role,
        sagemaker_session=pipeline_session
    )

    # Step that creates the SageMaker model from the top model's S3 URI
    create_model_step = CreateModelStep(
        name=f'CreateModelStep_{window_size}D',
        model=model,
        inputs=sagemaker.inputs.CreateModelInput(instance_type='ml.m5.large')
    )

    # Metrics attached to the registered model: the report written by the
    # evaluation step above.
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri=f"{evaluation_s3_uri}/evaluation.json",
            content_type="application/json"
        )
    )

    # Register the best model. The package group name is now actually passed;
    # previously the variable was defined but never used, leaving the
    # registration without a target group.
    register_model_step = RegisterModel(
        name=f"RegisterChurnModel_{window_size}D",
        estimator=pytorch_estimator,
        model_data=top_model_s3_uri,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        model_metrics=model_metrics
    )

    # Gate: create and register the model only when the AUC recorded in
    # evaluation.json is at least 0.50; otherwise do nothing.
    condition_step = ConditionStep(
        name=f"CheckAUC_{window_size}D",
        conditions=[
            ConditionGreaterThanOrEqualTo(
                left=JsonGet(
                    step_name=evaluation_step.name,
                    property_file=evaluation_report,
                    json_path="classification_metrics.auc_score.value"
                ),
                right=0.50
            )
        ],
        if_steps=[create_model_step, register_model_step],
        else_steps=[]
    )

    steps.extend([tuning_step, evaluation_step, condition_step])


# Define the pipeline: preprocessing first, then the per-window steps built above
pipeline = Pipeline(
    name='ChurnPredictorPipeline',
    parameters=[],
    steps=[preprocessing_step] + steps,
    sagemaker_session=pipeline_session
)

# Create the pipeline, or update its definition when a pipeline with this name
# already exists. Unlike create(), upsert() lets this cell be re-run without
# deleting the pipeline by hand first.
pipeline.upsert(role_arn=role)

# Start an execution; the returned execution handle is the cell's output.
pipeline.start()


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving i

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:339712893183:pipeline/ChurnPredictorPipeline/execution/xlv3o79f967n', sagemaker_session=<sagemaker.workflow.pipeline_context.PipelineSession object at 0x7fd4f41d3010>)

In [32]:
import boto3

# Name of the pipeline to remove
pipeline_name = 'ChurnPredictorPipeline'

# Low-level SageMaker API client
client = boto3.client(service_name='sagemaker')

# Delete the pipeline; the API response is displayed as the cell output
client.delete_pipeline(PipelineName=pipeline_name)


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:339712893183:pipeline/ChurnPredictorPipeline',
 'ResponseMetadata': {'RequestId': '2bc78de7-843d-4271-b5a9-f32fa0127e3a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2bc78de7-843d-4271-b5a9-f32fa0127e3a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '90',
   'date': 'Wed, 24 Jul 2024 20:50:07 GMT'},
  'RetryAttempts': 0}}

In [62]:
import sagemaker
from sagemaker.xgboost import XGBoost
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import TuningStep, ProcessingStep, CreateModelStep
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.model import Model
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.model_metrics import MetricsSource, ModelMetrics

# --- IAM role and sessions ---------------------------------------------------
# Execution role passed to the processor, the estimator and the pipeline below.
role = "arn:aws:iam::339712893183:role/SagemakerNotebookRole"
# Plain session: used by the preprocessing processor and the XGBoost estimator.
sagemaker_session = sagemaker.Session()
# Pipeline session: used by the Pipeline object at the end of this cell.
pipeline_session = PipelineSession()

# --- S3 locations ------------------------------------------------------------
# `bucket` holds the training data and the model output locations.
bucket = "s3-churn-predictor"
prefix = "output"

# Preprocessing: a single processing job that runs scripts/preprocessing.py on
# the raw CSV and publishes one train and one test output per window size.
# NOTE(review): this step is defined here but is not part of the `steps` list
# passed to the Pipeline at the end of this cell - confirm that is intended.
preprocessing_script_processor = ScriptProcessor(
    role=role,
    image_uri=sagemaker.image_uris.retrieve(
        framework="sklearn",
        region=sagemaker_session.boto_region_name,
        version="0.23-1",
    ),
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.large",
    sagemaker_session=sagemaker_session,
)

preprocessing_step = ProcessingStep(
    name="PreprocessingStep",
    processor=preprocessing_script_processor,
    code="scripts/preprocessing.py",
    inputs=[
        ProcessingInput(
            source="s3://s3-churn-predictor/data/data.csv",
            destination="/opt/ml/processing/input",
        )
    ],
    # Six outputs, in the order train_15D, test_15D, train_30D, test_30D,
    # train_45D, test_45D. Each local directory /opt/ml/processing/<name> is
    # uploaded to s3://<bucket>/data/<name>.
    outputs=[
        ProcessingOutput(
            output_name=f"{split}_{window}D",
            source=f"/opt/ml/processing/{split}_{window}D",
            destination=f"s3://{bucket}/data/{split}_{window}D",
        )
        for window in (15, 30, 45)
        for split in ("train", "test")
    ],
)

# Search space handed to the HyperparameterTuner for the XGBoost estimator:
# four continuous ranges and one integer range (max_depth).
hyperparameter_ranges = {
    "eta": ContinuousParameter(min_value=0.01, max_value=0.2),
    "max_depth": IntegerParameter(min_value=3, max_value=10),
    "min_child_weight": ContinuousParameter(min_value=1, max_value=10),
    "subsample": ContinuousParameter(min_value=0.5, max_value=1),
    "colsample_bytree": ContinuousParameter(min_value=0.5, max_value=1),
}

# Build one hyperparameter tuning step per window size.
steps = []
for window_size in [15, 30, 45]:
    # Training jobs for this window write their artifacts here
    model_path = f"s3://{bucket}/output_{window_size}D"

    xgboost_estimator = XGBoost(
        entry_point='train3.py',
        source_dir='scripts',
        role=role,
        instance_count=1,
        instance_type='ml.m5.large',
        framework_version='1.5-1',
        py_version='py3',
        sagemaker_session=sagemaker_session,
        hyperparameters={
            'num_round': 100,
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'bucket-name': bucket,
            'window-size': window_size,
            'train-data-path': f's3://{bucket}/data/train_{window_size}D/train.csv',
            'test-data-path': f's3://{bucket}/data/test_{window_size}D/test.csv'
        },
        output_path=model_path
    )

    # Tuner maximizing validation AUC. The metric value is extracted with the
    # regex below, so train3.py must emit a line of the form
    # "Validation AUC: <number>".
    tuner = HyperparameterTuner(
        estimator=xgboost_estimator,
        objective_metric_name='validation:auc',
        objective_type='Maximize',
        metric_definitions=[
            {'Name': 'validation:auc', 'Regex': 'Validation AUC: ([0-9\\.]+)'}
        ],
        hyperparameter_ranges=hyperparameter_ranges,
        max_jobs=20,
        max_parallel_jobs=8
    )

    # Tuning step reading the already-preprocessed CSVs straight from S3.
    tuning_step = TuningStep(
        name=f'ChurnPredictorTuning_{window_size}D',
        tuner=tuner,
        inputs={
            "train": TrainingInput(
                s3_data=f's3://{bucket}/data/train_{window_size}D/train.csv',
                content_type="text/csv",
            ),
            # Fixed: the test channel previously pointed into the train_<N>D
            # folder. The test split lives under data/test_<N>D/, matching the
            # 'test-data-path' hyperparameter above and the preprocessing
            # step's test output destination.
            "test": TrainingInput(
                s3_data=f's3://{bucket}/data/test_{window_size}D/test.csv',
                content_type="text/csv",
            )
        }
    )

    steps.append(tuning_step)

# Define the pipeline from the tuning steps built above
pipeline = Pipeline(
    name='ChurnPredictorPipeline2',
    parameters=[],
    steps=steps,
    sagemaker_session=pipeline_session
)

# Create the pipeline, or update its definition when a pipeline with this name
# already exists. Unlike create(), upsert() lets this cell be re-run without
# deleting the pipeline by hand first.
pipeline.upsert(role_arn=role)

# Start an execution; the returned execution handle is the cell's output.
pipeline.start()


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.
INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.
INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.


_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:339712893183:pipeline/ChurnPredictorPipeline2/execution/ppjhypx3yavl', sagemaker_session=<sagemaker.workflow.pipeline_context.PipelineSession object at 0x7fe286f4e4d0>)

In [60]:
import boto3

# Name of the pipeline to remove
pipeline_name = 'ChurnPredictorPipeline2'

# Low-level SageMaker API client
client = boto3.client(service_name='sagemaker')

# Delete the pipeline; the API response is displayed as the cell output
client.delete_pipeline(PipelineName=pipeline_name)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:339712893183:pipeline/ChurnPredictorPipeline2',
 'ResponseMetadata': {'RequestId': 'b2aebb8a-6b45-41a1-851d-94da8577b835',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b2aebb8a-6b45-41a1-851d-94da8577b835',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '91',
   'date': 'Thu, 25 Jul 2024 01:24:12 GMT'},
  'RetryAttempts': 0}}