# Load Data
We already have the raw data in S3 from our previous pipeline iterations.

# Define a Pipeline
Building on the previous pipelines, we will now incorporate a custom training script that trains a scikit-learn Decision Tree Classifier model.

The main changes from previous pipelines are summarised below:
* Adapted quality gate: the model's validation accuracy is compared against the `MinValidationAccuracyToRegisterModel` pipeline parameter in a ConditionStep. If the accuracy is high enough, the model is registered in the model registry; otherwise a FailStep ends the pipeline.
* Instead of using SageMaker's built-in XGBoost algorithm, we will use the SKLearn estimator with a custom training script.


In [12]:
import sagemaker

from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession

from sagemaker.workflow.steps import ProcessingStep
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import (
    ProcessingInput,
    ProcessingOutput
)

from sagemaker.inputs import TrainingInput

from sagemaker.workflow.steps import TrainingStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.condition_step import ConditionStep

from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo, ConditionGreaterThanOrEqualTo
from sagemaker.workflow.functions import Join

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
    ParameterBoolean
)

Define pipeline parameters

In [13]:
# Session objects, execution role and the S3 bucket shared by the cells below.
session = sagemaker.Session()
role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()  # handed to the estimator and the model later on
bucket = 'sagemaker-fraud-detection-ml'

# Pipeline parameters.
# Tree depth, forwarded to the training script as the 'max-depth' hyperparameter.
tree_max_depth = ParameterInteger(name='TreeMaxDepth', default_value=5)
# Threshold the validation accuracy is compared against before model registration.
min_validation_accuracy = ParameterFloat(
    name='MinValidationAccuracyToRegisterModel', default_value=0.85
)

Define processing step

In [14]:
# Container-side directory the raw data is placed in; the same path is passed
# to data_processing.py through the --input-data argument.
_raw_data_dir = '/opt/ml/processing/input/data/'

processor = SKLearnProcessor(
    framework_version='1.0-1',
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
)

# Where the processor needs to look to find the raw data.
_raw_data_input = ProcessingInput(
    input_name='raw-data',
    source=f's3://{bucket}/raw_data.csv',
    destination=_raw_data_dir,
    s3_data_distribution_type='ShardedByS3Key',
)

# Where the processor needs to look to find processed data to upload to S3:
# one output per dataset split, each read from its own sub-directory.
_split_outputs = [
    ProcessingOutput(
        output_name=split,
        source=f'/opt/ml/processing/output/{split}',
        s3_upload_mode='EndOfJob',
    )
    for split in ('train', 'validation', 'test')
]

preprocessing_step = ProcessingStep(
    name='FraudDetectionProcess',
    processor=processor,
    code='data_processing.py',
    inputs=[_raw_data_input],
    outputs=_split_outputs,
    job_arguments=['--input-data', _raw_data_dir],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


Define processed input data & estimator to train

In [15]:
def _csv_training_input(output_name, file_name):
    """Build a CSV TrainingInput that points at one file of a preprocessing output.

    The S3 location is not hard-coded: it is the S3Uri property of the
    preprocessing step's output called ``output_name``, joined with
    ``file_name`` using '/'.
    """
    output_uri = (
        preprocessing_step.properties
        .ProcessingOutputConfig.Outputs[output_name]
        .S3Output.S3Uri
    )
    return TrainingInput(
        s3_data=Join(on='/', values=[output_uri, file_name]),
        content_type='csv',
    )


# Training inputs are dynamically linked to the S3 outputs of the preprocessing step.
s3_input_train = _csv_training_input('train', 'train.csv')
s3_input_validate = _csv_training_input('validation', 'validation.csv')

In [16]:
from sagemaker.sklearn.estimator import SKLearn

# Hyperparameters handed to train.py. The tree depth is taken from the
# TreeMaxDepth pipeline parameter rather than being hard-coded.
hyperparameters = {
    'max-depth': tree_max_depth,
    'min-samples-split': 2,
}

# Each regex tells SageMaker what to look for in the training logs; whatever it
# matches is reported as the named metric in the SageMaker Pipeline.
# 'validation:accuracy' is the metric the model-registration condition reads.
metric_definitions = [
    {'Name': 'training:accuracy', 'Regex': 'train_acc: ([0-9.]+)'},
    {'Name': 'validation:accuracy', 'Regex': 'val_acc: ([0-9.]+)'},
]

# SKLearn estimator that runs the custom training script train.py from the
# current directory, using the pipeline session.
estimator = SKLearn(
    entry_point='train.py',
    source_dir='.',
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    py_version='py3',
    framework_version='1.0-1',
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    input_mode='File',
    sagemaker_session=pipeline_session,
)

In [17]:
# Channel name -> input for the training job; both channels come from the
# preprocessing step's outputs.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validate,
}

# The result of estimator.fit(...) is used as the step arguments.
training_step = TrainingStep(
    name='TrainingStep',
    step_args=estimator.fit(inputs=training_channels),
)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.


Register model step (runs only if the condition step passes)

In [18]:
from sagemaker.model import Model
from sagemaker.workflow.model_step import ModelStep

# Model artifacts are taken from the training step's properties, so the model
# always points at whatever that step produced.
trained_model_artifacts = training_step.properties.ModelArtifacts.S3ModelArtifacts

model = Model(
    image_uri=estimator.image_uri,
    model_data=trained_model_artifacts,
    # train.py is reused as the entry point because it also contains the
    # functions to be executed during inference.
    entry_point=estimator.entry_point,
    sagemaker_session=pipeline_session,
    role=role,
)

# Settings for the model package created in the model registry.
registration_settings = {
    'content_types': ['text/csv'],
    'response_types': ['text/csv'],
    'inference_instances': ['ml.m5.large'],
    'transform_instances': ['ml.m5.large'],
    'model_package_group_name': 'fraud-detection-model-group',
    'approval_status': 'PendingManualApproval',
}
register_args = model.register(**registration_settings)

register_model_step = ModelStep(
    name='FraudDetectionRegisterModel',
    step_args=register_args,
)

Fail the pipeline in the event the validation accuracy is too low

In [19]:
# Terminal step used as the else-branch of the model-registration condition.
# It is reached when the validation accuracy is below min_validation_accuracy,
# so the message names that metric and threshold. Join(on=' ') supplies the
# separating space, so the literal carries no trailing space.
fail_step = FailStep(
    name='FailStep',
    error_message=Join(
        on=' ',
        values=[
            'Pipeline failed because validation accuracy was below the minimum threshold of',
            min_validation_accuracy,
        ],
    ),
)

In [20]:
# Validation accuracy reported by the training step (the 'validation:accuracy'
# entry of its final metric list).
validation_accuracy = training_step.properties.FinalMetricDataList['validation:accuracy'].Value

# Quality gate: validation accuracy must be >= the configured minimum.
accuracy_gate = ConditionGreaterThanOrEqualTo(
    left=validation_accuracy,
    right=min_validation_accuracy,
)

# Register the model when the gate holds, otherwise run the fail step.
condition_step = ConditionStep(
    name='ModelRegistrationConditionStep',
    conditions=[accuracy_gate],
    if_steps=[register_model_step],
    else_steps=[fail_step],
)

Bring steps together to form the pipeline

In [21]:
# Steps listed on the pipeline itself; the register-model and fail steps are
# attached through condition_step's if/else branches rather than listed here.
pipeline_steps = [preprocessing_step, training_step, condition_step]

# Parameters exposed by the pipeline.
pipeline_parameters = [tree_max_depth, min_validation_accuracy]

pipeline = Pipeline(
    name='fraud-detection-model-pipeline',
    steps=pipeline_steps,
    parameters=pipeline_parameters,
)

In [22]:
# Submit the pipeline definition to SageMaker under the execution role.
# Kept as the last bare expression so the notebook displays the service
# response (which includes the PipelineArn) as the cell output.
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:eu-north-1:263108256547:pipeline/fraud-detection-model-pipeline',
 'ResponseMetadata': {'RequestId': '556b59bd-df2c-41cd-96a8-cc866772b99c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '556b59bd-df2c-41cd-96a8-cc866772b99c',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '121',
   'date': 'Fri, 24 Oct 2025 14:52:29 GMT'},
  'RetryAttempts': 0}}

In [23]:
# programmatically start the pipeline
# pipeline.start(
#     execution_display_name='conditional-model-registration',
#     execution_description='Starting from the SageMaker Studio'
# )

After running the pipeline in the UI, a validation accuracy of 94% was achieved, so the model was registered in the model registry.