# Load Data

In [7]:
import boto3


# Print the key of every object stored under the "datasets/tabular" prefix
# of the region-specific SageMaker example-files bucket.
region = boto3.Session().region_name
s3 = boto3.client('s3')

# the listing is consumed page by page through a paginator
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(
    Bucket=f"sagemaker-example-files-prod-{region}",
    Prefix="datasets/tabular",
)

for page in pages:
    # fall back to an empty list when a page has no 'Contents' key
    contents = page.get('Contents', [])
    for obj in contents:
        print(obj['Key'])


datasets/tabular/anomaly_benchmark_taxi/NAB_nyc_taxi.csv
datasets/tabular/atlas_higgs_boson_2014/atlas-higgs-challenge-2014-v2.csv
datasets/tabular/brazil_houses/kaggle_brazil_houses_rental_data.csv
datasets/tabular/breast_cancer/breast-cancer-wisconsin.csv
datasets/tabular/breast_cancer/wdbc.csv
datasets/tabular/california_housing/cal_housing.tgz
datasets/tabular/chicago_traffic/README.md
datasets/tabular/chicago_traffic/speed_camera_violations.csv
datasets/tabular/customer-churn/customer-churn-data-v1.zip
datasets/tabular/customer-churn/customer-churn-data-v2.zip
datasets/tabular/customer-churn/customer-churn-data.zip
datasets/tabular/dirty-titanic/titanic-dirty-4.csv
datasets/tabular/fleet-predictive-maintenance/example_fleet_info.csv
datasets/tabular/fleet-predictive-maintenance/example_fleet_sensor_logs.csv
datasets/tabular/fraud_detection/synthethic_fraud_detection_SA/churn.txt
datasets/tabular/fraud_detection/synthethic_fraud_detection_SA/identity.csv
datasets/tabular/fraud_dete

In [8]:
# Region of the default boto3 session; the example-files bucket name embeds it.
REGION = boto3.Session().region_name
# Bucket that hosts the example datasets.
DATA_HOST = f"sagemaker-example-files-prod-{REGION}"
# Key prefix of the fraud-detection dataset. The "synthethic" spelling is
# intentional: it matches the object keys listed from the bucket.
DATA_PATH = "datasets/tabular/fraud_detection/synthethic_fraud_detection_SA/"
# Object to download; the same name is used for the local copy.
ARCHIVE_NAME = "transaction.csv"

In [3]:
# Copy the transactions file from the example-files bucket into the notebook
# instance's working directory, keeping the same file name.
s3 = boto3.client('s3')
s3.download_file(
    Bucket=DATA_HOST,
    Key=DATA_PATH + ARCHIVE_NAME,
    Filename=ARCHIVE_NAME,
)

In [9]:
# Project bucket; the pipeline's processing step reads raw_data.csv from it.
bucket = 'sagemaker-fraud-detection-ml'

# upload_file takes (Filename, Bucket, Key). The bucket is a required argument,
# so all three are passed by keyword to keep the call unambiguous.
s3.upload_file(Filename='transaction.csv', Bucket=bucket, Key='raw_data.csv')

# Define a Pipeline
The pipeline will include a preprocessing step that will assume input data is in a particular format and then conduct the following steps:
* Clean data
* Engineer new features
* Perform train-test split

Additionally, the pipeline lets users supply training parameters when they execute it, so the values are not hard-coded. The parameters are:
* **Tree Max Depth**: the maximum depth allowed for each tree in the XGBoost model
* **Validation Loss**: the maximum loss, measured by running inference against the validation dataset, at which the model is still registered to the model registry

In [20]:
import sagemaker

from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession

from sagemaker.workflow.steps import ProcessingStep
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import (
    ProcessingInput,
    ProcessingOutput
)

from sagemaker.inputs import TrainingInput

from sagemaker.workflow.steps import TrainingStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.condition_step import ConditionStep

from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.functions import Join

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
    ParameterBoolean
)

In [21]:
# Sessions, execution role and project bucket shared by the cells below.
session = sagemaker.Session()
role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()
bucket = 'sagemaker-fraud-detection-ml'

# Pipeline parameters, each declared with a name and a default value.
tree_max_depth = ParameterInteger(name='TreeMaxDepth', default_value=5)
validation_loss = ParameterFloat(name='ValidationLoss', default_value=0.2)

Define estimator to run in a pipeline session

In [22]:
# Container image for the XGBoost algorithm in the session's region.
xgb_image = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version='1.5-1',
)

# The estimator is bound to the pipeline session and writes its output under
# the project bucket.
estimator = sagemaker.estimator.Estimator(
    image_uri=xgb_image,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/output',
    sagemaker_session=pipeline_session,
)

# max_depth is taken from the TreeMaxDepth pipeline parameter rather than a
# fixed value; the other hyperparameters are constants.
estimator.set_hyperparameters(
    objective='binary:logistic',
    num_round=5,
    max_depth=tree_max_depth,
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Define each step of the pipeline individually. Note that the processing script has no scikit-learn dependencies, but it does explicitly depend on pandas, which comes pre-installed in the image used by the SKLearnProcessor.

In [23]:
# Processor that runs the processing script on a single ml.m5.large instance.
processor = SKLearnProcessor(
    framework_version='1.0-1',
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
)

# where the processor looks to find the raw data
raw_data_input = ProcessingInput(
    input_name='raw-data',
    source=f's3://{bucket}/raw_data.csv',
    destination='/opt/ml/processing/input/data/',
    s3_data_distribution_type='ShardedByS3Key',
)

# where the processor looks to find processed data to upload to s3:
# one output per data split, each read from its own directory
split_outputs = [
    ProcessingOutput(
        output_name=split,
        source=f'/opt/ml/processing/output/{split}',
        s3_upload_mode='EndOfJob',
    )
    for split in ('train', 'validation', 'test')
]

preprocessing_step = ProcessingStep(
    name='FraudDetectionProcess',
    processor=processor,
    code='data_processing.py',
    inputs=[raw_data_input],
    outputs=split_outputs,
    # the script is told where the raw data was placed inside the container
    job_arguments=['--input-data', '/opt/ml/processing/input/data/'],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [24]:
# Training inputs are linked to the S3 outputs of the preprocessing step
# through its step properties; each input points at one CSV file inside the
# corresponding output location.
step_outputs = preprocessing_step.properties.ProcessingOutputConfig.Outputs

s3_input_train = TrainingInput(
    s3_data=Join(on='/', values=[step_outputs['train'].S3Output.S3Uri, 'train.csv']),
    content_type='csv',
)
s3_input_validate = TrainingInput(
    s3_data=Join(on='/', values=[step_outputs['validation'].S3Output.S3Uri, 'validation.csv']),
    content_type='csv',
)

In [25]:
# Channel name -> training input handed to the estimator's fit call.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validate,
}

# The estimator is bound to the pipeline session, and the result of fit()
# is passed to the step as its arguments.
training_step = TrainingStep(
    name='TrainingStep',
    step_args=estimator.fit(inputs=training_channels),
)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.


In [26]:
from sagemaker.model import Model
from sagemaker.workflow.model_step import ModelStep

# Model built from the artifacts of the training step, using the same
# XGBoost image that was used for training.
model = Model(
    image_uri=xgb_image,
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    sagemaker_session=pipeline_session,
)

# Registration arguments: the model package goes into the fraud-detection
# group with approval status 'PendingManualApproval'.
register_args = model.register(
    model_package_group_name='fraud-detection-model-group',
    approval_status='PendingManualApproval',
    content_types=['text/csv'],
    response_types=['text/csv'],
    inference_instances=['ml.m5.large'],
    transform_instances=['ml.m5.large'],
)

register_model_step = ModelStep(name='FraudDetectionRegisterModel', step_args=register_args)

Fail the pipeline in the event the validation loss is too high

In [27]:
# Terminal step for the else-branch of the model-registration condition.
# That condition registers the model when the validation log loss is <= the
# ValidationLoss parameter, so this step runs only when the loss is strictly
# greater than the threshold; the message says "exceeded" accordingly.
fail_step = FailStep(
    name='FailStep',
    # Join inserts the ' ' separator itself, so the text has no trailing space
    error_message=Join(
        on=' ',
        values=['Pipeline failed: validation log loss exceeded the threshold of', validation_loss],
    ),
)

In [28]:
# True when the validation log loss reported by the training step is at or
# below the ValidationLoss pipeline parameter.
loss_within_threshold = ConditionLessThanOrEqualTo(
    left=training_step.properties.FinalMetricDataList['validation:logloss'].Value,
    right=validation_loss,
)

# Register the model when the condition holds; otherwise fail the pipeline.
condition_step = ConditionStep(
    name='ModelRegistrationConditionStep',
    conditions=[loss_within_threshold],
    if_steps=[register_model_step],
    else_steps=[fail_step],
)

Bring steps together to form the pipeline

In [29]:
# Assemble the pipeline from its top-level steps. The register and fail steps
# are not listed here because they are attached to the condition step's
# branches.
pipeline = Pipeline(
    name='fraud-detection-model-pipeline',
    parameters=[tree_max_depth, validation_loss],
    steps=[preprocessing_step, training_step, condition_step],
)

In [30]:
# create the pipeline in SageMaker, or update it if it already exists,
# using the notebook's execution role
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:eu-north-1:263108256547:pipeline/fraud-detection-model-pipeline',
 'ResponseMetadata': {'RequestId': 'b52cc135-fc81-40d6-8901-d16c8f1e66ef',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b52cc135-fc81-40d6-8901-d16c8f1e66ef',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '121',
   'date': 'Fri, 24 Oct 2025 08:35:53 GMT'},
  'RetryAttempts': 0}}

In [56]:
# Programmatically start an execution of the pipeline. No parameter overrides
# are passed in this call; the display name and description only label the
# execution.
pipeline.start(
    execution_display_name='conditional-model-registration',
    execution_description='Starting from the SageMaker Studio'
)

_PipelineExecution(arn='arn:aws:sagemaker:eu-north-1:263108256547:pipeline/fraud-detection-model-pipeline/execution/vorq2e7i21r5', sagemaker_session=<sagemaker.session.Session object at 0x7f5a98f42660>)