In [2]:
import datetime
import json
import os
import random
import time
from time import gmtime, strftime, sleep

import boto3
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import sagemaker
import sagemaker.session
# import sagemaker_containers

from sagemaker.automl.automl import AutoML, AutoMLInput
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.predictor import Predictor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor
from sagemaker.s3 import s3_path_join, S3Downloader, S3Uploader
from sagemaker.serializers import CSVSerializer
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.transformer import Transformer

from sagemaker.workflow.automl_step import AutoMLStep
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.functions import Join
from sagemaker.workflow.functions import Join, JsonGet
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString
)
from sagemaker.workflow.parameters import ParameterFloat, ParameterInteger, ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.steps import TransformStep, TransformInput

In [3]:
# SageMaker session and the region / role / bucket context derived from it.
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = "custom_preprocessing"

# Session object handed to the pipeline-step definitions further down.
pipeline_session = PipelineSession()
sagemaker_client = boto3.client("sagemaker")

# Date-only UTC suffix; "%Y-%m-%d-%H%M%S" would include the time of day too.
timestamp_suffix = strftime("%Y-%m-%d", gmtime())
folder_name = f"{prefix}-{timestamp_suffix}"
prefix_path = "s3://{}/{}".format(bucket, folder_name)

In [None]:
# Sample CSV splits stored under "<prefix>/sample_data" in the default bucket.
train_input = "s3://{}/{}/sample_data/train.csv".format(bucket, prefix)
test_input = "s3://{}/{}/sample_data/test.csv".format(bucket, prefix)
validate_input = "s3://{}/{}/sample_data/validate.csv".format(bucket, prefix)

In [4]:
# Resource tags, in the Key/Value dict form, passed to the processor,
# estimator, transformer and pipeline created in this notebook.
tags = [
    {"Key": tag_key, "Value": tag_value}
    for tag_key, tag_value in (
        ("PLATFORM", "FO-ML"),
        ("BUSINESS_REGION", "GLOBAL"),
        ("BUSINESS_UNIT", "MOBILITY"),
        ("CLIENT", "MULTI_TENANT"),
    )
]

In [None]:
# Model package group name in the model registry. SageMaker only accepts
# letters, digits and single/multiple hyphens between them here, so the name
# must not contain spaces.
model_package_group_name = "Inference-Pipe"
prep_pipeline_name = "DataPrepPipe"
inf_pipeline_name = "InferencePipe"  # SageMaker Pipeline name

In [None]:
# Pipeline parameters: each can be overridden per execution; the defaults
# below apply otherwise.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.m5.xlarge")

max_automl_runtime = ParameterInteger(
    name="MaxAutoMLRuntime", default_value=3600
)  # max. AutoML training runtime: 1 hour
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
# Threshold compared against the evaluation metric by the conditional
# registration step and listed in the pipeline's parameters.
# NOTE(review): 0.5 is a placeholder default — confirm the intended value.
model_registration_metric_threshold = ParameterFloat(
    name="ModelRegistrationMetricThreshold", default_value=0.5
)
s3_bucket = ParameterString(name="S3Bucket", default_value=pipeline_session.default_bucket())
target_attribute_name = ParameterString(name="TargetAttributeName", default_value="target")

In [None]:
# scikit-learn processor for the data-preparation job; instance type and
# count are taken from the pipeline parameters defined above.
sklearn_processor = SKLearnProcessor(
    base_job_name="data-prep-pipeline-job",
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    role=role,
    tags=tags,
)

In [None]:
# Entry-point script for the scikit-learn estimator.
script_path = "processor_script.py"
# S3 URIs always use "/" as separator, so format the URI directly instead of
# using os.path.join, which inserts the local OS path separator.
model_output_path = f"s3://{bucket}/{folder_name}/components"


In [None]:
# scikit-learn estimator that runs the entry-point script as a training job;
# transformers.py is shipped alongside it as an extra dependency and the
# resulting artifact is written below model_output_path.
sklearn_transformer = SKLearn(
    entry_point=script_path,
    dependencies=["transformers.py"],
    framework_version="1.0-1",
    py_version="py3",
    instance_type="ml.m5.large",
    output_path=model_output_path,
    role=role,
    sagemaker_session=None,
    tags=tags,
)

In [None]:
# Launch the training job with the training CSV on the "train" channel.
sklearn_transformer.fit(inputs={"train": train_input})

In [None]:
# S3 object key of the model artifact produced by the training job above.
# Keys always use "/" as separator, so join with "/" explicitly rather than
# with os.path.join, which uses the local OS path separator.
transformer_prefix = "/".join([
    folder_name,
    "components",
    sklearn_transformer.latest_training_job.job_name,
    "output",
    "model.tar.gz",
])

# Download the artifact into the current working directory.
session.download_data(path='./', bucket=bucket, key_prefix=transformer_prefix)

## Batch transform

In [None]:
# Output location for the batch transform results. S3 URIs always use "/",
# so format the URI directly instead of relying on os.path.join, which uses
# the local OS path separator.
transformer_output = f"s3://{bucket}/{folder_name}/Feature_selection_output/"
# Batch transformer backed by the fitted scikit-learn estimator; results are
# assembled line by line and returned as CSV.
transformer = sklearn_transformer.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=transformer_output,
    assemble_with="Line",
    accept="text/csv",
    role=role,
    tags=tags
)

In [None]:
# Run the batch transform over the training CSV and wait for it to finish;
# keep the output location for later use.
transformer.transform(data=train_input, content_type="text/csv")
print(f"Waiting for transform job: {transformer.latest_transform_job.job_name}")
transformer.wait()
preprocessed_train = transformer.output_path

# Batch Transform Step

In [None]:
# Pipeline batch-transform step that scores data with the created model.
# NOTE(review): step_create_model is only assigned in a later cell
# ("ModelCreationStep"), and the same step is defined again after it, so on a
# fresh kernel this cell cannot run in its current position.
# NOTE(review): s3_x_test is not defined in any cell visible here — confirm
# where the S3 URI of the test features is meant to come from.
transformer = Transformer(
    model_name=step_create_model.properties.ModelName,
    # The notebook defines its instance settings as processing_instance_*;
    # the bare instance_count / instance_type names did not exist.
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    # NOTE(review): output_prefix was undefined; folder_name is used because
    # the other outputs of this notebook are written below it — confirm.
    output_path=Join(on="/", values=["s3:/", s3_bucket, folder_name, "transform"]),
    sagemaker_session=pipeline_session,
)
step_batch_transform = TransformStep(
    name="BatchTransformStep",
    step_args=transformer.transform(data=s3_x_test, content_type="text/csv"),
)

# AutoML Training Step

In [None]:
# AutoML job definition. Because it is bound to the pipeline session, fit()
# yields step arguments for the AutoML pipeline step defined afterwards.
# NOTE(review): s3_train_val is not defined in any cell visible here —
# confirm which S3 URI holds the training data.
automl = AutoML(
    # only ensembling mode is supported for native AutoML step integration in SageMaker Pipelines
    mode="ENSEMBLING",
    role=role,
    sagemaker_session=pipeline_session,
    target_attribute_name=target_attribute_name,
    total_job_runtime_in_seconds=max_automl_runtime,
)
train_args = automl.fit(
    inputs=[
        AutoMLInput(
            channel_type="training",
            inputs=s3_train_val,
            target_attribute_name=target_attribute_name,
        )
    ]
)

In [None]:
# Wrap the AutoML fit arguments in a pipeline step.
step_auto_ml_training = AutoMLStep(step_args=train_args, name="AutoMLTrainingStep")

In [None]:
# Take the best candidate of the AutoML step and register its creation as a
# pipeline model step.
best_auto_ml_model = step_auto_ml_training.get_best_auto_ml_model(
    sagemaker_session=pipeline_session,
    role=role,
)
step_args_create_model = best_auto_ml_model.create(
    instance_type=processing_instance_type,
)
step_create_model = ModelStep(
    step_args=step_args_create_model,
    name="ModelCreationStep",
)

# Batch Transform Step

In [None]:
# Pipeline batch-transform step that scores data with the model created by
# the "ModelCreationStep" step.
# NOTE(review): s3_x_test is not defined in any cell visible here — confirm
# where the S3 URI of the test features is meant to come from.
transformer = Transformer(
    model_name=step_create_model.properties.ModelName,
    # The notebook defines its instance settings as processing_instance_*;
    # the bare instance_count / instance_type names did not exist.
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    # NOTE(review): output_prefix was undefined; folder_name is used because
    # the other outputs of this notebook are written below it — confirm.
    output_path=Join(on="/", values=["s3:/", s3_bucket, folder_name, "transform"]),
    sagemaker_session=pipeline_session,
)
step_batch_transform = TransformStep(
    name="BatchTransformStep",
    step_args=transformer.transform(data=s3_x_test, content_type="text/csv"),
)

# Evaluation Step

Adapted from amazon-sagemaker-examples/sagemaker-pipelines/tabular/automl-step/sagemaker_autopilot_pipelines_native_auto_ml_step.ipynb

In [None]:
# Property file exposing evaluation_metrics.json from the "evaluation_metrics"
# processing output so that later steps can read values from it.
evaluation_report = PropertyFile(
    name="evaluation",
    output_name="evaluation_metrics",
    path="evaluation_metrics.json",
)

In [None]:
# Evaluation step: runs evaluation.py on the batch-transform predictions and
# the true labels, and publishes the metrics file via evaluation_report.
# NOTE(review): s3_y_test is not defined in any cell visible here — confirm
# where the S3 URI of the true labels is meant to come from.
sklearn_processor = SKLearnProcessor(
    # The execution role is stored in `role` throughout this notebook.
    role=role,
    framework_version="1.0-1",
    # The notebook defines its instance settings as processing_instance_*;
    # the bare instance_count / instance_type names did not exist.
    instance_count=processing_instance_count,
    instance_type=processing_instance_type.default_value,
    sagemaker_session=pipeline_session,
)
step_args_sklearn_processor = sklearn_processor.run(
    inputs=[
        ProcessingInput(
            source=step_batch_transform.properties.TransformOutput.S3OutputPath,
            destination="/opt/ml/processing/input/predictions",
        ),
        ProcessingInput(source=s3_y_test, destination="/opt/ml/processing/input/true_labels"),
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation_metrics",
            source="/opt/ml/processing/evaluation",
            # NOTE(review): output_prefix was undefined; folder_name is used
            # because the other outputs of this notebook are written below
            # it — confirm.
            destination=Join(on="/", values=["s3:/", s3_bucket, folder_name, "evaluation"]),
        ),
    ],
    code="evaluation.py",
)
step_evaluation = ProcessingStep(
    name="ModelEvaluationStep",
    step_args=step_args_sklearn_processor,
    property_files=[evaluation_report],
)

My own addition (not taken from the example notebook): this assumes the XGBoost algorithm is used.

In [None]:
# Look up the XGBoost 1.2-2 container image for the current region.
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    version="1.2-2",
    py_version="py3",
    region=region,
    instance_type="ml.m5.xlarge",
)

In [None]:
# Script processor that runs a python3 script inside the XGBoost image;
# instance settings come from the pipeline parameters.
evaluate_model_processor = ScriptProcessor(
    base_job_name="pipeline-trial-job",
    command=["python3"],
    image_uri=image_uri,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    role=role,
)

# Create a PropertyFile
# A PropertyFile is used to be able to reference outputs from a processing step, for instance to use in a condition step.
# For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
# It gets its own variable name so that it does not replace the
# `evaluation_report` attached to the "ModelEvaluationStep" step, which the
# conditional registration step reads from.
xgb_evaluation_report = PropertyFile(
    name="EvaluationReport", output_name="evaluation", path="evaluation.json"
)

# Use the evaluate_model_processor in a Sagemaker pipelines ProcessingStep.
# NOTE(review): step_train_model is not defined in any cell visible here —
# this step needs a training step that produces the model artifact.
step_evaluate_model = ProcessingStep(
    name="evaluate_model",
    processor=evaluate_model_processor,
    inputs=[
        ProcessingInput(
            source=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            # Use pre-created test data instead of output from processing step.
            # NOTE(review): test_data_uri was undefined; test_input is the
            # pre-created test CSV defined in this notebook — confirm.
            source=test_input,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
            # One report folder per pipeline execution.
            destination=Join(
                on="/",
                values=[
                    "s3://{}".format(bucket),
                    prefix,
                    ExecutionVariables.PIPELINE_EXECUTION_ID,
                    "evaluation-report",
                ],
            ),
        ),
    ],
    code="evaluate.py",
    property_files=[xgb_evaluation_report],
)

# Model metrics pointing at evaluation.json inside the first output of the
# XGBoost evaluation step. The step variable is step_evaluate_model; the
# name step_evaluate did not exist.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=Join(
            on='/',
            values=[
                step_evaluate_model.arguments["ProcessingOutputConfig"]["Outputs"][0]['S3Output']['S3Uri'],
                'evaluation.json']
        ),
        content_type='application/json')
)

# Conditional Registration Step

In [None]:
# Model-quality and explainability reports produced by the AutoML step for
# its best candidate; they are attached to the registered model package.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ModelInsightsJsonReportPath,
        content_type="application/json",
    ),
    explainability=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ExplainabilityJsonReportPath,
        content_type="application/json",
    ),
)
# Register the best AutoML model in the model package group.
step_args_register_model = best_auto_ml_model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    # The notebook defines its instance type as processing_instance_type;
    # the bare instance_type name did not exist.
    inference_instances=[processing_instance_type],
    transform_instances=[processing_instance_type],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)
step_register_model = ModelStep(name="ModelRegistrationStep", step_args=step_args_register_model)

In [None]:
# Register the model only when the weighted F1 value read from the
# evaluation property file is at least the configured threshold; otherwise
# the pipeline ends without registering.
step_conditional_registration = ConditionStep(
    name="ConditionalRegistrationStep",
    conditions=[
        ConditionGreaterThanOrEqualTo(
            right=model_registration_metric_threshold,
            left=JsonGet(
                json_path="classification_metrics.weighted_f1.value",
                property_file=evaluation_report,
                step_name=step_evaluation.name,
            ),
        )
    ],
    else_steps=[],  # pipeline end
    if_steps=[step_register_model],
)

# Define and Run Pipe

In [None]:
# Assemble the AutoML training pipeline from the steps defined above.
pipeline = Pipeline(
    name="AutoMLTrainingPipeline",
    # Only pipeline Parameter objects belong in this list:
    # - the instance settings are defined as processing_instance_* in this
    #   notebook (the bare instance_count / instance_type names did not exist);
    # - model_package_group_name is a plain string constant, not a pipeline
    #   parameter, so it is not listed.
    parameters=[
        processing_instance_count,
        processing_instance_type,
        max_automl_runtime,
        model_approval_status,
        model_registration_metric_threshold,
        s3_bucket,
        target_attribute_name,
    ],
    steps=[
        step_auto_ml_training,
        step_create_model,
        step_batch_transform,
        step_evaluation,
        step_conditional_registration,
    ],
    sagemaker_session=pipeline_session,
)

In [None]:
# Create or update the pipeline definition, then start one execution of it.
pipeline.upsert(tags=tags, role_arn=role)

pipeline.start(execution_display_name="AutoMLPipe1")