# Load libraries

In [3]:
import os
from time import gmtime, strftime, sleep
import json

import boto3
import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat
)

from sagemaker import (
    AutoML,
    AutoMLInput,
    # get_execution_role,
    # MetricsSource,
    # ModelMetrics,
    # ModelPackage,
)

# from sagemaker.s3 import s3_path_join, S3Downloader, S3Uploader
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.processing import Processor, ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.inputs import TrainingInput
from sagemaker.transformer import Transformer
from sagemaker.pipeline import PipelineModel
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, TransformStep, CacheConfig
from sagemaker.workflow.automl_step import AutoMLStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.functions import Join
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo, ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.properties import PropertyFile


# Define session variables

In [4]:
# Two sessions: a regular one for account/bucket lookups, and a PipelineSession
# that is passed to the processors, estimators and models used as pipeline steps.
session = sagemaker.session.Session()
pipe_session = PipelineSession()

# Environment details resolved from the session and the notebook's execution role.
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
account_id = session.account_id()

# Names used when creating the pipeline and registering model packages.
pipeline_name = "FileFormatExample"
model_package_group_name = "ProcessorEstimator"

# Local scripts: the preprocessor entry point and the module it depends on.
preprocessor_script = "processor_model.py"
transformer_script = "transformers.py"

# Full URI of the custom container image; `image_name` is used as the image tag.
image_name = "fit-fraudml-sagemaker-collab-20230511151434-35-97f3a42"
ecr_repo = "{}.dkr.ecr.{}.amazonaws.com/frm-svcs:{}".format(account_id, region, image_name)

# UTC timestamp (minute resolution) that makes the S3 prefix unique per notebook run.
timestamp_suffix = strftime("%Y-%m-%d-%H-%M", gmtime())
prefix = f"experiment_{timestamp_suffix}"

# Tags attached to the pipeline when it is upserted.
tags = [{"Key": "business_use", "Value": "sample"}]

# Define Pipeline parameters

In [4]:
# image_name = 'fit-fraudml-sagemaker-collab-20230420153235-31-b24c974'
# ecr_repo = f'{account_id}.dkr.ecr.{region}.amazonaws.com/frm-svcs:{image_name}'

# --- Script arguments -------------------------------------------------------
# These are forwarded to the processing scripts as command-line arguments,
# which is why the numeric ones are declared as strings.

# Passed as --sample-size to create_feats.py and create_gt.py.
sample_size_param = ParameterString(
    name="SampleSize",
    default_value="10000",
)

# Passed as --group to create_feats.py.
group_filter = ParameterString(
    name="Group",
    default_value="second",
)

# Target column name: passed as --target to the scripts and used by AutoML.
target_col = ParameterString(
    name="Target",
    default_value="target",
)

# Passed as --train-size to processor_script.py.
train_size = ParameterString(
    name="TrainSize",
    default_value="0.8",
)

# Passed as --file-format to processor_script.py.
file_format = ParameterString(
    name="FileFormat",
    default_value="csv",
)

# --- Infrastructure ---------------------------------------------------------
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)

processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge",
)

training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)

# NOTE: when the SKLearn processors/estimators are constructed, the SDK warns
# that it resolves the container image from this parameter's default value.
framework_version = ParameterString(
    name="FrameworkVersion",
    default_value="1.2-1",
)

# --- Training and registration ----------------------------------------------
# Total AutoML job runtime in seconds (default: 1 hour).
max_automl_runtime = ParameterInteger(
    name="MaxAutoMLRuntime",
    default_value=3600,
)

model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

# Compared against the evaluation accuracy in the conditional registration step.
model_registration_metric_threshold = ParameterFloat(
    name="ModelRegistrationMetricThreshold",
    default_value=0.1,
)

# Step caching shared by the pipeline steps; 'PT12H' is an ISO 8601 duration (12 hours).
step_cache_config = CacheConfig(
    enable_caching=True,
    expire_after="PT12H",
)

## Create Sample Features

In [5]:
# Processor that runs create_feats.py in the SageMaker scikit-learn container.
# The SDK warns on construction that it resolves the container image from the
# default values of `framework_version` and `processing_instance_type`.
sklearn_feats_processor = SKLearnProcessor(
    framework_version=framework_version,
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    sagemaker_session=pipe_session,
    base_job_name="sample-pipeline-job",
)

# Generates the sample feature data and publishes it as the "features" output.
step_create_feats = ProcessingStep(
    name="create_feats",
    processor=sklearn_feats_processor,
    outputs=[
        ProcessingOutput(
            output_name="features",
            source="/opt/ml/processing/output",
            # Write to a dedicated "features" sub-prefix. The ground-truth step
            # also writes under <prefix>/sample_data, and a downstream
            # ProcessingInput downloads everything under the S3 prefix it is
            # given, so sharing one prefix would mix both steps' files in the
            # consumer's input directory.
            destination=Join(
                on="/",
                values=[
                    "s3://{}".format(bucket),
                    prefix,
                    "sample_data",
                    "features",
                ],
            ),
        ),
    ],
    job_arguments=["--sample-size", sample_size_param, "--group", group_filter],
    code="create_feats.py",
    cache_config=step_cache_config,
)

The input argument version of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.
The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


## Create Sample Ground Truth

In [6]:
# Processor that runs create_gt.py in the SageMaker scikit-learn container.
# The SDK warns on construction that it resolves the container image from the
# default values of `framework_version` and `processing_instance_type`.
sklearn_gt_processor = SKLearnProcessor(
    framework_version=framework_version,
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    sagemaker_session=pipe_session,
    base_job_name="sample-pipeline-job",
)

# Generates the sample ground-truth data and publishes it as the "ground_truth" output.
step_create_gt = ProcessingStep(
    name="create_gt",
    processor=sklearn_gt_processor,
    outputs=[
        ProcessingOutput(
            output_name="ground_truth",
            source="/opt/ml/processing/output",
            # Write to a dedicated "ground_truth" sub-prefix. The feature step
            # also writes under <prefix>/sample_data, and a downstream
            # ProcessingInput downloads everything under the S3 prefix it is
            # given, so sharing one prefix would mix both steps' files in the
            # consumer's input directory.
            destination=Join(
                on="/",
                values=[
                    "s3://{}".format(bucket),
                    prefix,
                    "sample_data",
                    "ground_truth",
                ],
            ),
        ),
    ],
    job_arguments=["--sample-size", sample_size_param, "--target", target_col],
    code="create_gt.py",
    cache_config=step_cache_config,
)

The input argument version of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.
The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


## Split Data, Train Preprocessor, Save Processed Data

In [7]:
# Processor that runs processor_script.py inside the custom container image.
pre_processor = ScriptProcessor(
    command=['python3'],
    image_uri=ecr_repo,
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    # Use the shared PipelineSession, like the other pipeline processors in
    # this notebook, instead of letting the SDK create a separate default session.
    sagemaker_session=pipe_session)

# (output name, path segments) for every output of the step. The segments are
# appended to the container directory /opt/ml/processing/output and to the S3
# destination s3://<bucket>/<prefix>, so both sides always stay in sync.
preprocessor_output_paths = [
    ("encoder", ["encoder"]),
    ("encoder_cols", ["encoder_cols"]),
    ("train", ["train"]),
    ("validate", ["validate"]),
    ("test_x", ["test", "feats"]),
    ("test_y", ["test", "target"]),
]

# Consumes the sample features and ground truth and publishes the encoder
# artifacts plus the train / validate / test outputs listed above.
step_create_preprocessor = ProcessingStep(
    name="create_preprocessor",
    processor=pre_processor,
    code='processor_script.py',
    inputs=[
        ProcessingInput(
            source=step_create_feats.properties.ProcessingOutputConfig.Outputs['features'].S3Output.S3Uri,
            destination='/opt/ml/processing/input/data/feats'),
        ProcessingInput(
            source=step_create_gt.properties.ProcessingOutputConfig.Outputs['ground_truth'].S3Output.S3Uri,
            destination='/opt/ml/processing/input/data/gt')],
    outputs=[
        ProcessingOutput(
            output_name=output_name,
            source="/".join(["/opt/ml/processing/output", *path_segments]),
            destination=Join(
                on="/",
                values=["s3://{}".format(bucket), prefix, *path_segments]))
        for output_name, path_segments in preprocessor_output_paths],
    job_arguments=[
        '--target', target_col,
        "--train-size", train_size,
        '--file-format', file_format],
    cache_config=step_cache_config
)

## Train Preprocessor

In [8]:
# Estimator for the preprocessing model. The entry point comes from
# `preprocessor_script`, the same constant the inference-time SKLearnModel
# uses, so training and inference cannot drift to different scripts.
# Per the SDK warning emitted on construction, the instance type must not
# resolve to a GPU type at execution time (unsupported for Scikit-Learn).
processor_model = SKLearn(
    entry_point=preprocessor_script,
    role=role,
    instance_type=training_instance_type,
    framework_version=framework_version,
    sagemaker_session=pipe_session,
    dependencies=[transformer_script])

# Training step fed by the "encoder" output of the preprocessing step.
step_train_preprocessor_model = TrainingStep(
    name="PreprocessModel",
    estimator=processor_model,
    inputs={
        'input_model': TrainingInput(
            s3_data=step_create_preprocessor.properties.ProcessingOutputConfig.Outputs['encoder'].S3Output.S3Uri,
            content_type='text/csv')},
    cache_config=step_cache_config
)

instance_type is a PipelineVariable (<class 'sagemaker.workflow.parameters.ParameterString'>). Its interpreted value in execution time should not be of GPU types since GPU training is not supported for Scikit-Learn.
The input argument version of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.
The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


## Train AutoML Model

In [9]:
# AutoML job definition.
# Optional settings currently left at their defaults: problem_type,
# job_objective (format {"MetricName": str}), max_candidates,
# max_runtime_per_training_job_in_seconds and feature_specification_s3_uri.
automl = AutoML(
    role=role,
    target_attribute_name=target_col,
    sagemaker_session=pipe_session,
    # Total runtime of the AutoML job, in seconds.
    total_job_runtime_in_seconds=max_automl_runtime,
    # Only ensembling mode is supported for native AutoML step integration
    # in SageMaker Pipelines.
    mode="ENSEMBLING",
)

# One AutoML channel per preprocessing output: (output name, channel type).
preprocessor_outputs = step_create_preprocessor.properties.ProcessingOutputConfig.Outputs
automl_channels = [
    AutoMLInput(
        inputs=preprocessor_outputs[output_name].S3Output.S3Uri,
        target_attribute_name=target_col,
        channel_type=channel_type,
    )
    for output_name, channel_type in (("train", "training"), ("validate", "validation"))
]

# With a PipelineSession, fit() produces the step arguments consumed by AutoMLStep.
train_args = automl.fit(inputs=automl_channels)

step_auto_ml_training = AutoMLStep(
    name="AutoMLTrainingStep",
    step_args=train_args,
    cache_config=step_cache_config,
)

# Model object for the best candidate found by the AutoML step.
best_auto_ml_model = step_auto_ml_training.get_best_auto_ml_model(
    role, sagemaker_session=pipe_session
)

# Pipeline step that creates the SageMaker model from the best candidate.
step_args_create_model = best_auto_ml_model.create(
    instance_type=processing_instance_type
)
step_create_AutoMLmodel = ModelStep(
    name="ModelCreationStep",
    step_args=step_args_create_model,
)



## Batch Transform Step

In [10]:
# Batch transformer bound to the model created by the AutoML model step.
# Output is written to s3://<bucket>/<prefix>/transform as CSV, one record per line.
transformer = Transformer(
    model_name=step_create_AutoMLmodel.properties.ModelName,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    output_path=Join(on="/", values=["s3:/", bucket, prefix, "transform"]),
    strategy="MultiRecord",
    assemble_with="Line",
    accept="text/csv",
    sagemaker_session=pipe_session,
)

# Score the held-out test features ("test_x" output of the preprocessing step).
transform_args = transformer.transform(
    data=step_create_preprocessor.properties.ProcessingOutputConfig.Outputs["test_x"].S3Output.S3Uri,
    content_type="text/csv",
    split_type="Line",
)

step_batch_transform = TransformStep(
    name="BatchTransformStep",
    step_args=transform_args,
    cache_config=step_cache_config,
)

## Evaluate AutoML Model

**TODO:** This section still needs to be updated. The evaluation script has to read the output of the batch transform step, so that output location must be known and referenced here. The evaluation script itself also needs to change, because in this workflow the predictions have already been produced by the transform step.

In [11]:
# Processor that runs the evaluation script in the SageMaker scikit-learn container.
# The SDK warns on construction that it resolves the container image from the
# default values of `framework_version` and `processing_instance_type`.
eval_processor = SKLearnProcessor(
    framework_version=framework_version,
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    # Use the shared PipelineSession, like the other SKLearn processors in this
    # notebook, instead of letting the SDK create a separate default session.
    sagemaker_session=pipe_session)

The input argument version of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.
The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


In [12]:
# Property file exposing evaluation.json (from the "evaluation_metrics" output)
# so that later steps can read values from it with JsonGet.
evaluation_report = PropertyFile(
    name="evaluation",
    output_name="evaluation_metrics",
    path="evaluation.json",
)

# Inputs: the batch-transform predictions and the held-out true labels.
evaluation_inputs = [
    ProcessingInput(
        source=step_batch_transform.properties.TransformOutput.S3OutputPath,
        destination="/opt/ml/processing/input/predictions",
    ),
    ProcessingInput(
        source=step_create_preprocessor.properties.ProcessingOutputConfig.Outputs["test_y"].S3Output.S3Uri,
        destination="/opt/ml/processing/input/true_labels",
    ),
]

# Single output: the evaluation directory, uploaded to s3://<bucket>/<prefix>/evaluation.
evaluation_outputs = [
    ProcessingOutput(
        output_name="evaluation_metrics",
        source="/opt/ml/processing/evaluation",
        destination=Join(on="/", values=["s3:/", bucket, prefix, "evaluation"]),
    ),
]

step_evaluation = ProcessingStep(
    name="ModelEvaluationStep",
    processor=eval_processor,
    code="evaluate.py",
    inputs=evaluation_inputs,
    outputs=evaluation_outputs,
    property_files=[evaluation_report],
    cache_config=step_cache_config,
)

# S3 location of evaluation.json, derived from the step's first (and only) output.
evaluation_output_uri = step_evaluation.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
evaluation_json_uri = Join(on="/", values=[evaluation_output_uri, "evaluation.json"])

# Metrics attached to the registered model: evaluation statistics from this
# step and the explainability report of the best AutoML candidate.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=evaluation_json_uri,
        content_type="application/json",
    ),
    explainability=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ExplainabilityJsonReportPath,
        content_type="application/json",
    ),
)

In [13]:
# model_metrics = ModelMetrics(
#     model_statistics=MetricsSource(
#         s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ModelInsightsJsonReportPath,
#         content_type="application/json",
#     ),
#     explainability=MetricsSource(
#         s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ExplainabilityJsonReportPath,
#         content_type="application/json",
#     ),
# )

## Create Inference Pipeline

In [14]:
# Inference-time model for the preprocessor, built from the artifacts produced
# by the preprocessor training step and served with the same entry-point script.
preprocess_model = SKLearnModel(
    name="PreprocessModel",
    model_data=step_train_preprocessor_model.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    sagemaker_session=pipe_session,
    entry_point=preprocessor_script,
    dependencies=[transformer_script],
    framework_version=framework_version,
)

In [15]:
# Timestamped names for the inference pipeline model and its endpoint.
model_name = f"inference-pipeline-{timestamp_suffix}"
endpoint_name = f"inference-pipeline-ep-{timestamp_suffix}"

# Serial inference pipeline: the preprocessor first, then the best AutoML model.
inference_models = [preprocess_model, best_auto_ml_model]
pipe_model = PipelineModel(
    models=inference_models,
    role=role,
    sagemaker_session=pipe_session,
)

## Registration Step

In [16]:
# Registers the inference pipeline model in the model package group, together
# with the model metrics and the requested approval status.
step_register_model = RegisterModel(
    name=model_name,
    model=pipe_model,
    content_types=["text/csv"] * 2,
    response_types=["text/csv"] * 2,
    inference_instances=[processing_instance_type] * 2,
    transform_instances=[processing_instance_type] * 2,
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)

# # step_register_model = ModelStep(
# #     name="ModelRegistrationStep",
# #     step_args=step_args_register_model)

The input argument version of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.
The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


In [17]:
# Accuracy value read from the evaluation report written by the evaluation step.
evaluation_accuracy = JsonGet(
    step_name=step_evaluation.name,
    property_file=evaluation_report,
    json_path="binary_classification_metrics.accuracy.value",
)

# Register only when accuracy >= the ModelRegistrationMetricThreshold parameter.
accuracy_meets_threshold = ConditionGreaterThanOrEqualTo(
    left=evaluation_accuracy,
    right=model_registration_metric_threshold,
)

step_conditional_registration = ConditionStep(
    name="ConditionalRegistrationStep",
    conditions=[accuracy_meets_threshold],
    if_steps=[step_register_model],
    else_steps=[],  # nothing to do: the pipeline ends here
)

## Define Pipeline

In [18]:
# Assemble the pipeline definition.
pipeline = Pipeline(
    name=pipeline_name,
    # Only pipeline Parameter objects belong here. `step_cache_config` is a
    # CacheConfig that is passed to the individual steps via `cache_config=`,
    # not an execution parameter, so it is not listed.
    parameters=[
        sample_size_param,
        group_filter,
        target_col,
        train_size,
        file_format,
        processing_instance_count,
        processing_instance_type,
        training_instance_type,
        framework_version,
        max_automl_runtime,
        model_approval_status,
        model_registration_metric_threshold,
    ],
    # `step_register_model` is not listed directly: it is the `if_steps` branch
    # of `step_conditional_registration`.
    steps=[
        step_create_feats,
        step_create_gt,
        step_create_preprocessor,
        step_train_preprocessor_model,
        step_auto_ml_training,
        step_create_AutoMLmodel,
        step_batch_transform,
        step_evaluation,
        step_conditional_registration,
    ],
    sagemaker_session=pipe_session)

In [19]:
# Create the pipeline, or update its definition if it already exists.
pipeline.upsert(role_arn=role, tags=tags)

# Parameter overrides for this execution; any parameter not listed here keeps
# its default value.
execution_parameters = {
    "ProcessingInstanceType": "ml.m5.2xlarge",
    "TrainingInstanceType": "ml.m5.2xlarge",
    "MaxAutoMLRuntime": 3600,
    "SampleSize": "10000000",
}

# Last expression of the cell, so the returned execution object is displayed.
pipeline.start(
    execution_display_name="SamplePipe-10MBaseline",
    parameters=execution_parameters,
)



_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:707031497630:pipeline/fileformatexample/execution/w4odyhnuechc', sagemaker_session=<sagemaker.workflow.pipeline_context.PipelineSession object at 0x7fb172bbc350>)