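# Pipeline

Builds and runs a SageMaker Pipeline (ETL → preprocess → train → evaluate → register model) on the UCI "tic-mld" insurance data set.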

In [None]:
# Resource tags in the AWS {"Key": ..., "Value": ...} format; they are passed
# to `pipeline.upsert(..., tags=tags)` when the pipeline is created/updated.
tags = [
    {"Key": tag_key, "Value": tag_value}
    for tag_key, tag_value in (
        ("PLATFORM", "FO-ML"),
        ("BUSINESS_REGION", "GLOBAL"),
        ("BUSINESS_UNIT", "MOBILITY"),
        ("CLIENT", "MULTI_TENANT"),
    )
]

In [None]:
# tags = [
#     {"Key": "DATASET", "Value": "InsCOIL"},
#     {"Key": "SOURCE", "Value": "UCI"}
#    ]

## Load libraries

In [None]:
import pandas as pd

import sagemaker
import sagemaker.session
import boto3

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString)

from sagemaker.workflow.functions import Join
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, TuningStep
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.pipeline import Pipeline

from sagemaker.inputs import TrainingInput
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter
from sagemaker import ModelPackage

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.estimator import Estimator

from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker.model_metrics import MetricsSource, ModelMetrics


## Set AWS parameters

In [None]:
# SageMaker session: provides the default S3 bucket and the AWS region.
session = sagemaker.session.Session()
bucket = session.default_bucket()
region = session.boto_region_name

# IAM execution role handed to every processor/estimator and to the pipeline.
role = sagemaker.get_execution_role()

# Low-level boto3 SageMaker client (used for direct API calls such as
# `create_auto_ml_job`).
sm = boto3.Session().client(region_name=region, service_name="sagemaker")

# Key prefix inside `bucket` under which all notebook artefacts are stored.
prefix = '1_ins_dataset/raw'

## Load data to S3

In [None]:
# Source files of the UCI "tic-mld" data set.
train_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticdata2000.txt'
test_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticeval2000.txt'
gt_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/tictgts2000.txt'
cols_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/dictionary.txt'

# The three data files are read without a header row (columns get integer
# labels); the dictionary file is read with pandas' default header handling
# and needs latin-1 decoding.
train = pd.read_table(train_uri, header=None)
test = pd.read_table(test_uri, header=None)
ground_truth = pd.read_table(gt_uri, header=None)
columns = pd.read_table(cols_uri, encoding='latin-1')

# Store every frame as CSV (without the index column) directly under
# s3://<bucket>/<prefix>/.
raw_s3_prefix = f's3://{bucket}/{prefix}'
for raw_frame, csv_name in (
    (train, 'train.csv'),
    (test, 'test.csv'),
    (ground_truth, 'gt.csv'),
    (columns, 'col_info.csv'),
):
    raw_frame.to_csv(f'{raw_s3_prefix}/{csv_name}', index=False)

## Pipeline parameters

In [None]:
# Fixed settings (not overridable per execution).
pipeline_name = "InsExample"  # SageMaker Pipeline name
model_package_group_name = "Insurance-Co-Example"  # Model name in model registry
framework_version = "0.23-1"  # version handed to SKLearnProcessor

# Pipeline parameters: each one has a default and can be overridden when an
# execution is started.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.t3.medium",
)
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)
model_approval_status = ParameterString(
    name='ModelApprovalStatus',
    default_value='PendingManualApproval',
)

# Hyperparameter-tuning budget (consumed by `xgb_tuner`).
max_training_jobs = ParameterInteger(
    name='MaximumTrainingJobs',
    default_value=1,
)
max_parallel_training_jobs = ParameterInteger(
    name='MaxParallelTrainingJobs',
    default_value=1,
)

## Processors and Estimators

In [None]:
# scikit-learn processor shared by the ETL and preprocess steps.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="ins-example-job",
)

# Container image of the built-in XGBoost algorithm; used both for training
# and for the evaluation ScriptProcessor.
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    version='1.2-2',
    py_version='py3',
    region=region,
    instance_type='ml.m5.xlarge',
)

# Estimator wrapping the XGBoost container; the instance type is a pipeline
# parameter, the instance count is fixed to 1.
xgb_estimator = Estimator(
    role=role,
    image_uri=image_uri,
    instance_count=1,
    instance_type=training_instance_type,
    disable_profiler=True,
    # output_path=Join(
    #     on="/",
    #     values=[
    #         "s3://{}".format(bucket),
    #         prefix,
    #         ExecutionVariables.PIPELINE_EXECUTION_ID,
    #         "model"],
    #         )
)

# Static hyperparameters used by the plain training step; the tuner overrides
# the ones listed in its `hyperparameter_ranges`.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'num_round': 25,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# Search space explored by the hyperparameter tuner.
xgb_search_space = {
    'max_depth': IntegerParameter(1, 10),
    'eta': ContinuousParameter(0, 0.5),
    'gamma': ContinuousParameter(0, 5),
    'min_child_weight': ContinuousParameter(1, 120),
    'num_round': IntegerParameter(1, 2000),
}

# Tuner around `xgb_estimator`, optimising the "validation:f1" metric; the
# job budget comes from the two pipeline parameters.
xgb_tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_metric_name="validation:f1",
    hyperparameter_ranges=xgb_search_space,
    max_jobs=max_training_jobs,
    max_parallel_jobs=max_parallel_training_jobs,
)

# xgb_tuner = HyperparameterTuner(
#     estimator=xgb_estimator,
#     objective_metric_name="validation:auc",
#     hyperparameter_ranges={
#         'eta': ContinuousParameter(0, 0.5),
#         'alpha': ContinuousParameter(0, 1000),
#         'min_child_weight': ContinuousParameter(1, 120),
#         'max_depth': IntegerParameter(1, 10),
#         'num_round': IntegerParameter(1, 2000),
#         'subsample': ContinuousParameter(0.5, 1)
#     },
#     max_jobs=max_training_jobs,
#     max_parallel_jobs=max_parallel_training_jobs)

# Script processor that runs the evaluation script with python3 inside the
# same XGBoost image that is used for training.
evaluate_model_processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=["python3"],
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="ins-example-job",
)

# Property file exposing `evaluation.json` from the processing output named
# 'evaluation' to the pipeline.
evaluation_report = PropertyFile(
    name='EvaluationReport',
    path='evaluation.json',
    output_name='evaluation',
)

## ETL step

In [None]:
# Raw input location. The raw CSVs (train.csv, test.csv, gt.csv, col_info.csv)
# are uploaded directly under s3://<bucket>/<prefix>/, so the ETL step must
# read from that prefix. The previous value appended an extra 'raw' segment
# (s3://<bucket>/<prefix>/raw), which does not contain any of those files.
# NOTE(review): the 'clean' output below lives under the same prefix, so on
# re-runs it is downloaded next to the raw files — confirm etl.py only reads
# the top-level CSVs.
input_uri = Join(on="/", values=['s3://{}'.format(bucket), prefix])

# ETL step: runs etl.py on the raw files and writes the result to
# s3://<bucket>/<prefix>/clean.
step_etl = ProcessingStep(
    name="etl",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(source=input_uri, destination="/opt/ml/processing/input")
    ],
    outputs=[
        ProcessingOutput(
            output_name="clean",
            source="/opt/ml/processing/output",
            destination=Join(
                on="/",
                values=[
                    "s3://{}".format(bucket),
                    prefix,
                    'clean'
                ]
            )
        )
    ],
    code="etl.py"
)



## Preprocess step

In [None]:
# Cleaned data set written by the ETL step (its 'clean' output prefix).
input_uri = f's3://{bucket}/{prefix}/clean/full_data.csv'

input_data = ParameterString(
    name="InputData",
    default_value=input_uri
)

# One output channel per artefact of preprocess.py. Each channel is read from
# /opt/ml/processing/output/<channel> and stored under
# s3://<bucket>/<prefix>/final/<channel>.
preprocess_outputs = [
    ProcessingOutput(
        output_name=channel,
        source=f"/opt/ml/processing/output/{channel}",
        destination=Join(
            on="/",
            values=["s3://{}".format(bucket), prefix, 'final', channel],
        ),
    )
    for channel in ("train", "validate", "test", "encoder")
]

# Preprocess step: runs preprocess.py on the cleaned data.
# The input is a pipeline parameter, not a property of `step_etl`, so
# SageMaker sees no data dependency between the two steps and would schedule
# them in parallel. `depends_on` forces this step to wait for the ETL output.
step_preprocess = ProcessingStep(
    name="preprocess_data",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input")
    ],
    outputs=preprocess_outputs,
    code="preprocess.py",
    depends_on=[step_etl.name],
)

## Train Prebuilt model

### Do one and only one of the following:

- Prebuilt Training step
- Hyperparameter tuning/training step
- AutoPilot step

In [None]:
# Feature files written by the preprocess step.
train_uri = f's3://{bucket}/{prefix}/final/train/train_feats.csv'
validate_uri = f's3://{bucket}/{prefix}/final/validate/validate_feats.csv'
test_uri = f's3://{bucket}/{prefix}/final/test/test_feats.csv'

# Data locations as pipeline parameters so they can be overridden per run.
train_data = ParameterString(
    name="TrainData",
    default_value=train_uri
)
validate_data = ParameterString(
    name="ValidateData",
    default_value=validate_uri
)
test_data = ParameterString(
    name="TestData",
    default_value=test_uri
)

# Training step using the pre-configured XGBoost estimator.
# The channels are fed from pipeline parameters rather than from
# `step_preprocess.properties`, so there is no implicit data dependency;
# `depends_on` makes training wait until the feature files have been written.
step_train = TrainingStep(
    name='train_model',
    estimator=xgb_estimator,
    inputs={
        'train': TrainingInput(
            s3_data=train_data,
            content_type='text/csv'),
        'validation': TrainingInput(
            s3_data=validate_data,
            content_type='text/csv')
    },
    depends_on=[step_preprocess.name],
)

## Hyperparameter tuning/training step

### Do one and only one of the following:

- Prebuilt Training step
- Hyperparameter tuning/training step
- AutoPilot step

In [None]:
# Feature files written by the preprocess step.
train_uri = f's3://{bucket}/{prefix}/final/train/train_feats.csv'
validate_uri = f's3://{bucket}/{prefix}/final/validate/validate_feats.csv'
test_uri = f's3://{bucket}/{prefix}/final/test/test_feats.csv'

# Data locations as pipeline parameters so they can be overridden per run.
train_data = ParameterString(
    name="TrainData",
    default_value=train_uri
)
validate_data = ParameterString(
    name="ValidateData",
    default_value=validate_uri
)
test_data = ParameterString(
    name="TestData",
    default_value=test_uri
)

# Hyperparameter-tuning step built on `xgb_tuner`.
# The channels are fed from pipeline parameters rather than from
# `step_preprocess.properties`, so there is no implicit data dependency;
# `depends_on` makes tuning wait until the feature files have been written.
step_tune = TuningStep(
    name='train-tune-model',
    tuner=xgb_tuner,
    inputs={
        'train': TrainingInput(
            s3_data=train_data,
            content_type='text/csv'),
        'validation': TrainingInput(
            s3_data=validate_data,
            content_type='text/csv')
    },
    depends_on=[step_preprocess.name],
)

## Autopilot step

In [None]:
# Launches an Autopilot (AutoML) job directly through the boto3 SageMaker
# client `sm`. Note that this call runs as soon as the cell is executed; it is
# not wrapped in a pipeline step and is not part of `pipeline`.
#
# NOTE(review): `target` is not defined in any visible cell, so this cell
# raises NameError on a fresh kernel — define the target column name first.

# Input: every S3 object whose key starts with <prefix>/train.
input_data_config = [
    {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": "s3://{}/{}/train".format(bucket, prefix),
            }
        },
        "TargetAttributeName": target,
    }
]

# Completion criterion: at most 10 candidates.
job_config = {"CompletionCriteria": {"MaxCandidates": 10}}


# Job artefacts are written under <prefix>/output.
output_data_config = {"S3OutputPath": "s3://{}/{}/output".format(bucket, prefix)}

# NOTE(review): mid-notebook import; `sleep` is not used in this cell.
from time import gmtime, strftime, sleep

# UTC timestamp with minute resolution, appended to the job name.
timestamp_suffix = strftime("%Y%m%d-%H-%M", gmtime())

auto_ml_job_name = "ins-example-" + timestamp_suffix
print("AutoMLJobName: " + auto_ml_job_name)

sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=job_config,
    # Uncomment to automatically deploy an endpoint
    # ModelDeployConfig={
    #'AutoGenerateEndpointName': True,
    #'EndpointName': 'autopilot-DEMO-housing-' + timestamp_suffix
    # },
    RoleArn=role,
)

In [None]:
# Defines an AutoML training step (`step_automl`) for the pipeline.
#
# NOTE(review): this cell cannot run on a fresh kernel as written:
#   - `AutoML`, `AutoMLInput` and `AutoMLStep` are not imported in the import
#     cell of this notebook;
#   - `execution_role`, `target_attribute_name`, `pipeline_session`,
#     `max_automl_runtime` and `s3_train_val` are not defined in any visible
#     cell (the rest of the notebook uses `role` for the execution role).
automl = AutoML(
    role=execution_role,
    target_attribute_name=target_attribute_name,
    sagemaker_session=pipeline_session,
    total_job_runtime_in_seconds=max_automl_runtime,
    mode="ENSEMBLING",  # only ensembling mode is supported for native AutoML step integration in SageMaker Pipelines
)
# The result of `fit` is handed to the step as `step_args` below.
train_args = automl.fit(
    inputs=[
        AutoMLInput(
            inputs=s3_train_val,
            target_attribute_name=target_attribute_name,
            channel_type="training",
        )
    ]
)

step_automl = AutoMLStep(
    name="AutoMLTrainingStep",
    step_args=train_args,
)

# best_auto_ml_model = step_auto_ml_training.get_best_auto_ml_model(
#     execution_role, sagemaker_session=pipeline_session
# )
# step_args_create_model = best_auto_ml_model.create(instance_type=instance_type)
# step_create_model = ModelStep(name="ModelCreationStep", step_args=step_args_create_model)

## Evaluation step

In [None]:
# Evaluation step: runs evaluate_extd.py against the trained model artefact
# and the test features, and writes the report to
# s3://<bucket>/<prefix>/<pipeline execution id>/evaluation-report.
#
# The model input is a property of `step_train`, which orders this step after
# training. The test data, however, comes from a pipeline parameter and
# creates no dependency on the step that writes it, so `depends_on` explicitly
# waits for the preprocess step as well.
step_evaluate = ProcessingStep(
    name='evaluate_model',
    processor=evaluate_model_processor,
    inputs=[
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            # source=step_tune.get_top_model_s3_uri(top_k=0, s3_bucket=bucket),
            destination="/opt/ml/processing/model"
        ),
        ProcessingInput(
            source=test_data,
            destination="/opt/ml/processing/test"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name='evaluation',
            source='/opt/ml/processing/evaluation',
            destination=Join(
                on='/',
                values=[
                    's3://{}'.format(bucket),
                    prefix,
                    ExecutionVariables.PIPELINE_EXECUTION_ID,
                    'evaluation-report']
            )
        )
    ],
    code='evaluate_extd.py',
    property_files=[evaluation_report],
    depends_on=[step_preprocess.name],
)

# Model-quality metrics attached to the registered model: the evaluation.json
# file inside the first (and only) output location of the evaluation step.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=Join(
            on='/',
            values=[
                step_evaluate.arguments["ProcessingOutputConfig"]["Outputs"][0]['S3Output']['S3Uri'],
                'evaluation.json']
        ),
        content_type='application/json')
)

## Inference pipeline step

_TODO: not implemented in this notebook yet._

## Register model step

In [None]:
# Registers the trained model in the model package group together with the
# evaluation metrics.
#
# `model_data` ties this step to `step_train`, but `model_metrics` is built
# from `step_evaluate.arguments` (not from a step property), which creates no
# dependency on the evaluation step. Without `depends_on`, registration could
# run while evaluation.json is still being produced.
step_register = RegisterModel(
    name='register-model',
    estimator=xgb_estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    # model_data=step_tune.get_top_model_s3_uri(top_k=0, s3_bucket=bucket),
    content_types=['text/csv'],
    response_types=['text/csv'],
    inference_instances=['ml.t2.medium', 'ml.m5.xlarge', 'ml.m5.large'],
    transform_instances=['ml.m5.xlarge'],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
    depends_on=[step_evaluate.name],
)

## Define pipeline

In [None]:
# Assemble the pipeline.
# Every ParameterInteger/ParameterString used by any step must be declared
# here. The two tuning-budget parameters are included so that swapping
# `step_train` for `step_tune` (which uses them through `xgb_tuner`) does not
# reference undeclared parameters; they are harmless while unused.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_type,
        processing_instance_count,
        input_data,
        training_instance_type,
        train_data,
        validate_data,
        test_data,
        model_approval_status,
        max_training_jobs,
        max_parallel_training_jobs,
    ],
    # Use exactly one of the training variants (step_train / step_tune /
    # step_automl).
    steps=[step_etl,
           step_preprocess,
           step_train,
           # step_tune,
           # step_autopilot,
           # step_automl,
           step_evaluate,
           step_register])

## Update and run pipeline

In [None]:
# Create the pipeline, or update it if it already exists, attaching the tags.
pipeline.upsert(
    role_arn=role,
    tags=tags,
)

# Start one execution with the default parameter values.
pipeline.start(execution_display_name="InsPipe1")