# Hybrid Notebook/Pipe

Code adapted from the [Custom Feature Selection example](https://github.com/aws/amazon-sagemaker-examples/blob/main/autopilot/custom-feature-selection/Feature_selection_autopilot.ipynb).

In [None]:
import os
import pandas as pd
import numpy as np
import datetime
import random

import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString
)

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.functions import Join
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.sklearn.estimator import SKLearn
# import sagemaker_containers

from sagemaker.workflow.pipeline import Pipeline

import os
from sklearn.model_selection import train_test_split
from time import gmtime, strftime, sleep
import boto3
import joblib

In [None]:
# Session-wide handles shared by every later cell.
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = 'custom_preprocessing'

# A UTC timestamp gives each notebook run its own folder in the bucket.
timestamp_suffix = strftime("%Y-%m-%d-%H%M%S", gmtime())
folder_name = prefix + '-' + timestamp_suffix
prefix_path = f's3://{bucket}/{folder_name}'

# S3 URIs always use "/" as the separator, so build the URI as a string
# instead of with os.path.join, whose separator depends on the local OS.
model_output_path = f'{prefix_path}/components'

In [None]:
# Key/Value tag pairs in the list-of-dicts form the SageMaker APIs accept.
tags = [
    {"Key": tag_key, "Value": tag_value}
    for tag_key, tag_value in (
        ("PLATFORM", "FO-ML"),
        ("BUSINESS_REGION", "GLOBAL"),
        ("BUSINESS_UNIT", "MOBILITY"),
        ("CLIENT", "MULTI_TENANT"),
    )
]

# Create Sample Data

In [None]:
# Name given to the data-generation pipeline (passed to Pipeline(name=...)).
pipeline_name = "SampleDataPipe"

### Pipeline Parameters

In [None]:
# Number of rows to generate. Kept as a string because it is forwarded to the
# processing script as a command-line argument value.
sample_size = '10000'

# Pipeline parameters that can be overridden per execution. The sample-size
# parameter is named after the CLI flag it is passed with.
sample_size_param = ParameterString(name="--sample-size", default_value=sample_size)

processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge",
)

processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)

In [None]:
# Processor that runs the data-generation script inside the SageMaker
# scikit-learn container. Instance type and count are pipeline parameters, so
# they can be changed per execution.
# NOTE(review): this uses framework 0.23-1 while the estimator further down
# uses 1.0-1 — confirm the difference is intentional.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name="sample-pipeline-job",
    # Tag the processing jobs like every other resource this notebook creates.
    tags=tags,
)

### Pipeline Steps

In [None]:
# Processing step that runs create_data.py and publishes one output per data
# split. Each split is read from /opt/ml/processing/output/<split> in the
# container and written to s3://<bucket>/<folder_name>/sample_data/<split>.
step_create_data = ProcessingStep(
    name="create_data",
    processor=sklearn_processor,
    outputs=[
        ProcessingOutput(
            output_name=split_name,
            source=f"/opt/ml/processing/output/{split_name}",
            destination=Join(
                on="/",
                values=[
                    "s3://{}".format(bucket),
                    folder_name,
                    'sample_data',
                    split_name,
                ],
            ),
        )
        for split_name in ("train", "validate", "test")
    ],
    # The sample size is a pipeline parameter, resolved when an execution starts.
    job_arguments=["--sample-size", sample_size_param],
    code="create_data.py",
)

### Pipeline Definition

In [None]:
# Assemble the pipeline: the single data-creation step plus the three
# parameters it references.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        sample_size_param,
        processing_instance_type,
        processing_instance_count
    ],
    steps=[step_create_data])

### Create and Run Pipeline

In [None]:
# Register the pipeline definition (tagged with the shared tags).
pipeline.upsert(role_arn=role, tags=tags)

# Start one execution, overriding the sample-size parameter by its name.
pipeline.start(parameters = {'--sample-size': sample_size},
               execution_display_name="SampleDataPipe")

# Train Preprocessor

In [None]:
# S3 key prefixes for the three data splits, matching the layout used by the
# pipeline outputs: <folder_name>/sample_data/<split>.
# The parts must be joined with "/" — plain string concatenation fused them
# into a single name with no separators. The file name is deliberately not
# part of the prefix, because Session.upload_data appends the uploaded file's
# name to key_prefix itself.
train_path, validate_path, test_path = (
    '/'.join([folder_name, 'sample_data', split_name])
    for split_name in ('train', 'validate', 'test')
)

In [None]:
# Upload the local sample.csv once per split prefix, in the order train,
# validate, test, and keep the S3 URI returned by each upload.
train_input, validate_input, test_input = (
    session.upload_data('sample.csv', bucket=bucket, key_prefix=split_prefix)
    for split_prefix in (train_path, validate_path, test_path)
)

In [None]:
# Training entry point; transformers.py is shipped alongside it as a dependency.
script_path = "processor_script.py"

# S3 URIs always use "/" as the separator, so build the URI as a string
# instead of with os.path.join, whose separator depends on the local OS.
model_output_path = f"s3://{bucket}/{folder_name}/components"

# Estimator that runs the preprocessing script as a training job and stores
# the resulting model artifact under model_output_path.
sklearn_transformer = SKLearn(
    entry_point=script_path,
    role=role,
    output_path=model_output_path,
    instance_type="ml.m5.large",
    sagemaker_session=None,  # None lets the SDK create its default session
    framework_version="1.0-1",
    py_version="py3",
    tags=tags,
    dependencies=['transformers.py'],
)

In [None]:
# Run the training job, passing the uploaded training CSV as the "train" channel.
sklearn_transformer.fit({"train": train_input})

# Batch Transform Data

In [None]:
# Object key of the model archive written by the training job:
# <folder_name>/components/<training job name>/output/model.tar.gz
# S3 keys always use "/", so join explicitly instead of using os.path.join,
# whose separator depends on the local OS.
transformer_prefix = "/".join(
    [
        folder_name,
        "components",
        sklearn_transformer.latest_training_job.job_name,
        "output",
        "model.tar.gz",
    ]
)

# Fetch the archive into the current working directory.
session.download_data(path='./', bucket=bucket, key_prefix=transformer_prefix)

In [None]:
# IPython shell escape: unpack the downloaded model archive in the working directory.
!tar xvzf model.tar.gz

In [None]:
# Load the saved names as a list; it is used later to label the columns of
# the transformed data.
# NOTE(review): presumably feature_names.joblib comes from the unpacked
# model.tar.gz — confirm against the training script.
feature_list = list(joblib.load("feature_names.joblib"))
print(feature_list)

In [None]:
# Load preprocessor.joblib; as the last expression in the cell, its repr is displayed.
joblib.load("preprocessor.joblib")

In [None]:
# Define a batch Transformer from the trained SKLearn estimator.
# S3 URIs always use "/", so build the URI as a string instead of with
# os.path.join, whose separator depends on the local OS. The trailing slash is
# kept because the output object key is later derived from this path.
transformer_output = f"s3://{bucket}/{folder_name}/Feature_selection_output/"
transformer = sklearn_transformer.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=transformer_output,
    assemble_with="Line",
    accept="text/csv",
    role=role,
    tags=tags,
)

In [None]:
# Run a batch transform over the uploaded training CSV and block until it is done.
transformer.transform(train_input, content_type="text/csv")
print(f"Waiting for transform job: {transformer.latest_transform_job.job_name}")
transformer.wait()

# Remember the S3 location the transform job wrote to.
preprocessed_train = transformer.output_path

## Save batch transformed data

In [None]:
from urllib.parse import urlparse

transformer_output_path = transformer.output_path
print(transformer_output_path)

# The transformed file is stored as "<input file name>.out" under the
# transformer's output path. Parse the URI to get the key, and normalise the
# slash so the result is right whether or not output_path ends with "/".
output_key_dir = urlparse(transformer_output_path).path.strip("/")
key_prefix = output_key_dir + "/sample.csv.out"

session.download_data(path="./", bucket=bucket, key_prefix=key_prefix)
df_new = pd.read_csv("sample.csv.out", header=None)

# The transform output has no header row; label the columns with the names
# loaded from feature_names.joblib.
# NOTE(review): the Autopilot job uses TargetAttributeName "target" — confirm
# that feature_list contains a column with that name.
df_new.columns = feature_list

# Write the labelled data with a header row and upload it to the prefix the
# Autopilot job reads from (<folder_name>/components/training_data_new).
# Without this upload that prefix is empty.
training_data_file = "training_data_new.csv"
df_new.to_csv(training_data_file, index=False)
training_data_uri = session.upload_data(
    training_data_file,
    bucket=bucket,
    key_prefix="/".join([folder_name, "components", "training_data_new"]),
)
print(training_data_uri)

# Set up and Kick off Autopilot Job

In [None]:
# Autopilot reads its training data from the "training_data_new" prefix and
# predicts the column named "target".
input_data_config = [
    {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": f"s3://{bucket}/{folder_name}/components/training_data_new",
            }
        },
        "TargetAttributeName": "target",
    }
]

# Location for everything the Autopilot job produces.
output_data_config = {
    "S3OutputPath": f"s3://{bucket}/{folder_name}/components/autopilot_job_output"
}

AutoML_Job_Config = {
    "CompletionCriteria": {
        # MaxCandidates is set to 50 to keep the run time short; adjust it for your use case.
        "MaxCandidates": 50,
        "MaxAutoMLJobRuntimeInSeconds": 1800,
    }
}

In [None]:
# Low-level SageMaker client bound to the session's region.
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

# Reuse the run's timestamp so the job name matches the run's S3 folder.
auto_ml_job_name = f"automl-test-{timestamp_suffix}"
print(f"AutoMLJobName: {auto_ml_job_name}")

# Submit the Autopilot job with the configuration defined above.
sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=AutoML_Job_Config,
    RoleArn=role,
    Tags=tags,
)

In [None]:
# Poll the Autopilot job, printing "<status> - <secondary status>" for each
# poll, until the job reaches Failed, Completed or Stopped.
print("JobStatus - Secondary Status")
print("------------------------------")


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_run_status = describe_response["AutoMLJobStatus"]
print(" - ".join((job_run_status, describe_response["AutoMLJobSecondaryStatus"])))

while job_run_status not in ("Failed", "Completed", "Stopped"):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response["AutoMLJobStatus"]

    print(" - ".join((job_run_status, describe_response["AutoMLJobSecondaryStatus"])))
    # Wait 30 seconds before the next status check.
    sleep(30)

## Results

In [None]:
from IPython.display import JSON

# Fetch the best candidate of the Autopilot job and summarise it.
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)["BestCandidate"]
best_candidate_name = best_candidate["CandidateName"]

# Names of the steps that produced the candidate. The previous code printed
# the objective metric name under this label.
candidate_step_names = ", ".join(
    step["CandidateStepName"] for step in best_candidate.get("CandidateSteps", [])
)

print("\n")
print("CandidateName: " + best_candidate_name)
print("CandidateSteps: " + candidate_step_names)
print(
    "FinalAutoMLJobObjectiveMetricName: "
    + best_candidate["FinalAutoMLJobObjectiveMetric"]["MetricName"]
)
print(
    "FinalAutoMLJobObjectiveMetricValue: "
    + str(best_candidate["FinalAutoMLJobObjectiveMetric"]["Value"])
)

In [None]:
# Show the full job description as the cell result.
sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

In [None]:
# List the candidates produced by the Autopilot job.
# The API returns one page per call. Ask for the largest page size (100) so
# that all candidates (MaxCandidates is 50) arrive in a single response;
# without MaxResults the service chooses the page size and may return fewer.
sm_dict = sm.list_candidates_for_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    MaxResults=100,
)

In [None]:
# Print the name of every listed candidate.
for item in sm_dict["Candidates"]:
    print(item["CandidateName"])

In [None]:
# Name of the best candidate, for comparison with the list above.
print(best_candidate["CandidateName"])

In [None]:
# For every candidate whose objective did not fail, print its name and final
# objective metric, then the image of its second inference container.
# NOTE(review): index [1] assumes each candidate has at least two inference
# containers — confirm.
for item in sm_dict["Candidates"]:
    if item['ObjectiveStatus'] != 'Failed':
        print(item["CandidateName"], item["FinalAutoMLJobObjectiveMetric"])
        print(item["InferenceContainers"][1]["Image"], "\n")

# Set up the inference pipeline

In [None]:
import time
from datetime import datetime

from botocore.exceptions import ClientError

# NOTE(review): this rebinds the name `sagemaker` from the SDK module imported
# at the top of the notebook to a boto3 client. The following cells call the
# client through this name, so the binding is kept. Re-running earlier cells
# that use `sagemaker.session` or `sagemaker.get_execution_role()` after this
# point requires `import sagemaker` again.
sagemaker = boto3.client("sagemaker")

# Model, endpoint-config and endpoint names share the run's timestamp.
pipeline_name = "pipeline-test-" + timestamp_suffix
pipeline_endpoint_config_name = "pipeline-test-endpoint-config-" + timestamp_suffix
pipeline_endpoint_name = "pipeline-test-endpoint-" + timestamp_suffix

sklearn_image = sklearn_transformer.image_uri
# Location of the packaged source code recorded on the training job. [1:-1]
# drops the first and last character (presumably the quotes around the stored
# value — confirm).
container_1_source = sklearn_transformer.latest_training_job.describe()["HyperParameters"][
    "sagemaker_submit_directory"
][1:-1]

# First container: the trained scikit-learn preprocessor.
inference_containers = [
    {
        "Image": sklearn_image,
        "ModelDataUrl": sklearn_transformer.model_data,
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": container_1_source,
            "SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv",
            # Must name the script the estimator was trained with. The
            # previous hard-coded "sklearn_feature_selection.py" is not this
            # notebook's entry point.
            "SAGEMAKER_PROGRAM": os.path.basename(script_path),
        },
    }
]

# Remaining containers: the best Autopilot candidate's inference containers.
inference_containers.extend(best_candidate["InferenceContainers"])

try:
    response = sagemaker.create_model(
        ModelName=pipeline_name, Containers=inference_containers, ExecutionRoleArn=role
    )
except ClientError as error:
    # Tolerate only a re-run against an existing model, as the endpoint cells
    # do; any other failure must surface.
    if "already exist" not in error.response["Error"]["Message"].lower():
        raise
    print("Model already exists, continuing...")

In [None]:
# Create the endpoint configuration. Only an "already exists" conflict is
# safe to ignore; any other ClientError (permissions, quota, validation, ...)
# is re-raised instead of being reported as "already exists".
try:
    response = sagemaker.create_endpoint_config(
        EndpointConfigName=pipeline_endpoint_config_name,
        ProductionVariants=[
            {
                "VariantName": "DefaultVariant",
                "ModelName": pipeline_name,
                "InitialInstanceCount": 1,
                "InstanceType": "ml.m4.xlarge",
            },
        ],
    )
    print("{}\n".format(response))

except ClientError as error:
    if "already exist" not in error.response["Error"]["Message"].lower():
        raise
    print("Endpoint config already exists, continuing...")


# Create the endpoint, with the same error-handling policy.
try:
    response = sagemaker.create_endpoint(
        EndpointName=pipeline_endpoint_name,
        EndpointConfigName=pipeline_endpoint_config_name,
    )
    print("{}\n".format(response))

except ClientError as error:
    if "already exist" not in error.response["Error"]["Message"].lower():
        raise
    print("Endpoint already exists, continuing...")


# Monitor the status until the endpoint reaches a terminal state.
endpoint_status = sagemaker.describe_endpoint(EndpointName=pipeline_endpoint_name)["EndpointStatus"]
while endpoint_status not in ("OutOfService", "InService", "Failed"):
    endpoint_status = sagemaker.describe_endpoint(EndpointName=pipeline_endpoint_name)[
        "EndpointStatus"
    ]
    print(endpoint_status)
    time.sleep(30)

In [None]:
import time
from datetime import datetime

from botocore.exceptions import ClientError

# NOTE(review): this cell repeats the model-creation cell that precedes it in
# the notebook. It is kept runnable: creating a model that already exists is
# tolerated below instead of raising.
# NOTE(review): this rebinds the name `sagemaker` from the SDK module imported
# at the top of the notebook to a boto3 client. The following cell calls the
# client through this name, so the binding is kept.
sagemaker = boto3.client("sagemaker")

# Model, endpoint-config and endpoint names share the run's timestamp.
pipeline_name = "pipeline-test-" + timestamp_suffix
pipeline_endpoint_config_name = "pipeline-test-endpoint-config-" + timestamp_suffix
pipeline_endpoint_name = "pipeline-test-endpoint-" + timestamp_suffix

sklearn_image = sklearn_transformer.image_uri
# Location of the packaged source code recorded on the training job. [1:-1]
# drops the first and last character (presumably the quotes around the stored
# value — confirm).
container_1_source = sklearn_transformer.latest_training_job.describe()["HyperParameters"][
    "sagemaker_submit_directory"
][1:-1]

# First container: the trained scikit-learn preprocessor.
inference_containers = [
    {
        "Image": sklearn_image,
        "ModelDataUrl": sklearn_transformer.model_data,
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": container_1_source,
            "SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv",
            # Must name the script the estimator was trained with. The
            # previous hard-coded "sklearn_feature_selection.py" is not this
            # notebook's entry point.
            "SAGEMAKER_PROGRAM": os.path.basename(script_path),
        },
    }
]

# Remaining containers: the best Autopilot candidate's inference containers.
inference_containers.extend(best_candidate["InferenceContainers"])

try:
    response = sagemaker.create_model(
        ModelName=pipeline_name, Containers=inference_containers, ExecutionRoleArn=role
    )
except ClientError as error:
    # Only an "already exists" conflict is ignored; any other failure must surface.
    if "already exist" not in error.response["Error"]["Message"].lower():
        raise
    print("Model already exists, continuing...")

In [None]:
# Create the endpoint configuration. Only an "already exists" conflict is
# safe to ignore; any other ClientError (permissions, quota, validation, ...)
# is re-raised instead of being reported as "already exists".
try:
    response = sagemaker.create_endpoint_config(
        EndpointConfigName=pipeline_endpoint_config_name,
        ProductionVariants=[
            {
                "VariantName": "DefaultVariant",
                "ModelName": pipeline_name,
                "InitialInstanceCount": 1,
                "InstanceType": "ml.m4.xlarge",
            },
        ],
    )
    print("{}\n".format(response))

except ClientError as error:
    if "already exist" not in error.response["Error"]["Message"].lower():
        raise
    print("Endpoint config already exists, continuing...")


# Create the endpoint, with the same error-handling policy.
try:
    response = sagemaker.create_endpoint(
        EndpointName=pipeline_endpoint_name,
        EndpointConfigName=pipeline_endpoint_config_name,
    )
    print("{}\n".format(response))

except ClientError as error:
    if "already exist" not in error.response["Error"]["Message"].lower():
        raise
    print("Endpoint already exists, continuing...")


# Monitor the status until the endpoint reaches a terminal state.
endpoint_status = sagemaker.describe_endpoint(EndpointName=pipeline_endpoint_name)["EndpointStatus"]
while endpoint_status not in ("OutOfService", "InService", "Failed"):
    endpoint_status = sagemaker.describe_endpoint(EndpointName=pipeline_endpoint_name)[
        "EndpointStatus"
    ]
    print(endpoint_status)
    time.sleep(30)

# Make a Request to Inference Endpoint

In [None]:
# `sample_df` is not defined anywhere in this notebook, so load it from the
# local sample.csv that was uploaded as the training input.
# NOTE(review): assumes sample.csv has a header row and that the target is the
# last column — confirm.
sample_df = pd.read_csv("sample.csv")

# First five rows without the last column, used as the request payload.
test_data = sample_df.iloc[0:5, :-1]
print(test_data)

In [None]:
from sagemaker.deserializers import CSVDeserializer
from sagemaker.predictor import Predictor
from sagemaker.serializers import IdentitySerializer

# Client for the deployed endpoint: the request body is sent unchanged as
# text/csv and the response is parsed as CSV.
csv_passthrough = IdentitySerializer(content_type="text/csv")
predictor = Predictor(
    endpoint_name=pipeline_endpoint_name,
    sagemaker_session=session,
    serializer=csv_passthrough,
    deserializer=CSVDeserializer(),
)

# Send the sample rows as CSV text, including the header row, without the index.
predictor.predict(test_data.to_csv(sep=",", header=True, index=False))