In [2]:
import pandas as pd
import numpy as np
import datetime
import random

import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString
)

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.functions import Join
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.sklearn.estimator import SKLearn
# import sagemaker_containers

from sagemaker.workflow.pipeline import Pipeline

import os
from sklearn.model_selection import train_test_split
from time import gmtime, strftime, sleep
import boto3
import joblib

In [3]:
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = 'custom_preprocessing'

timestamp_suffix = strftime("%Y-%m-%d-%H%M%S", gmtime())
folder_name = prefix + '-' + timestamp_suffix
prefix_path = f's3://{bucket}/{folder_name}'

In [4]:
tags = [
    {"Key": "PLATFORM", "Value": "FO-ML"},
    {"Key": "BUSINESS_REGION", "Value": "GLOBAL"},
    {"Key": "BUSINESS_UNIT", "Value": "MOBILITY"},
    {"Key": "CLIENT", "Value": "MULTI_TENANT"}
   ]

## Create sample data

In [5]:
tf_vals = ['true', 'false', np.nan, '1', '0']
onehot_vals = [np.nan, 'purple', 'orange', 'purple', 'blue']

date_vals = []
for _ in range(4):
    date = datetime.date(2022, random.randint(1, 12), random.randint(1, 31))
    date_vals.append(date)
date_vals.append(np.nan)

float_vals = [3, 8.0, 2, np.nan, 4.0]
list_max_vals = ['3,0,9,4,2', np.nan, '0,2,3,9,8,4', '4', '5,4,3']
list_nunique_vals = ['apple,orange,grape', '0,9,8,3,4,3,3,4,9', np.nan, '4,4,4,4,4', 'pineapple']
descstat_vals = ['9,2,8,3,4', '1', '7,8,9,2,3,4', np.nan, '34']
multi_label_vals = ['apple,orange,grape', 'pineapple,grape,strawberry', np.nan, 'blueberry', 'grapefruit,apple']
drop_vals = [np.nan, 3, 6, 1, np.nan]
x_rand = list(range(5))

sample_df = pd.DataFrame({
    'true_false':tf_vals,
    'one_hot':onehot_vals,
    'dates':date_vals,
    'floats':float_vals,
    'max_of_list':list_max_vals,
    'nunique_of_list':list_nunique_vals,
    'desc_stats':descstat_vals,
    'multi_label':multi_label_vals,
    'random_col':drop_vals,
    'other':x_rand})
sample_df

Unnamed: 0,true_false,one_hot,dates,floats,max_of_list,nunique_of_list,desc_stats,multi_label,random_col,other
0,true,,2022-02-04,3.0,30942.0,"apple,orange,grape",92834.0,"apple,orange,grape",,0
1,false,purple,2022-12-19,8.0,,098343349,1.0,"pineapple,grape,strawberry",3.0,1
2,,orange,2022-05-06,2.0,23984.0,,789234.0,,6.0,2
3,1,purple,2022-07-28,,4.0,44444,,blueberry,1.0,3
4,0,blue,,4.0,543.0,pineapple,34.0,"grapefruit,apple",,4


In [6]:
sample_df.to_csv('sample.csv', index=False)
train_input = session.upload_data('sample.csv', bucket=bucket, key_prefix=folder_name)

## Train Preprocessor

In [7]:
script_path = "processor_script.py"
model_output_path = os.path.join('s3://', bucket, folder_name, "components")

sklearn_transformer = SKLearn(
    entry_point=script_path,
    role=role,
    output_path=model_output_path,
    instance_type="ml.m5.large",
    sagemaker_session=None,
    framework_version="1.0-1",
    py_version="py3",
    tags=tags,
    dependencies=['transformers.py']
)

In [8]:
sklearn_transformer.fit({"train": train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-03-01-18-19-37-681


2023-03-01 18:19:38 Starting - Starting the training job.........
2023-03-01 18:21:04 Starting - Preparing the instances for training......
2023-03-01 18:21:52 Downloading - Downloading input data...
2023-03-01 18:22:23 Training - Downloading the training image...
2023-03-01 18:23:09 Uploading - Uploading generated training model.[34m2023-03-01 18:23:04,114 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-03-01 18:23:04,118 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-03-01 18:23:04,127 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-03-01 18:23:04,422 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-03-01 18:23:04,435 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-03-01 18:23:04,447 sagemaker-training-toolkit INFO     No GPUs detected (normal i

In [9]:
transformer_prefix = os.path.join(folder_name,
                                  "components",
                                  sklearn_transformer.latest_training_job.job_name,
                                  "output",
                                  "model.tar.gz")

session.download_data(path='./', bucket=bucket, key_prefix=transformer_prefix)

In [5]:
# transformer_prefix = 'custom_preprocessing-2023-02-27-153524/components/sagemaker-scikit-learn-2023-02-27-15-35-25-466/output/model.tar.gz'
# session.download_data(path='./', bucket=bucket, key_prefix=transformer_prefix)

In [10]:
!tar xvzf model.tar.gz

preprocessor.joblib
feature_names.joblib


In [11]:
feature_list = list(joblib.load("feature_names.joblib"))
print(feature_list)

['true_false', 'one_hot_blue', 'one_hot_ml_empty', 'one_hot_orange', 'one_hot_purple', 'dates-month', 'dates-day_of_week', 'dates-hour', 'dates-day_of_month', 'dates-is_month_start', 'dates-is_month_end', 'floats', 'max_of_list', 'nunique_of_list', 'desc_stats-min', 'desc_stats-max', 'desc_stats-mean', 'desc_stats-std', 'desc_stats-nunique', 'multi_label_apple', 'multi_label_blueberry', 'multi_label_grape', 'multi_label_grapefruit', 'multi_label_ml_empty', 'multi_label_orange', 'multi_label_pineapple', 'multi_label_strawberry', 'other']


In [12]:
len(feature_list)

28

In [13]:
joblib.load("preprocessor.joblib")



ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('drop_cols', 'drop', ['random_col']),
                                ('truefalse', TrueFalseTransformer(),
                                 ['true_false']),
                                ('onehot', OneHotTransformer(), ['one_hot']),
                                ('dates', DateTransformer(), ['dates']),
                                ('floats', FloatTransformer(), ['floats']),
                                ('listmax', ListMaxTransformer(),
                                 ['max_of_list']),
                                ('nunique', ListNuniqueTransformer(),
                                 ['nunique_of_list']),
                                ('descstats', DescStatTransformer(),
                                 ['desc_stats']),
                                ('multilabel', MultilabelTransformer(),
                           

## Batch transform

In [19]:
# train_input = 's3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-02-27-153524/sample.csv'

In [13]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer_output = os.path.join("s3://", bucket, folder_name, "Feature_selection_output/")
transformer = sklearn_transformer.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=transformer_output,
    assemble_with="Line",
    accept="text/csv",
    role=role,
    tags=tags
)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2023-03-01-18-25-59-289


In [14]:
# Preprocess training input
transformer.transform(train_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_train = transformer.output_path

INFO:sagemaker:Creating transform job with name: sagemaker-scikit-learn-2023-03-01-18-26-01-787


.............................[34m2023-03-01 18:30:42,339 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-03-01 18:30:42,342 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-03-01 18:30:42,343 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
 

In [65]:
preprocessed_train

's3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-02-28-162117/Feature_selection_output/'

## Save batch transformed data

In [15]:
transformer_output_path = os.path.join(transformer.output_path)

key_prefix = (
    transformer_output_path[transformer_output_path.find(bucket) + len(bucket) + 1 :]
    + "sample.csv.out"
)
print(transformer_output_path)

session.download_data(path="./", bucket=bucket, key_prefix=key_prefix)
df_new = pd.read_csv("sample.csv.out", header=None)

# first column is the target variable
df_new.columns = feature_list

s3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-03-01-181930/Feature_selection_output/


In [16]:
targets = [1,0,0,1,1]
df_new.insert(0, "target", targets)

In [83]:
df_new.head()

Unnamed: 0,target,true_false,one_hot_blue,one_hot_ml_empty,one_hot_orange,one_hot_purple,dates-month,dates-day_of_week,dates-hour,dates-day_of_month,...,desc_stats-nunique,multi_label_apple,multi_label_blueberry,multi_label_grape,multi_label_grapefruit,multi_label_ml_empty,multi_label_orange,multi_label_pineapple,multi_label_strawberry,other
0,1,1.0,0.0,0.0,0.0,0.0,12.0,1.0,0.0,27.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,1.0,12.0,0.0,0.0,19.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0,-1.0,0.0,0.0,1.0,0.0,2.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
3,1,1.0,0.0,0.0,0.0,1.0,9.0,3.0,0.0,15.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,1,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0


In [17]:
df_new_500 = df_new.append([df_new]*99, ignore_index=True)

In [18]:
df_new_500.to_csv("train_new.csv", index=False)

train_new_input = session.upload_data(
    path="{}".format("train_new.csv"),
    bucket=bucket,
    key_prefix="{}/{}/{}".format(folder_name, "components", "training_data_new"),
)

df_new_500.head()

Unnamed: 0,target,true_false,one_hot_blue,one_hot_ml_empty,one_hot_orange,one_hot_purple,dates-month,dates-day_of_week,dates-hour,dates-day_of_month,...,desc_stats-nunique,multi_label_apple,multi_label_blueberry,multi_label_grape,multi_label_grapefruit,multi_label_ml_empty,multi_label_orange,multi_label_pineapple,multi_label_strawberry,other
0,1,1.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,4.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,1.0,12.0,0.0,0.0,19.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0,-1.0,0.0,0.0,1.0,0.0,5.0,4.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
3,1,1.0,0.0,0.0,0.0,1.0,7.0,3.0,0.0,28.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,1,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0


## Set up and kick off autopilot job

In [19]:
input_data_config = [
    {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": "s3://{}/{}/{}/training_data_new".format(bucket, folder_name, "components"),
            }
        },
        "TargetAttributeName": "target",
    }
]

output_data_config = {"S3OutputPath": "s3://{}/{}/{}/autopilot_job_output".format(bucket, folder_name, "components")}

AutoML_Job_Config = {
    "CompletionCriteria": {
        # we set MaxCandidate to 50 to have shorter run time. Please adjust this for your use case.
        "MaxCandidates": 50,
        "MaxAutoMLJobRuntimeInSeconds": 1800,
    }
}

In [20]:
sm = boto3.Session().client(service_name="sagemaker", region_name=region)
# timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())

auto_ml_job_name = "automl-test-" + timestamp_suffix
print("AutoMLJobName: " + auto_ml_job_name)

sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=AutoML_Job_Config,
    RoleArn=role,
    Tags=tags
)

AutoMLJobName: automl-test-2023-03-01-181930


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:707031497630:automl-job/automl-test-2023-03-01-181930',
 'ResponseMetadata': {'RequestId': '23e59326-ec40-4c0c-bacd-b742e816c1ff',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '23e59326-ec40-4c0c-bacd-b742e816c1ff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '100',
   'date': 'Wed, 01 Mar 2023 18:32:55 GMT'},
  'RetryAttempts': 0}}

In [21]:
print("JobStatus - Secondary Status")
print("------------------------------")


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print(describe_response["AutoMLJobStatus"] + " - " + describe_response["AutoMLJobSecondaryStatus"])
job_run_status = describe_response["AutoMLJobStatus"]

while job_run_status not in ("Failed", "Completed", "Stopped"):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response["AutoMLJobStatus"]

    print(
        describe_response["AutoMLJobStatus"] + " - " + describe_response["AutoMLJobSecondaryStatus"]
    )
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProg

## Results

In [22]:
from IPython.display import JSON

best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)["BestCandidate"]
best_candidate_name = best_candidate["CandidateName"]

print("\n")
print("CandidateName: " + best_candidate_name)
print("CandidateName Steps: " + best_candidate["FinalAutoMLJobObjectiveMetric"]["MetricName"])
print(
    "FinalAutoMLJobObjectiveMetricName: "
    + best_candidate["FinalAutoMLJobObjectiveMetric"]["MetricName"]
)
print(
    "FinalAutoMLJobObjectiveMetricValue: "
    + str(best_candidate["FinalAutoMLJobObjectiveMetric"]["Value"])
)



CandidateName: automl-test-2023-03-01-181930fhh-009-b8c2bea3
CandidateName Steps: validation:binary_f_beta
FinalAutoMLJobObjectiveMetricName: validation:binary_f_beta
FinalAutoMLJobObjectiveMetricValue: 1.0


In [36]:
print(best_candidate["CandidateName"])

automl-test-2023-03-01-181930fhh-009-b8c2bea3


In [39]:
sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

{'AutoMLJobName': 'automl-test-2023-03-01-181930',
 'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:707031497630:automl-job/automl-test-2023-03-01-181930',
 'InputDataConfig': [{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-03-01-181930/components/training_data_new'}},
   'TargetAttributeName': 'target',
   'ContentType': 'text/csv;header=present',
   'ChannelType': 'training'}],
 'OutputDataConfig': {'S3OutputPath': 's3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-03-01-181930/components/autopilot_job_output'},
 'RoleArn': 'arn:aws:iam::707031497630:role/service-role/AmazonSageMaker-ExecutionRole-20220218T020780',
 'AutoMLJobConfig': {'CompletionCriteria': {'MaxCandidates': 50,
   'MaxAutoMLJobRuntimeInSeconds': 1800}},
 'CreationTime': datetime.datetime(2023, 3, 1, 18, 32, 54, 818000, tzinfo=tzlocal()),
 'EndTime': datetime.datetime(2023, 3, 1, 19, 2, 58, 619000, tzinfo=tzlocal()),

In [27]:
# sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
# sm.list_auto_ml_jobs()
sm_dict = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name)

In [33]:
for item in sm_dict["Candidates"]:
    if item['ObjectiveStatus'] != 'Failed':
        print(item["CandidateName"], item["FinalAutoMLJobObjectiveMetric"])
        print(item["InferenceContainers"][1]["Image"], "\n")

automl-test-2023-03-01-181930fhh-030-ed7fda56 {'MetricName': 'validation:f1_binary', 'Value': 1.0, 'StandardMetricName': 'F1'}
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3 

automl-test-2023-03-01-181930fhh-026-80743015 {'MetricName': 'validation:binary_f_beta', 'Value': 0.5853658318519592, 'StandardMetricName': 'F1'}
382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:inference-cpu 



In [35]:
for item in sm_dict["Candidates"]:
    print(item["CandidateName"])

automl-test-2023-03-01-181930fhh-036-a5e40394
automl-test-2023-03-01-181930fhh-034-5d7e7eca
automl-test-2023-03-01-181930fhh-035-695ec68d
automl-test-2023-03-01-181930fhh-032-efb3e95c
automl-test-2023-03-01-181930fhh-033-d66cfadc
automl-test-2023-03-01-181930fhh-031-4e48d4fd
automl-test-2023-03-01-181930fhh-029-8594868d
automl-test-2023-03-01-181930fhh-030-ed7fda56
automl-test-2023-03-01-181930fhh-028-b4b230b4
automl-test-2023-03-01-181930fhh-026-80743015


In [32]:
for item in sm_dict["Candidates"]:
    print(item, "\n")

{'CandidateName': 'automl-test-2023-03-01-181930fhh-036-a5e40394', 'ObjectiveStatus': 'Failed', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:707031497630:processing-job/automl-test-2023-03-01-181930-db-1-1a19b54775c04aac80fbd6428ef0', 'CandidateStepName': 'automl-test-2023-03-01-181930-db-1-1a19b54775c04aac80fbd6428ef0'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:707031497630:training-job/automl-test-2023-03-01-181930-dpp0-1-c296366c9d7f4f1c9fee7581e5', 'CandidateStepName': 'automl-test-2023-03-01-181930-dpp0-1-c296366c9d7f4f1c9fee7581e5'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:707031497630:transform-job/automl-test-2023-03-01-181930-dpp0-csv-1-408e46c4c8a74757b6755a', 'CandidateStepName': 'automl-test-2023-03-01-181930-dpp0-csv-1-408e46c4c8a74757b6755a'}, {'CandidateStepType': 'AWS::Sa

# Test Autopilot model

Does it matter that the passed features don't have the target value for inference?

- Original dataset: 5 rows, 10 columns. No target (Used this to test transformer model)
- Processed dataset: 5 rows, 28 columns. No target (Use this to test Autopilot model)
- new_df: 500 rows, 29 columns. Yes target (Used this to train the Autopilot model)

Put label name into .py script. Use that to determine if label is in the dataset. (Example in [Inference Pipeline with Scikit-learn and Linear Learner](https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/scikit_learn_inference_pipeline/Inference%20Pipeline%20with%20Scikit-learn%20and%20Linear%20Learner.ipynb))

## Set up the inference pipeline

In [24]:
sklearn_transformer.latest_training_job.describe()["HyperParameters"][
    "sagemaker_submit_directory"
][1:-1]

's3://sagemaker-us-east-1-707031497630/sagemaker-scikit-learn-2023-03-01-18-19-37-681/source/sourcedir.tar.gz'

In [25]:
best_candidate["InferenceContainers"]

[{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-sklearn-automl:2.5-1-cpu-py3',
  'ModelDataUrl': 's3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-03-01-181930/components/autopilot_job_output/automl-test-2023-03-01-181930/data-processor-models/automl-test-2023-03-01-181930-dpp5-1-aecb2b7e2fb74785be17c41913/output/model.tar.gz',
  'Environment': {'AUTOML_TRANSFORM_MODE': 'feature-transform',
   'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'application/x-recordio-protobuf',
   'SAGEMAKER_PROGRAM': 'sagemaker_serve',
   'SAGEMAKER_SUBMIT_DIRECTORY': '/opt/ml/model/code'}},
 {'Image': '382416733822.dkr.ecr.us-east-1.amazonaws.com/mxnet-algorithms:inference-cpu',
  'ModelDataUrl': 's3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-03-01-181930/components/autopilot_job_output/automl-test-2023-03-01-181930/tuning/automl-tes-dpp5-mlp/automl-test-2023-03-01-181930fhh-009-b8c2bea3/output/model.tar.gz',
  'Environment': {'MAX_CONTENT_LENGTH': '20971520',

In [26]:
from botocore.exceptions import ClientError

sagemaker = boto3.client("sagemaker")
import time
from datetime import datetime

# time_stamp = datetime.now().strftime("%m-%d-%Y-%I-%M-%S-%p")
# timestamp_suffix

pipeline_name = "pipeline-test-" + timestamp_suffix
pipeline_endpoint_config_name = "pipeline-test-endpoint-config-" + timestamp_suffix
pipeline_endpoint_name = "pipeline-test-endpoint-" + timestamp_suffix

sklearn_image = sklearn_transformer.image_uri
container_1_source = sklearn_transformer.latest_training_job.describe()["HyperParameters"][
    "sagemaker_submit_directory"
][1:-1]
inference_containers = [
    {
        "Image": sklearn_image,
        "ModelDataUrl": sklearn_transformer.model_data,
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": container_1_source,
            "SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv",
            "SAGEMAKER_PROGRAM": "sklearn_feature_selection.py",
        },
    }
]

inference_containers.extend(best_candidate["InferenceContainers"])

response = sagemaker.create_model(
    ModelName=pipeline_name, Containers=inference_containers, ExecutionRoleArn=role
)

In [None]:
try:
    response = sagemaker.create_endpoint_config(
        EndpointConfigName=pipeline_endpoint_config_name,
        ProductionVariants=[
            {
                "VariantName": "DefaultVariant",
                "ModelName": pipeline_name,
                "InitialInstanceCount": 1,
                "InstanceType": "ml.m4.xlarge",
            },
        ],
    )
    print("{}\n".format(response))

except ClientError:
    print("Endpoint config already exists, continuing...")


try:
    response = sagemaker.create_endpoint(
        EndpointName=pipeline_endpoint_name,
        EndpointConfigName=pipeline_endpoint_config_name,
    )
    print("{}\n".format(response))

except ClientError:
    print("Endpoint already exists, continuing...")


# Monitor the status until completed
endpoint_status = sagemaker.describe_endpoint(EndpointName=pipeline_endpoint_name)["EndpointStatus"]
while endpoint_status not in ("OutOfService", "InService", "Failed"):
    endpoint_status = sagemaker.describe_endpoint(EndpointName=pipeline_endpoint_name)[
        "EndpointStatus"
    ]
    print(endpoint_status)
    time.sleep(30)

{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:707031497630:endpoint-config/pipeline-test-endpoint-config-2023-03-01-181930', 'ResponseMetadata': {'RequestId': 'cbd4d409-9140-419d-b2b0-e0f3cd79e662', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'cbd4d409-9140-419d-b2b0-e0f3cd79e662', 'content-type': 'application/x-amz-json-1.1', 'content-length': '128', 'date': 'Wed, 01 Mar 2023 20:43:13 GMT'}, 'RetryAttempts': 0}}

{'EndpointArn': 'arn:aws:sagemaker:us-east-1:707031497630:endpoint/pipeline-test-endpoint-2023-03-01-181930', 'ResponseMetadata': {'RequestId': '79cc6a1f-021d-4a59-9044-175cb445093a', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '79cc6a1f-021d-4a59-9044-175cb445093a', 'content-type': 'application/x-amz-json-1.1', 'content-length': '108', 'date': 'Wed, 01 Mar 2023 20:43:13 GMT'}, 'RetryAttempts': 0}}

Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating


In [55]:
sm_model.deploy?

[0;31mSignature:[0m
[0msm_model[0m[0;34m.[0m[0mdeploy[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0minitial_instance_count[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minstance_type[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mserializer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdeserializer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mendpoint_name[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtags[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwait[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mupdate_endpoint[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_capture_config[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkms_key[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvolume_size[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmo

In [59]:
sklearn_transformer.create_model?

[0;31mSignature:[0m
[0msklearn_transformer[0m[0;34m.[0m[0mcreate_model[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel_server_workers[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrole[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvpc_config_override[0m[0;34m=[0m[0;34m'VPC_CONFIG_DEFAULT'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mentry_point[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msource_dir[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdependencies[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Create a SageMaker ``SKLearnModel`` object that can be deployed to an ``Endpoint``.

Args:
    model_server_workers (int): Optional. The number of worker processes
        used by the inference server. If None, server wil

In [56]:
scikit_learn_inference_model = sklearn_transformer.create_model()

In [58]:
model_arn = sm.create_model(
    Containers=best_candidate["InferenceContainers"], ModelName=model_name, ExecutionRoleArn=role
)

ClientError: An error occurred (ValidationException) when calling the CreateModel operation: Cannot create already existing model "arn:aws:sagemaker:us-east-1:707031497630:model/automl-test-2023-03-01-181930fhh-009-b8c2bea3-v5".

In [61]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
inference_response_keys = ["predicted_label", "probability"]

scikit_learn_inference_model = sklearn_transformer.create_model()

model_name = best_candidate_name + '-v5'
# model_arn = sm.create_model(
#     Containers=best_candidate["InferenceContainers"], ModelName=model_name, ExecutionRoleArn=role
# )
autopilot_model = sm.create_model(
    ModelName=best_candidate_name,
    candidate=best_candidate,
    inference_response_keys=inference_response_keys,
    ExecutionRoleArn=role)

model_pipe_name = "inference-pipeline-" + timestamp_suffix
endpoint_name = "inference-pipeline-ep-" + timestamp_suffix
sm_model = PipelineModel(
    name=model_pipe_name,
    role=role,
    models=[scikit_learn_inference_model, autopilot_model]
)

sm_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c4.xlarge",
    endpoint_name=endpoint_name,
    tags=tags)

ParamValidationError: Parameter validation failed:
Unknown parameter in input: "candidate", must be one of: ModelName, PrimaryContainer, Containers, InferenceExecutionConfig, ExecutionRoleArn, Tags, VpcConfig, EnableNetworkIsolation
Unknown parameter in input: "inference_response_keys", must be one of: ModelName, PrimaryContainer, Containers, InferenceExecutionConfig, ExecutionRoleArn, Tags, VpcConfig, EnableNetworkIsolation

## Make a request to our pipeline endpoint

In [None]:
test_data = sample_df.iloc[0:5, :-1]
print(test_data)

  true_false one_hot       dates  floats  max_of_list     nunique_of_list  \
0       true     NaN  2022-12-27     3.0    3,0,9,4,2  apple,orange,grape   
1      false  purple  2022-12-19     8.0          NaN   0,9,8,3,4,3,3,4,9   
2        NaN  orange  2022-02-04     2.0  0,2,3,9,8,4                 NaN   
3          1  purple  2022-09-15     NaN            4           4,4,4,4,4   
4          0    blue         NaN     4.0        5,4,3           pineapple   

    desc_stats                 multi_label  random_col  
0    9,2,8,3,4          apple,orange,grape         NaN  
1            1  pineapple,grape,strawberry         3.0  
2  7,8,9,2,3,4                         NaN         6.0  
3          NaN                   blueberry         1.0  
4           34            grapefruit,apple         NaN  


In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import IdentitySerializer
from sagemaker.deserializers import CSVDeserializer

predictor = Predictor(
    endpoint_name=pipeline_endpoint_name,
    sagemaker_session=session,
    serializer=IdentitySerializer(content_type="text/csv"),
    deserializer=CSVDeserializer(),
)

predictor.predict(test_data.to_csv(sep=",", header=True, index=False))

## Delete Endpoint

In [96]:
sm_client = session.boto_session.client("sagemaker")
sm_client.delete_endpoint(EndpointName=pipeline_endpoint_name)

{'ResponseMetadata': {'RequestId': '96af0047-2d99-43fe-98be-8e4da4205e17',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '96af0047-2d99-43fe-98be-8e4da4205e17',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Tue, 28 Feb 2023 21:46:10 GMT'},
  'RetryAttempts': 0}}