In [2]:
import pandas as pd
import numpy as np
import datetime
import random

import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString
)

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.functions import Join
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.sklearn.estimator import SKLearn
# import sagemaker_containers

from sagemaker.workflow.pipeline import Pipeline

import os
from sklearn.model_selection import train_test_split
from time import gmtime, strftime, sleep
import boto3
import joblib

In [3]:
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = 'custom_preprocessing'

timestamp_suffix = strftime("%Y-%m-%d-%H%M%S", gmtime())
folder_name = prefix + '-' + timestamp_suffix
prefix_path = f's3://{bucket}/{folder_name}'

In [4]:
tags = [
    {"Key": "PLATFORM", "Value": "FO-ML"},
    {"Key": "BUSINESS_REGION", "Value": "GLOBAL"},
    {"Key": "BUSINESS_UNIT", "Value": "MOBILITY"},
    {"Key": "CLIENT", "Value": "MULTI_TENANT"}
   ]

## Set up and kick off autopilot job

In [19]:
input_data_config = [
    {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": "s3://{}/{}/{}/training_data_new".format(bucket, folder_name, "components"),
            }
        },
        "TargetAttributeName": "target",
    }
]

output_data_config = {"S3OutputPath": "s3://{}/{}/{}/autopilot_job_output".format(bucket, folder_name, "components")}

AutoML_Job_Config = {
    "CompletionCriteria": {
        # we set MaxCandidate to 50 to have shorter run time. Please adjust this for your use case.
        "MaxCandidates": 50,
        "MaxAutoMLJobRuntimeInSeconds": 1800,
    }
}

In [20]:
sm = boto3.Session().client(service_name="sagemaker", region_name=region)
timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())

auto_ml_job_name = "automl-test-" + timestamp_suffix
print("AutoMLJobName: " + auto_ml_job_name)

sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=AutoML_Job_Config,
    RoleArn=role,
    Tags=tags
)

AutoMLJobName: automl-test-2023-03-01-181930


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:707031497630:automl-job/automl-test-2023-03-01-181930',
 'ResponseMetadata': {'RequestId': '23e59326-ec40-4c0c-bacd-b742e816c1ff',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '23e59326-ec40-4c0c-bacd-b742e816c1ff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '100',
   'date': 'Wed, 01 Mar 2023 18:32:55 GMT'},
  'RetryAttempts': 0}}

In [21]:
print("JobStatus - Secondary Status")
print("------------------------------")


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print(describe_response["AutoMLJobStatus"] + " - " + describe_response["AutoMLJobSecondaryStatus"])
job_run_status = describe_response["AutoMLJobStatus"]

while job_run_status not in ("Failed", "Completed", "Stopped"):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response["AutoMLJobStatus"]

    print(
        describe_response["AutoMLJobStatus"] + " - " + describe_response["AutoMLJobSecondaryStatus"]
    )
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProg

## Results

In [22]:
from IPython.display import JSON

best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)["BestCandidate"]
best_candidate_name = best_candidate["CandidateName"]

print("\n")
print("CandidateName: " + best_candidate_name)
print("CandidateName Steps: " + best_candidate["FinalAutoMLJobObjectiveMetric"]["MetricName"])
print(
    "FinalAutoMLJobObjectiveMetricName: "
    + best_candidate["FinalAutoMLJobObjectiveMetric"]["MetricName"]
)
print(
    "FinalAutoMLJobObjectiveMetricValue: "
    + str(best_candidate["FinalAutoMLJobObjectiveMetric"]["Value"])
)



CandidateName: automl-test-2023-03-01-181930fhh-009-b8c2bea3
CandidateName Steps: validation:binary_f_beta
FinalAutoMLJobObjectiveMetricName: validation:binary_f_beta
FinalAutoMLJobObjectiveMetricValue: 1.0


In [36]:
print(best_candidate["CandidateName"])

automl-test-2023-03-01-181930fhh-009-b8c2bea3


In [39]:
sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

{'AutoMLJobName': 'automl-test-2023-03-01-181930',
 'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:707031497630:automl-job/automl-test-2023-03-01-181930',
 'InputDataConfig': [{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-03-01-181930/components/training_data_new'}},
   'TargetAttributeName': 'target',
   'ContentType': 'text/csv;header=present',
   'ChannelType': 'training'}],
 'OutputDataConfig': {'S3OutputPath': 's3://sagemaker-us-east-1-707031497630/custom_preprocessing-2023-03-01-181930/components/autopilot_job_output'},
 'RoleArn': 'arn:aws:iam::707031497630:role/service-role/AmazonSageMaker-ExecutionRole-20220218T020780',
 'AutoMLJobConfig': {'CompletionCriteria': {'MaxCandidates': 50,
   'MaxAutoMLJobRuntimeInSeconds': 1800}},
 'CreationTime': datetime.datetime(2023, 3, 1, 18, 32, 54, 818000, tzinfo=tzlocal()),
 'EndTime': datetime.datetime(2023, 3, 1, 19, 2, 58, 619000, tzinfo=tzlocal()),

In [27]:
# sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
# sm.list_auto_ml_jobs()
sm_dict = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name)

In [33]:
for item in sm_dict["Candidates"]:
    if item['ObjectiveStatus'] != 'Failed':
        print(item["CandidateName"], item["FinalAutoMLJobObjectiveMetric"])
        print(item["InferenceContainers"][1]["Image"], "\n")

automl-test-2023-03-01-181930fhh-030-ed7fda56 {'MetricName': 'validation:f1_binary', 'Value': 1.0, 'StandardMetricName': 'F1'}
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3 

automl-test-2023-03-01-181930fhh-026-80743015 {'MetricName': 'validation:binary_f_beta', 'Value': 0.5853658318519592, 'StandardMetricName': 'F1'}
382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:inference-cpu 



In [35]:
for item in sm_dict["Candidates"]:
    print(item["CandidateName"])

automl-test-2023-03-01-181930fhh-036-a5e40394
automl-test-2023-03-01-181930fhh-034-5d7e7eca
automl-test-2023-03-01-181930fhh-035-695ec68d
automl-test-2023-03-01-181930fhh-032-efb3e95c
automl-test-2023-03-01-181930fhh-033-d66cfadc
automl-test-2023-03-01-181930fhh-031-4e48d4fd
automl-test-2023-03-01-181930fhh-029-8594868d
automl-test-2023-03-01-181930fhh-030-ed7fda56
automl-test-2023-03-01-181930fhh-028-b4b230b4
automl-test-2023-03-01-181930fhh-026-80743015


In [32]:
for item in sm_dict["Candidates"]:
    print(item, "\n")

{'CandidateName': 'automl-test-2023-03-01-181930fhh-036-a5e40394', 'ObjectiveStatus': 'Failed', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:707031497630:processing-job/automl-test-2023-03-01-181930-db-1-1a19b54775c04aac80fbd6428ef0', 'CandidateStepName': 'automl-test-2023-03-01-181930-db-1-1a19b54775c04aac80fbd6428ef0'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:707031497630:training-job/automl-test-2023-03-01-181930-dpp0-1-c296366c9d7f4f1c9fee7581e5', 'CandidateStepName': 'automl-test-2023-03-01-181930-dpp0-1-c296366c9d7f4f1c9fee7581e5'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:707031497630:transform-job/automl-test-2023-03-01-181930-dpp0-csv-1-408e46c4c8a74757b6755a', 'CandidateStepName': 'automl-test-2023-03-01-181930-dpp0-csv-1-408e46c4c8a74757b6755a'}, {'CandidateStepType': 'AWS::Sa

# Test Autopilot model

Does it matter that the passed features don't have the target value for inference?

- Original dataset: 5 rows, 10 columns. No target (Used this to test transformer model)
- Processed dataset: 5 rows, 28 columns. No target (Use this to test Autopilot model)
- new_df: 500 rows, 29 columns. Yes target (Used this to train the Autopilot model)

Put label name into .py script. Use that to determine if label is in the dataset. (Example in [Inference Pipeline with Scikit-learn and Linear Learner](https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/scikit_learn_inference_pipeline/Inference%20Pipeline%20with%20Scikit-learn%20and%20Linear%20Learner.ipynb))