Adapted from https://github.com/awslabs/amazon-sagemaker-examples/blob/master/advanced_functionality/scikit_bring_your_own/scikit_bring_your_own.ipynb

This notebook shows how to use an h2o3-automl Docker container and deploy it onto Amazon SageMaker.

In [None]:
# Basic set up
import boto3
import re

import os
import numpy as np
import pandas as pd

from sagemaker import get_execution_role

# Execution role for this notebook; it is passed to the Estimator below.
# NOTE(review): presumably the IAM role ARN attached to the notebook
# instance - confirm against the SageMaker SDK docs.
role = get_execution_role()

In [None]:
import sagemaker as sage
from time import gmtime, strftime

# One shared SageMaker session: later cells use it to look up the account,
# region and default bucket, and to delete the endpoint when finished.
sess = sage.Session()

In [None]:
# The account ID and region of the current session are needed to build the
# ECR URI of the h2o-automl image.
sts_client = sess.boto_session.client('sts')
account = sts_client.get_caller_identity()['Account']
region = sess.boto_session.region_name

# NOTE: keep the explicit Docker image tag (e.g. :latest) in the URI; deploying
# a model appears to run into problems when the tag is left off.
image = '{}.dkr.ecr.{}.amazonaws.com/h2o-automl:latest'.format(account, region)

# Output is written under the session's default bucket.
output_location = 's3://{}/output'.format(sess.default_bucket())

# Positional arguments after the role: instance count, then instance type.
automl = sage.estimator.Estimator(
    image,
    role,
    1,
    'ml.c4.2xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)

# The current Docker image needs both training and testing data, so each gets
# its own channel. The data is assumed to have been uploaded already to two
# separate S3 prefixes; this dictionary only records where the training and
# testing data live, respectively.
data_location = {
    'training': 's3://sagemaker-test-bucket-2018/h2o-automl-test-train/training',
    'testing': 's3://sagemaker-test-bucket-2018/h2o-automl-test-train/testing',
}


In [None]:
# Run AutoML on the channels described by data_location. This can take a
# little while. Let this cell finish completely before deploying a predictor:
# partial output is shown while the run is still in progress, so seeing output
# does not mean the run is done.
automl.fit(data_location)

In [None]:
# Deploy an actual predictor so that we can make predictions on test data
# from this notebook. Request payloads are serialized with csv_serializer.
from sagemaker.predictor import csv_serializer

predictor = automl.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=csv_serializer,
)

In [None]:
# Simple way to try out a prediction: send a header row plus one record to the
# endpoint and print the reply. The output is currently in the form of a
# prediction and the class probabilities for each of the classes.

# First row: column names. Second row: the record to score. The leading spaces
# inside the strings are reproduced exactly as given.
feature_names = ['age', ' workclass', ' fnlwgt', ' education', ' education-num', ' marital-status', ' occupation', ' relationship', ' race', ' sex', ' capital-gain', ' capital-loss']
feature_values = ['68', ' Self-emp-not-inc', ' 273088', ' Some-college', ' 10', ' Married-civ-spouse', ' Craft-repair', ' Husband', ' White', ' Male', ' 0', ' 0']
sample_data = np.asarray([feature_names, feature_values])

# The reply is decoded as UTF-8 before printing.
raw_response = predictor.predict(sample_data)
print(raw_response.decode('utf-8'))


In [None]:
# Clean up: delete the predictor's endpoint once we are done with it, to avoid
# being charged for it unnecessarily.

sess.delete_endpoint(predictor.endpoint)