In [33]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [34]:
bucket_name = 'pmitest-sagemaker'
training_file_key = 'biketrain3/bike_train_numeric_columns.recordio'

s3_model_output_location = r's3://{0}/biketrain3/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_file_key)

In [35]:
print(s3_model_output_location)
print(s3_training_file_location)

s3://pmitest-sagemaker/biketrain3/model
s3://pmitest-sagemaker/biketrain3/bike_train_numeric_columns.recordio


In [36]:
# Write and Reading from S3 is just as easy
# files are referred as objects in S3.  
# file name is referred as key name in S3
# Files stored in S3 are automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    with open(filename,'rb') as f: # Read in binary mode
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [37]:
write_to_s3('bike_train_numeric_columns.recordio',bucket_name,training_file_key)

## Training Algorithm Docker Image
### AWS Maintains a separate image for every region and algorithm

In [38]:
# Registry Path for algorithms provided by SageMaker
#  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/pca:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/pca:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/pca:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/pca:latest',
              'eu-central-1': '813361260812.dkr.ecr.eu-central-1.amazonaws.com/xgboost:latest'}

In [39]:
role = get_execution_role()

In [40]:
# This role contains the permissions needed to train, deploy models
# SageMaker Service is trusted to assume this role
print(role)

arn:aws:iam::239979694630:role/service-role/AmazonSageMaker-ExecutionRole-20190206T180766


## Build Model

In [41]:
sess = sagemaker.Session()

In [42]:
containers[boto3.Session().region_name]

'813361260812.dkr.ecr.eu-central-1.amazonaws.com/xgboost:latest'

In [43]:
# Access appropriate algorithm container image
#  Specify how many instances to use for distributed training and what type of machine to use
#  Finally, specify where the trained model artifacts needs to be stored
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
#    Optionally, give a name to the training job using base_job_name

estimator = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.m4.xlarge',
                                       output_path=s3_model_output_location,
                                       sagemaker_session=sess,
                                       base_job_name ='pca-biketrain-v1')

In [44]:
# Specify hyper parameters that appropriate for the training algorithm
estimator.set_hyperparameters(feature_dim=4,
                        num_components=3,
                        subtract_mean=False,
                        algorithm_mode='regular',
                        mini_batch_size=200,
                        num_round=150)

In [45]:
estimator.hyperparameters()

{'feature_dim': 4,
 'num_components': 3,
 'subtract_mean': False,
 'algorithm_mode': 'regular',
 'mini_batch_size': 200,
 'num_round': 150}

### Train the model

In [46]:
# XGBoost supports "train", "validation" channels
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit({'train':s3_training_file_location})

INFO:sagemaker:Creating training-job with name: pca-biketrain-v1-2019-02-07-14-04-26-724


2019-02-07 14:04:26 Starting - Starting the training job...
2019-02-07 14:04:28 Starting - Launching requested ML instances......
2019-02-07 14:05:34 Starting - Preparing the instances for training......
2019-02-07 14:06:53 Downloading - Downloading input data...
2019-02-07 14:07:19 Training - Training image download completed. Training in progress.
[31mArguments: train[0m
[31m[2019-02-07:14:07:20:INFO] Running standalone xgboost training.[0m
[31m[2019-02-07:14:07:20:INFO] Path /opt/ml/input/data/validation does not exist![0m
[31m[2019-02-07:14:07:20:INFO] File size need to be processed in the node: 0.58mb. Available memory size in the node: 8420.86mb[0m
[31m[2019-02-07:14:07:20:ERROR] Customer Error: Blankspace and colon not found in firstline '[0m
[31m...' of file 'bike_train_numeric_columns.recordio'. ContentType by defaullt is in libsvm. Please ensure the file is in libsvm format.[0m
[31mTraceback (most recent call last):
  File "/opt/amazon/lib/python2.7/site-packages

ValueError: Error for Training job pca-biketrain-v1-2019-02-07-14-04-26-724: Failed Reason: ClientError: Blankspace and colon not found in firstline '
...' of file 'bike_train_numeric_columns.recordio'. ContentType by defaullt is in libsvm. Please ensure the file is in libsvm format.

## Deploy Model

In [None]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'pca-biketrain-v1')

## Run Predictions

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer

In [None]:
predictor.predict([[-1.333660693,-1.092736969,0.993213054,1.567753667]])

## Summary

1. Ensure Training, Test and Validation data are in S3 Bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Run Predictions