In [1]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

In [2]:
# S3 bucket and the object keys of the three housing dataset splits
bucket_name = 'rajesh-ml-sagemaker'
training_file_key = 'housing/train.csv'
validation_file_key = 'housing/validation.csv'
test_file_key = 'housing/test.csv'

# Full S3 URIs: the prefix for model artifacts, then one URI per dataset split
s3_model_output_location = 's3://' + bucket_name + '/housing/model'
s3_training_file_location, s3_validation_file_location, s3_test_file_location = (
    's3://{}/{}'.format(bucket_name, object_key)
    for object_key in (training_file_key, validation_file_key, test_file_key)
)

In [4]:
# Show the resolved S3 locations, one per line
print(s3_model_output_location,
      s3_training_file_location,
      s3_validation_file_location,
      s3_test_file_location,
      sep='\n')

s3://rajesh-ml-sagemaker/housing/model
s3://rajesh-ml-sagemaker/housing/train.csv
s3://rajesh-ml-sagemaker/housing/validation.csv
s3://rajesh-ml-sagemaker/housing/test.csv


In [5]:
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3.
# A file name is referred to as a key name in S3.
# Files stored in S3 are automatically replicated across 3 different availability zones
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key (the S3 "file name") to store the content under.

    Returns:
        Whatever ``upload_fileobj`` returns for the upload call.
    """
    # Open in binary mode so the bytes are uploaded unchanged
    with open(filename, 'rb') as local_file:
        s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return s3_object.upload_fileobj(local_file)

In [6]:
# Upload each prepared CSV (expected in the working directory) to its S3 key
for local_csv, s3_key in (('train_prep.csv', training_file_key),
                          ('validation_prep.csv', validation_file_key),
                          ('test_prep.csv', test_file_key)):
    write_to_s3(local_csv, bucket_name, s3_key)

In [54]:
# Registry paths for the XGBoost algorithm image provided by SageMaker, keyed by region
#  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# Each region hosts the image under its own ECR registry account id.
_xgboost_registry_accounts = {'us-west-2': '433757028032',
                              'us-east-1': '811284229777',
                              'us-east-2': '825641698319'}
containers = {
    region_name: '{}.dkr.ecr.{}.amazonaws.com/xgboost:latest'.format(account_id, region_name)
    for region_name, account_id in _xgboost_registry_accounts.items()
}

In [55]:
# Resolve the IAM execution role for this notebook; it is passed to the Estimator below.
role = get_execution_role()

In [56]:
# Show the resolved role ARN.
# NOTE(review): the printed ARN includes the AWS account id — consider clearing
# this output before sharing the notebook.
print(role)

arn:aws:iam::828396155196:role/service-role/AmazonSageMaker-ExecutionRole-20190926T223473


In [57]:
# SageMaker session object; handed to the Estimator through `sagemaker_session`.
sess = sagemaker.Session()

In [58]:
# Region of the default boto3 session; this value is the lookup key into `containers`.
print(boto3.Session().region_name)

us-east-1


In [59]:
# Build the training estimator for SageMaker's built-in XGBoost algorithm.
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
#
# Container image: taken from the hard-coded `containers` map when the current
# region is listed there. Any other region falls back to get_image_uri, so this
# cell no longer raises KeyError outside us-west-2 / us-east-1 / us-east-2.
#
# To train with the newer container instead, build the URI with:
#   get_image_uri(region, 'xgboost', repo_version='0.90-1')
region = boto3.Session().region_name  # resolve once instead of on every use
container = containers.get(region)
if container is None:
    container = get_image_uri(region, 'xgboost', repo_version='latest')

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,                # how many instances to train on
    train_instance_type='ml.m4.xlarge',    # machine type of each training instance
    output_path=s3_model_output_location,  # where the trained model artifacts are stored
    sagemaker_session=sess,
    base_job_name='xgboost-housing-v1',    # optional name prefix for the training job
)

In [60]:
# Hyperparameters for the XGBoost training job.
# XGBoost Training Parameter Reference:
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
#
# Earlier configuration, kept for comparison:
#   max_depth=5,eta=0.1,subsample=0.7,num_round=150
#
# NOTE(review): `eta` and `learning_rate` are both set, to different values; the
# XGBoost parameter reference lists learning_rate as an alias of eta — confirm
# which value is intended.
# NOTE(review): importance_type, missing, n_estimators, n_jobs and random_state
# look like scikit-learn wrapper arguments rather than training parameters —
# verify they are needed here.
xgb_hyperparameters = {
    'base_score': 0.5,
    'booster': 'gbtree',
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'colsample_bytree': 0.4313,
    'eta': 0.0466,
    'gamma': 0.0469,
    'importance_type': 'gain',
    'learning_rate': 0.1,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1.5872,
    'missing': None,
    'n_estimators': 520,
    'n_jobs': 1,
    'num_round': 520,
    'objective': 'reg:linear',
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'seed': 77,
    'silent': 0,
    'subsample': 0.535,
    'verbosity': 1,
}
estimator.set_hyperparameters(**xgb_hyperparameters)

In [61]:
# Echo the hyperparameters currently set on the estimator to check them before training.
estimator.hyperparameters()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 0.4313,
 'eta': 0.0466,
 'gamma': 0.0469,
 'importance_type': 'gain',
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1.5872,
 'missing': None,
 'n_estimators': 520,
 'n_jobs': 1,
 'num_round': 520,
 'objective': 'reg:linear',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 77,
 'silent': 0,
 'subsample': 0.535,
 'verbosity': 1}

In [62]:
# Describe the training and validation channels as CSV inputs read from S3.
# (content type can be libsvm or csv for XGBoost)
training_input_config, validation_input_config = (
    sagemaker.session.s3_input(s3_data=s3_location, content_type="csv")
    for s3_location in (s3_training_file_location, s3_validation_file_location)
)

In [63]:
# Show the channel configuration of each input, one per line
print(training_input_config.config, validation_input_config.config, sep='\n')

{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://rajesh-ml-sagemaker/housing/train.csv'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://rajesh-ml-sagemaker/housing/validation.csv'}}, 'ContentType': 'csv'}


In [64]:
# Launch the training job. XGBoost supports "train" and "validation" channels.
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
data_channels = {
    'train': training_input_config,
    'validation': validation_input_config,
}
estimator.fit(data_channels)

2019-10-03 00:54:07 Starting - Starting the training job...
2019-10-03 00:54:09 Starting - Launching requested ML instances......
2019-10-03 00:55:12 Starting - Preparing the instances for training......
2019-10-03 00:56:15 Downloading - Downloading input data...
2019-10-03 00:57:06 Training - Training image download completed. Training in progress..[31mArguments: train[0m
[31m[2019-10-03:00:57:07:INFO] Running standalone xgboost training.[0m
[31m[2019-10-03:00:57:07:INFO] File size need to be processed in the node: 1.07mb. Available memory size in the node: 8603.45mb[0m
[31m[2019-10-03:00:57:07:INFO] Determined delimiter of CSV input is ','[0m
[31m[00:57:07] S3DistributionType set as FullyReplicated[0m
[31m[00:57:07] 1008x302 matrix with 304416 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-10-03:00:57:07:INFO] Determined delimiter of CSV input is ','[0m
[31m[00:57:07] S3DistributionType set as FullyReplicated[0m
[31m[0


2019-10-03 00:57:18 Uploading - Uploading generated training model
2019-10-03 00:57:18 Completed - Training job completed
Training seconds: 63
Billable seconds: 63
