In [1]:
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role

## Upload Data to S3

In [2]:
# Bucket that holds every artifact of this experiment (change to your own bucket).
bucket_name = 'lifa08-ml-sagemaker'

# Key prefixes, relative to the bucket, for each data split.
training_folder = r'ashrae/training/'
validation_folder = r'ashrae/validation/'
test_folder = r'ashrae/test/'

# Fully qualified S3 URIs: one for the model output, one per data split.
s3_model_output_location = r's3://{0}/ashrae/model'.format(bucket_name)
s3_training_file_location, s3_validation_file_location, s3_test_file_location = (
    r's3://{0}/{1}'.format(bucket_name, folder)
    for folder in (training_folder, validation_folder, test_folder)
)

In [3]:
# Echo the resolved S3 URIs, one per line, as a sanity check before training.
print(
    s3_model_output_location,
    s3_training_file_location,
    s3_validation_file_location,
    s3_test_file_location,
    sep='\n',
)

s3://lifa08-ml-sagemaker/ashrae/model
s3://lifa08-ml-sagemaker/ashrae/training/
s3://lifa08-ml-sagemaker/ashrae/validation/
s3://lifa08-ml-sagemaker/ashrae/test/


## Training Algorithm Docker Image

In [4]:
# Establish a session with AWS.
# Later cells read the region from this session (sess.boto_region_name)
# and hand the session itself to the Estimator.
sess = sagemaker.Session()

In [5]:
# Fetch the execution role for this notebook; it is passed to the Estimator
# further down (the recorded output of the next cell shows it as an IAM role ARN).
role = get_execution_role()

In [6]:
# Show the execution role.
# This role contains the permissions needed to train and deploy models;
# the SageMaker service is trusted to assume this role.
print(role)

arn:aws:iam::546778850930:role/service-role/AmazonSageMaker-ExecutionRole-20191206T232794


In [7]:
# Look up the built-in XGBoost training image for the region this session is bound to.
# NOTE(review): the recorded output of this cell carries the hint
# "get_image_uri(region, 'xgboost', '0.90-1')" - consider pinning an explicit
# version instead of "latest" once compatibility with the rest of the notebook
# has been confirmed.
aws_region = sess.boto_region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(aws_region, "xgboost", "latest")

print('Using SageMaker XGBoost container:\n{} ({})'.format(container, aws_region))

	get_image_uri(region, 'xgboost', '0.90-1').


Using SageMaker XGBoost container:
811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest (us-east-1)


## Build Model

In [8]:
# Training-job settings, grouped so they can be reviewed (and tuned) in one place.
estimator_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.m4.10xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-ashrae-v1',
)

# Generic Estimator built from the XGBoost image URI and the execution role.
estimator = sagemaker.estimator.Estimator(container, role, **estimator_settings)

In [9]:
# Hyperparameters handed to the XGBoost training container.
# NOTE(review): max_depth=60 combined with num_round=6 is an unusual pairing -
# confirm these values are intentional.
estimator.set_hyperparameters(
    max_depth=60,
    objective="reg:linear",
    eta=0.1,
    num_round=6,
)

In [10]:
# Display the hyperparameters currently set on the estimator as a sanity check.
estimator.hyperparameters()

{'max_depth': 60, 'objective': 'reg:linear', 'eta': 0.1, 'num_round': 6}

### Specify Training Data Location and, Optionally, Validation Data Location

In [11]:
def _csv_prefix_input(s3_prefix):
    """Build the input configuration for every CSV object under ``s3_prefix``."""
    # content type can be libsvm or csv for XGBoost
    return sagemaker.session.s3_input(
        s3_data=s3_prefix,
        content_type='csv',
        s3_data_type='S3Prefix',
    )


training_input_config = _csv_prefix_input(s3_training_file_location)
validation_input_config = _csv_prefix_input(s3_validation_file_location)

# Channel name -> input configuration, as passed to estimator.fit().
data_channels = dict(train=training_input_config, validation=validation_input_config)

In [12]:
# Dump both channel configurations (training first, then validation), one per line.
print(
    training_input_config.config,
    validation_input_config.config,
    sep='\n',
)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://lifa08-ml-sagemaker/ashrae/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://lifa08-ml-sagemaker/ashrae/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


### Train the model

In [13]:
# Start the training job on the train/validation channels defined above.
# The job status and the container's log are streamed into this cell's output.
estimator.fit(data_channels)

2020-01-11 08:18:47 Starting - Starting the training job...
2020-01-11 08:18:48 Starting - Launching requested ML instances......
2020-01-11 08:19:54 Starting - Preparing the instances for training...
2020-01-11 08:20:37 Downloading - Downloading input data...
2020-01-11 08:21:14 Training - Downloading the training image..
Arguments: train
[2020-01-11:08:21:30:INFO] Running standalone xgboost training.
[2020-01-11:08:21:30:INFO] File size need to be processed in the node: 1198.19mb. Available memory size in the node: 152546.29mb
[2020-01-11:08:21:30:INFO] Determined delimiter of CSV input is ','
[08:21:30] S3DistributionType set as FullyReplicated

2020-01-11 08:21:28 Training - Training image download completed. Training in progress.
[08:21:33] 13618907x9 matrix with 122570163 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,
[2020-01-11:08:21:33:INFO] Determined delimiter of CSV input is ','

## Deploy Model

In [15]:
# Deploy the trained model to an endpoint with a fixed name.
# NOTE(review): the endpoint name is hard-coded - presumably re-running this cell
# while that endpoint still exists will fail; confirm, and delete the endpoint
# when it is no longer needed.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.10xlarge',
    endpoint_name='xgboost-ashrae-v5',
)



---------------------------------------------------------------------------------------------------------------!

## Run Predictions

In [16]:
# NOTE(review): json_deserializer is imported but not used in the cells shown here;
# this import would normally live in the notebook's first (imports) cell.
from sagemaker.predictor import csv_serializer, json_deserializer

# Send request payloads as CSV text.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
# No deserializer is installed: the recorded prediction below comes back as raw bytes.
predictor.deserializer = None

In [17]:
# Example test data for row_id = 0
# A single row of 9 values is sent; the recorded response is a bytes string.
# NOTE(review): presumably the values follow the same column order as the
# training features - confirm against the data preparation step.
predictor.predict([[0,0,8.913685,17.8,4.0,11.7,0.0,0,6]])

b'0.804454624653'