In [5]:
%matplotlib inline

import sys
import json
import random
import datetime
import os

import boto3
import sagemaker
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sagemaker.amazon.amazon_estimator import get_image_uri

In [None]:
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

Load the consolidated data and drop some unuseful columns

In [16]:
df = pd.read_csv("../data/df_combined.csv", index_col=0)
df.drop(
    [
        "ens mean", 
        "ens max", 
        "ens min", 
        "ens std", 
        "lat mean", 
        "lat std", 
        "lat max",
        "lon mean", 
        "lon std", 
        "lon max",
        "intTime mean", 
        "intTime max", 
        "intTime min", 
        "intTime std",
        "intValidTime mean", 
        "intValidTime max", 
        "intValidTime min", 
        "intValidTime std"
    ], 
    axis=1,
    inplace=True
)
df.rename(columns={"lat min": "gefs_lat", "lon min": "gefs_lon"}, inplace=True)
df.head()

Unnamed: 0,station,time,gefs_lat,gefs_lon,Minimum_temperature mean,Minimum_temperature max,Minimum_temperature min,Minimum_temperature std,Maximum_temperature mean,Maximum_temperature max,...,Downward_Long-Wave_Rad_Flux std,Upward_Long-Wave_Rad_Flux_surface mean,Upward_Long-Wave_Rad_Flux_surface max,Upward_Long-Wave_Rad_Flux_surface min,Upward_Long-Wave_Rad_Flux_surface std,Total_precipitation mean,Total_precipitation max,Total_precipitation min,Total_precipitation std,solaroutput
0,ACME,1994-01-01,35.0,262.0,280.2606,286.53845,277.96167,3.016652,284.39703,288.03134,...,10.202597,364.31976,405.96344,334.0,26.315298,0.0,0.0,0.0,0.0,12384900
1,ACME,1994-01-02,35.0,262.0,279.7873,286.3844,276.45343,3.498222,283.7927,288.81647,...,23.408524,358.0753,401.507,328.2113,25.584993,0.031273,0.42,0.0,0.083112,11908500
2,ACME,1994-01-03,35.0,262.0,273.05588,278.22974,269.2065,3.634358,277.42554,282.6646,...,14.07812,335.1684,379.66568,299.022,30.185274,0.0,0.0,0.0,0.0,12470700
3,ACME,1994-01-04,35.0,262.0,275.1715,278.388,272.8698,1.632241,278.94736,281.91614,...,15.885374,342.49518,370.78842,318.54144,16.494534,0.000909,0.05,0.0,0.006742,12725400
4,ACME,1994-01-05,35.0,262.0,278.55487,285.22772,273.70966,5.200747,283.422,292.34055,...,19.956627,353.97708,402.53543,314.00162,36.71089,0.0,0.0,0.0,0.0,10894800


Let's encode the station variable, since it's a categorical variable here. This way, we'll train a regressor for all the solar sites at once.

In [17]:
df = pd.get_dummies(df, columns=['station'])
df.head()

Unnamed: 0,time,gefs_lat,gefs_lon,Minimum_temperature mean,Minimum_temperature max,Minimum_temperature min,Minimum_temperature std,Maximum_temperature mean,Maximum_temperature max,Maximum_temperature min,...,station_VINI,station_WASH,station_WATO,station_WAUR,station_WEAT,station_WEST,station_WILB,station_WIST,station_WOOD,station_WYNO
0,1994-01-01,35.0,262.0,280.2606,286.53845,277.96167,3.016652,284.39703,288.03134,279.33362,...,0,0,0,0,0,0,0,0,0,0
1,1994-01-02,35.0,262.0,279.7873,286.3844,276.45343,3.498222,283.7927,288.81647,277.86566,...,0,0,0,0,0,0,0,0,0,0
2,1994-01-03,35.0,262.0,273.05588,278.22974,269.2065,3.634358,277.42554,282.6646,270.6221,...,0,0,0,0,0,0,0,0,0,0
3,1994-01-04,35.0,262.0,275.1715,278.388,272.8698,1.632241,278.94736,281.91614,275.8865,...,0,0,0,0,0,0,0,0,0,0
4,1994-01-05,35.0,262.0,278.55487,285.22772,273.70966,5.200747,283.422,292.34055,274.21594,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df["time"].max()

'2007-12-31'

Let's move the solaroutput column to be the first one. The XGBoost regressor expects the first column to be the target to predict, with the remaining/following columns being the features; we'll simulaneously remove the gefs lat and lon coordinates as well, since they aren't useful features here.

In [19]:
dataset_df = pd.concat(
    [df['solaroutput'], df.drop(['solaroutput', 'gefs_lat', 'gefs_lon'], axis=1)], 
    axis=1
) 
dataset_df.head()

For algorithms that rely on distance metrics, we'd normally preprocess the feature values by standardizing them with something like scikit-learn's StandardScaler. Since the XGBoost algorithm relies fundamentally on decision trees, feature scaling isn't necessary here. XGBoost also naturally handles missing data values as well. 

Split the data into a training and a validation set. Data before 2007-01-01 will form the training dataset, while data after will be the validation data. 

In [54]:
split_time = '2007-01-01'
training_dataset_df = dataset_df[dataset_df["time"] < split_time].copy(deep=True)
validation_dataset_df = dataset_df[dataset_df["time"] >= split_time].copy(deep=True)

Let's write the data to csv files and then send it to the relevant S3 bucket for retrieval by SageMaker during training and tuning.

In [40]:
training_dataset_df.to_csv('../data/xgboost_train.csv', header=False, index=False)
validation_dataset_df.to_csv('../data/xgboost_validation.csv', header=False, index=False)

bucket = 'aws-brightidea'
prefix = 'scientist_12/xgboost_regressor_mesonet_solaroutput'

s3_input_train = (
    boto3.Session().resource('s3')
    .Bucket(bucket)
    .Object(os.path.join(prefix, 'train/train.csv'))
    .upload_file('../data/xgboost_train.csv')
)
s3_input_validation = (
    boto3.Session().resource('s3')
    .Bucket(bucket)
    .Object(os.path.join(prefix, 'validation/validation.csv'))
    .upload_file('../data/xgboost_validation.csv')
)

Let's train!

In [45]:
# Create an XGBoost estimator
xgb = sagemaker.estimator.Estimator(
    image_name=get_image_uri(boto3.Session().region_name, 'xgboost'),
    role=role, 
    train_instance_count=1, 
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
    base_job_name='scientist-12-xgboost-regression-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime()),
)

# Set the specific hyperparameters
xgb.set_hyperparameters(
    eta=0.2,
    objective='reg:linear', # This is a regression problem
    num_round=50,
    max_depth=5,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
    silent=0,
    eval_metric="mae" # Use the MAE evaluation metric for reference to the Kaggle leaderboard
)

# Train the model using the training and validation data
xgb.fit({
    'train': sagemaker.session.s3_input(
        s3_data='s3://{0}/{1}/train/train.csv'.format(bucket, prefix),
        content_type='csv'
    ), 
    'validation': sagemaker.session.s3_input(
        s3_data='s3://{0}/{1}/validation/validation.csv'.format(bucket, prefix),
        content_type='csv'
    )
})

INFO:sagemaker:Creating training-job with name: scientist-12-xgboost-regression-2019-02-2019-02-13-22-27-54-560


2019-02-13 22:27:54 Starting - Starting the training job...
2019-02-13 22:27:55 Starting - Launching requested ML instances.........
2019-02-13 22:29:35 Starting - Preparing the instances for training......
2019-02-13 22:30:46 Downloading - Downloading input data...
2019-02-13 22:31:16 Training - Training image download completed. Training in progress.
[31mArguments: train[0m
[31m[2019-02-13:22:31:17:INFO] Running standalone xgboost training.[0m
[31m[2019-02-13:22:31:17:INFO] File size need to be processed in the node: 397.88mb. Available memory size in the node: 8394.09mb[0m
[31m[2019-02-13:22:31:17:INFO] Determined delimiter of CSV input is ','[0m
[31m[22:31:17] S3DistributionType set as FullyReplicated[0m
[31m[22:31:19] 465304x159 matrix with 73983336 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-02-13:22:31:19:INFO] Determined delimiter of CSV input is ','[0m
[31m[22:31:19] S3DistributionType set as FullyReplicated[0

[31m[22:32:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[39]#011train-mae:2.12015e+06#011validation-mae:2.26017e+06[0m
[31m[22:32:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[40]#011train-mae:2.11657e+06#011validation-mae:2.26197e+06[0m
[31m[22:32:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[41]#011train-mae:2.11272e+06#011validation-mae:2.26131e+06[0m
[31m[22:32:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[42]#011train-mae:2.10901e+06#011validation-mae:2.25816e+06[0m
[31m[22:32:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[43]#011train-mae:2.10822e+06#011validation-mae:2.25789e+06[0m
[31m[22:32:55] src/tree/updater_prune.cc:74: tree

In [51]:
print(xgb.training_job_analytics.dataframe(force_refresh=True))



      metric_name  timestamp         value
0  validation:mae        0.0  4.636516e+06
1  validation:mae       60.0  2.282878e+06
2  validation:mae      120.0  2.251200e+06


In [52]:
objective_metric_name = 'Validation-mae'
metric_definitions = [{'Name': 'Validation-mae',
                       'Regex': 'Validation-mae:([0-9\\.]+)'}]

In [53]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter
hyperparameter_ranges = {
    'eta': ContinuousParameter(0.0, 1.0),
    'min_child_weight': ContinuousParameter(1.0, 10.0),
    'alpha': ContinuousParameter(0.0, 2.0),
    'max_depth': IntegerParameter(1, 10)
}

In [None]:
xgb_tune = sagemaker.estimator.Estimator(
    image_name=get_image_uri(boto3.Session().region_name, 'xgboost'),
    role=role, 
    train_instance_count=1, 
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
    base_job_name='scientist-12-xgboost-regression-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime()),
)

# Create the hyperparameter tuner
tuner = HyperparameterTuner(
    xgb_tune,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=20,
    max_parallel_jobs=3
)

# Create and run the hyperparameter tuning jobs
tuner.fit({
    'train': sagemaker.session.s3_input(
        s3_data='s3://{0}/{1}/train/train.csv'.format(bucket, prefix),
        content_type='csv'
    ), 
    'validation': sagemaker.session.s3_input(
        s3_data='s3://{0}/{1}/validation/validation.csv'.format(bucket, prefix),
        content_type='csv'
    )
})

In [None]:
# Deploy the model to an endpoint
#xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')