# XGBoost Forecasting

Use the XGBoost algorithm to forecast the solar energy output of each solar station from the GEFS numerical simulation data, aggregated to the daily level and averaged over all ensemble members.

In [15]:
%matplotlib inline

import datetime
import json
import os
import random
import sys
from time import gmtime, strftime

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)

Load some SageMaker specific variables

In [4]:
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()

Load the consolidated data and drop the ensemble summary columns, which are not needed here.

In [7]:
df = pd.read_csv("../data/daily_training_data_combined.csv")
df.drop(
    [
        "ens mean", 
        "ens max", 
        "ens min", 
        "ens std"
    ], 
    axis=1,
    inplace=True
)
df.head()

Unnamed: 0,lat,lon,time,Upward_Long-Wave_Rad_Flux_surface mean,Upward_Long-Wave_Rad_Flux_surface min,Upward_Long-Wave_Rad_Flux_surface std,Upward_Long-Wave_Rad_Flux_surface max,Upward_Short-Wave_Rad_Flux mean,Upward_Short-Wave_Rad_Flux min,Upward_Short-Wave_Rad_Flux std,...,Pressure mean,Pressure min,Pressure std,Pressure max,datetime,station,distance_km,gefs_lat,gefs_lon,measured_solar_output
0,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,102217.904727,101916.42,184.451255,102476.81,19940101,ACME,856.245389,31.0,254.0,12384900
1,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,102217.904727,101916.42,184.451255,102476.81,19940101,ADAX,967.568287,31.0,254.0,11930700
2,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,102217.904727,101916.42,184.451255,102476.81,19940101,APAC,840.192442,31.0,254.0,12301200
3,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,102217.904727,101916.42,184.451255,102476.81,19940101,BIXB,1088.982975,31.0,254.0,11182800
4,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,102217.904727,101916.42,184.451255,102476.81,19940101,BLAC,1029.357771,31.0,254.0,10848300


Let's encode the station variable, since it's a categorical variable here. This way, we'll train a regressor for all the solar sites at once.

In [8]:
df = pd.get_dummies(df, columns=['station'])
df.head()

Unnamed: 0,lat,lon,time,Upward_Long-Wave_Rad_Flux_surface mean,Upward_Long-Wave_Rad_Flux_surface min,Upward_Long-Wave_Rad_Flux_surface std,Upward_Long-Wave_Rad_Flux_surface max,Upward_Short-Wave_Rad_Flux mean,Upward_Short-Wave_Rad_Flux min,Upward_Short-Wave_Rad_Flux std,...,station_VINI,station_WASH,station_WATO,station_WAUR,station_WEAT,station_WEST,station_WILB,station_WIST,station_WOOD,station_WYNO
0,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,0,0,0,0,0,0,0,0,0,0
1,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,0,0,0,0,0,0,0,0,0,0
2,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,0,0,0,0,0,0,0,0,0,0
3,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,0,0,0,0,0,0,0,0,0,0
4,31.0,254.0,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df["time"].max()

'2007-12-31'

Let's move the `measured_solar_output` column to be the first one. The XGBoost regressor expects the first column to be the target to predict, with the remaining columns being the features; we'll simultaneously remove the GEFS lat and lon coordinates, since they aren't useful features here.

In [10]:
dataset_df = pd.concat(
    [df['measured_solar_output'], df.drop(['measured_solar_output', 'lat', 'lon'], axis=1)], 
    axis=1
) 
dataset_df.head()

Unnamed: 0,measured_solar_output,time,Upward_Long-Wave_Rad_Flux_surface mean,Upward_Long-Wave_Rad_Flux_surface min,Upward_Long-Wave_Rad_Flux_surface std,Upward_Long-Wave_Rad_Flux_surface max,Upward_Short-Wave_Rad_Flux mean,Upward_Short-Wave_Rad_Flux min,Upward_Short-Wave_Rad_Flux std,Upward_Short-Wave_Rad_Flux max,...,station_VINI,station_WASH,station_WATO,station_WAUR,station_WEAT,station_WEST,station_WILB,station_WIST,station_WOOD,station_WYNO
0,12384900,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,158.0,...,0,0,0,0,0,0,0,0,0,0
1,11930700,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,158.0,...,0,0,0,0,0,0,0,0,0,0
2,12301200,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,158.0,...,0,0,0,0,0,0,0,0,0,0
3,11182800,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,158.0,...,0,0,0,0,0,0,0,0,0,0
4,10848300,1994-01-01,367.065219,324.0,40.295247,422.75583,68.781818,0.0,63.209438,158.0,...,0,0,0,0,0,0,0,0,0,0


For algorithms that rely on distance metrics, we'd normally preprocess the feature values by standardizing them with something like scikit-learn's StandardScaler. Since the XGBoost algorithm relies fundamentally on decision trees, feature scaling isn't necessary here. XGBoost also handles missing data values natively.

Split the data into a training and a validation set. Data before 2007-01-01 will form the training dataset, while data on or after that date will be the validation data.

In [11]:
split_time = '2007-01-01'
training_dataset_df = dataset_df[dataset_df["time"] < split_time].copy(deep=True)
validation_dataset_df = dataset_df[dataset_df["time"] >= split_time].copy(deep=True)

Let's write the data to csv files and then send it to the relevant S3 bucket for retrieval by SageMaker during training and tuning.

In [13]:
training_dataset_df.to_csv('../data/xgboost/train.csv', header=False, index=False)
validation_dataset_df.to_csv('../data/xgboost/validation.csv', header=False, index=False)

bucket = 'bright-idea'
prefix = 'xgboost'

s3_input_train = (
    boto3.Session().resource('s3')
    .Bucket(bucket)
    .Object(os.path.join(prefix, 'train/train.csv'))
    .upload_file('../data/xgboost/train.csv')
)
s3_input_validation = (
    boto3.Session().resource('s3')
    .Bucket(bucket)
    .Object(os.path.join(prefix, 'validation/validation.csv'))
    .upload_file('../data/xgboost/validation.csv')
)

Let's train!

In [16]:
# Create an XGBoost estimator
xgb = sagemaker.estimator.Estimator(
    image_name=get_image_uri(boto3.Session().region_name, 'xgboost'),
    role=role, 
    train_instance_count=1, 
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
    base_job_name='bright-idea-xgboost-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime()),
)

# Set the specific hyperparameters
xgb.set_hyperparameters(
    eta=0.2,
    objective='reg:linear', # This is a regression problem
    num_round=50,
    max_depth=5,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
    silent=0,
    eval_metric="mae" # Use the MAE evaluation metric for reference to the Kaggle leaderboard
)

# Train the model using the training and validation data
xgb.fit({
    'train': sagemaker.session.s3_input(
        s3_data='s3://{0}/{1}/train/train.csv'.format(bucket, prefix),
        content_type='csv'
    ), 
    'validation': sagemaker.session.s3_input(
        s3_data='s3://{0}/{1}/validation/validation.csv'.format(bucket, prefix),
        content_type='csv'
    )
})

INFO:sagemaker:Creating training-job with name: bright-idea-xgboost-2019-02-17-07-06-08-2019-02-17-07-06-08-380


2019-02-17 07:06:08 Starting - Starting the training job...
2019-02-17 07:06:11 Starting - Launching requested ML instances......
2019-02-17 07:07:15 Starting - Preparing the instances for training......
2019-02-17 07:08:36 Downloading - Downloading input data...
2019-02-17 07:09:09 Training - Downloading the training image.
[31mArguments: train[0m
[31m[2019-02-17:07:09:14:INFO] Running standalone xgboost training.[0m
[31m[2019-02-17:07:09:14:INFO] File size need to be processed in the node: 504.21mb. Available memory size in the node: 8430.14mb[0m
[31m[2019-02-17:07:09:14:INFO] Determined delimiter of CSV input is ','[0m
[31m[07:09:14] S3DistributionType set as FullyReplicated[0m
[31m[07:09:16] 465304x163 matrix with 75844552 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-02-17:07:09:16:INFO] Determined delimiter of CSV input is ','[0m
[31m[07:09:16] S3DistributionType set as FullyReplicated[0m
[31m[07:09:16] 35770x163 

[31m[07:10:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[37]#011train-mae:3.82975e+06#011validation-mae:4.58551e+06[0m
[31m[07:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[38]#011train-mae:3.82259e+06#011validation-mae:4.58877e+06[0m
[31m[07:10:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[39]#011train-mae:3.81681e+06#011validation-mae:4.58642e+06[0m
[31m[07:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[40]#011train-mae:3.80702e+06#011validation-mae:4.5888e+06[0m
[31m[07:10:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[41]#011train-mae:3.7998e+06#011validation-mae:4.57974e+06[0m
[31m[07:10:26] src/tree/updater_prune.cc:74: tree p

In [51]:
print(xgb.training_job_analytics.dataframe(force_refresh=True))



      metric_name  timestamp         value
0  validation:mae        0.0  4.636516e+06
1  validation:mae       60.0  2.282878e+06
2  validation:mae      120.0  2.251200e+06


### Hyperparameter Tuning

In [52]:
objective_metric_name = 'Validation-mae'
metric_definitions = [{'Name': 'Validation-mae',
                       'Regex': 'Validation-mae:([0-9\\.]+)'}]

In [53]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter
hyperparameter_ranges = {
    'eta': ContinuousParameter(0.0, 1.0),
    'min_child_weight': ContinuousParameter(1.0, 10.0),
    'alpha': ContinuousParameter(0.0, 2.0),
    'max_depth': IntegerParameter(1, 10)
}

In [None]:
xgb_tune = sagemaker.estimator.Estimator(
    image_name=get_image_uri(boto3.Session().region_name, 'xgboost'),
    role=role, 
    train_instance_count=1, 
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
    base_job_name='scientist-12-xgboost-regression-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime()),
)

# Create the hyperparameter tuner
tuner = HyperparameterTuner(
    xgb_tune,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=20,
    max_parallel_jobs=3
)

# Create and run the hyperparameter tuning jobs
tuner.fit({
    'train': sagemaker.session.s3_input(
        s3_data='s3://{0}/{1}/train/train.csv'.format(bucket, prefix),
        content_type='csv'
    ), 
    'validation': sagemaker.session.s3_input(
        s3_data='s3://{0}/{1}/validation/validation.csv'.format(bucket, prefix),
        content_type='csv'
    )
})

### Deploy Model to Endpoint

In [None]:
# Deploy the model to an endpoint
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')