In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import xgboost as xgb
from xgboost import plot_importance, plot_tree, to_graphviz

import utility_functions as fn

In [None]:
# Read the cleaned energy table; the first CSV column holds the timestamps
# (always on the hour, per the parse format below).
energy_raw = pd.read_csv('data/energy_clean.csv', index_col=0)
energy_raw.index = pd.to_datetime(energy_raw.index, format='%Y-%m-%d %H:00:00')

# Davenport is excluded for now because it is missing most of June and July.
energy_kept = energy_raw.drop(columns='Davenport')

# Collapse the first 11 remaining columns into one row-wise total and keep
# only that total, as a single-column frame named 'Yale_kW'.
yale_total = energy_kept.iloc[:, 0:11].sum(axis=1)
energy = yale_total.to_frame(name='Yale_kW')

In [None]:
# Read the cleaned weather table and parse its on-the-hour timestamps.
weather_raw = pd.read_csv('data/weather_clean.csv', index_col=0)
weather_raw.index = pd.to_datetime(weather_raw.index, format='%Y-%m-%d %H:00:00')

# Offsets 1..23 are handed to the project helper.
# NOTE(review): presumably add_hours_before appends lagged copies of each
# weather column for these hour offsets -- confirm in utility_functions.
lag_hours = np.arange(1, 24)
weather_lagged = fn.add_hours_before(weather_raw, lag_hours)

# Restrict the weather rows to the span covered by the energy index
# (label-based slice, both endpoints inclusive).
first_stamp, last_stamp = energy.index[0], energy.index[-1]
weather = weather_lagged.loc[first_stamp:last_stamp]

In [None]:
# Disabled alternative: load precomputed one-hot date features from CSV and
# trim them to the span of the energy index.
# dates = pd.read_csv('one_hot_dates.csv',index_col=0)
# dates.index = pd.to_datetime(dates.index,format='%Y-%m-%d %H:00:00')
# dates = dates.loc[energy.index[0]:energy.index[len(energy)-1]]

# Active path: build the date features from the energy frame via the helper.
# NOTE(review): presumably date_features derives calendar columns from
# energy's DatetimeIndex and returns a frame on that same index -- confirm
# in utility_functions.
dates = fn.date_features(energy)

In [None]:
# Cut-off timestamp: rows at or before it are training data, rows strictly
# after it are test data.
split_date = '2018-07-26 00:00:00'

target_is_train = energy.index <= split_date
target_is_test = energy.index > split_date

# Copies, so later column additions do not touch `energy` itself.
y_train = energy[target_is_train].copy()
y_test = energy[target_is_test].copy()

In [None]:
# Apply the same cut-off to both feature tables: <= split_date is train,
# > split_date is test.
weather_is_train = weather.index <= split_date
weather_is_test = weather.index > split_date
weather_train = weather[weather_is_train].copy()
weather_test = weather[weather_is_test].copy()

dates_is_train = dates.index <= split_date
dates_is_test = dates.index > split_date
dates_train = dates[dates_is_train].copy()
dates_test = dates[dates_is_test].copy()

# Feature matrices: weather columns with the date columns joined on the
# index (DataFrame.join defaults to a left join, so weather rows drive it).
X_train = weather_train.join(dates_train)
X_test = weather_test.join(dates_test)

In [None]:
# Early stopping needs a monitoring set. Using (X_test, y_test) for that lets
# the test data choose the number of boosting rounds, which leaks test
# information into the model and makes the reported test error optimistic.
# Instead, hold out the tail of the training rows as a validation set.
# NOTE(review): assumes X_train / y_train are row-aligned and in
# chronological order, so the tail is the most recent training period.
VAL_FRACTION = 0.1  # share of training rows reserved for early stopping
n_val = max(1, int(len(X_train) * VAL_FRACTION))
X_fit, X_val = X_train.iloc[:-n_val], X_train.iloc[-n_val:]
y_fit, y_val = y_train.iloc[:-n_val], y_train.iloc[-n_val:]

# Up to 1000 trees; training stops once the validation metric has not
# improved for 50 consecutive rounds.
reg = xgb.XGBRegressor(n_estimators=1000)
# NOTE(review): passing early_stopping_rounds to fit() is deprecated since
# xgboost 1.6 and removed in later releases; move it to the XGBRegressor
# constructor when upgrading.
reg.fit(X_fit, y_fit,
        # With several eval sets, xgboost uses the LAST one for early stopping.
        eval_set=[(X_fit, y_fit), (X_val, y_val)],
        early_stopping_rounds=50,
        verbose=False)

In [None]:
# Bar chart of the fitted model's feature importances, limited to the 15
# top-ranked features (height=0.7 sets the bar thickness). Assigning to `_`
# keeps the returned Axes repr out of the cell output.
_ = plot_importance(reg, height=0.7,max_num_features=15)

In [None]:
# Store the model's test-window predictions alongside the observed values.
test_predictions = reg.predict(X_test)
y_test['Prediction_kW'] = test_predictions

# Stack the test rows (now with predictions) on top of the train rows; train
# rows have no 'Prediction_kW' value, so that column is NaN for them.
yale_all = pd.concat([y_test, y_train], sort=False)

# Observed vs. predicted over the whole period.
columns_to_plot = ['Yale_kW', 'Prediction_kW']
_ = yale_all[columns_to_plot].plot(figsize=(15, 5))

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Flattened to 1-D before use.
    y_pred : array-like
        Predicted values. Flattened to 1-D before use; must contain the same
        number of elements as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries where
        ``y_true`` is non-zero. NaN if there is no such entry (empty input or
        all-zero ``y_true``).

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    """
    # Flatten both inputs so that a column vector (n, 1) paired with a flat
    # (n,) array is compared element by element instead of silently
    # broadcasting to an (n, n) matrix and producing a wrong score.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got "
            "{} and {}".format(y_true.size, y_pred.size)
        )

    # Percentage error is undefined where the observed value is zero; skip
    # those entries rather than letting inf/NaN swamp the mean.
    defined = y_true != 0
    if not defined.any():
        # np.float64 (not a plain float) so callers can still use .round().
        return np.float64(np.nan)

    relative_error = (y_true[defined] - y_pred[defined]) / y_true[defined]
    return np.mean(np.abs(relative_error)) * 100

In [None]:
# Score the test-window predictions against the observed values, then keep
# two decimals for display.
test_mape = mean_absolute_percentage_error(
    y_true=y_test['Yale_kW'],
    y_pred=y_test['Prediction_kW'],
)
MAPE = test_mape.round(2)
MAPE

In [None]:
# Pass the test frame (observed + predicted columns) and a start/end
# timestamp to the project plotting helper. The window starts one hour after
# the train/test cut-off ('2018-07-26 00:00:00') and ends at 23:00 the next day.
# NOTE(review): presumably plot_all draws actual vs. predicted for the given
# window -- confirm in utility_functions.
fn.plot_all(y_test,'2018-07-26 01:00:00','2018-07-27 23:00:00')