# Prediction Using Ensamble Tree

1. Input: a table named "total", which includes information about this hour (temp,weekday..)
2. output: the value of this hour's energy load

## Data Preperation

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import csv
import os
import csv
import numpy as np
import matplotlib.pyplot as plt
import time
from datetime import datetime
import pytz
import math

In [2]:
################
#READ LOAD DATA#
################
# set timezone
local_tz = pytz.timezone('America/Denver')

# read energy load from csv file
energy_load_read = pd.DataFrame.from_csv('test_data/6.csv')['value']

# convert energy load's index from timestamp to datetime format
energy_index = [datetime.fromtimestamp(x,tz=local_tz) for x in energy_load_read.index]

# reset the load's index
energy_load_raw = pd.DataFrame({'Value':list(energy_load_read)},index=energy_index)

# resample to hourly load
energy_load = energy_load_raw.resample("H").mean().fillna(method='ffill')

In [3]:
################
#READ TEMP DATA#
################
# import hourly temp
temp = pd.DataFrame.from_csv('test_data/6_temp.csv')
temp = temp[['WetBulbFarenheit','Date','Time']]

# store temp in data, datetime in index
data = [value for value in temp['WetBulbFarenheit']]
index = []

for j in temp.iterrows():

    c = datetime(int(str(j[1].Date)[0:4]),int(str(j[1].Date)[4:6]),int(str(j[1].Date)[6:8]),int(math.floor(j[1].Time/100)),int(j[1].Time-(math.floor(j[1].Time/100)*100)))
    index.append(c.replace(tzinfo=local_tz))

ts_temp = pd.DataFrame({'Temp':data},index = index)
def filter_num(x):
    try:
        return np.float(x)
    except:
        return np.nan
ts_temp['Temp'] = ts_temp['Temp'].apply(filter_num)
hourly_temp = ts_temp.copy().resample("H").mean().fillna(method='ffill')

In [4]:
total = pd.concat([hourly_temp,energy_load],axis=1).dropna()

In [5]:
total["Month"] = total.index.map(lambda x: x.month)
total["Weekday"] = total.index.map(lambda x: x.isoweekday())
total["Hour"] = total.index.map(lambda x: x.hour)

In [6]:
total.head()

Unnamed: 0,Temp,Value,Month,Weekday,Hour
2012-01-01 00:00:00-07:00,20.666667,34.094725,1,7,0
2012-01-01 01:00:00-07:00,20.333333,34.168583,1,7,1
2012-01-01 02:00:00-07:00,20.0,34.844692,1,7,2
2012-01-01 03:00:00-07:00,20.666667,35.95695,1,7,3
2012-01-01 04:00:00-07:00,19.0,35.518508,1,7,4


In [7]:
# Temp1 means one hour before this hour
move = list(total["Temp"])[:-1]
move.insert(0,np.nan)
total["Temp1"]= move

# Temp2 means two hours before this hour
move = list(total["Temp"])[:-2]
move.insert(0,np.nan)
move.insert(0,np.nan)
total["Temp2"]= move

# Temp3 means three hours before this hour
move = list(total["Temp"])[:-3]
move.insert(0,np.nan)
move.insert(0,np.nan)
move.insert(0,np.nan)
total["Temp3"]= move

In [8]:
total.head()

Unnamed: 0,Temp,Value,Month,Weekday,Hour,Temp1,Temp2,Temp3
2012-01-01 00:00:00-07:00,20.666667,34.094725,1,7,0,,,
2012-01-01 01:00:00-07:00,20.333333,34.168583,1,7,1,20.666667,,
2012-01-01 02:00:00-07:00,20.0,34.844692,1,7,2,20.333333,20.666667,
2012-01-01 03:00:00-07:00,20.666667,35.95695,1,7,3,20.0,20.333333,20.666667
2012-01-01 04:00:00-07:00,19.0,35.518508,1,7,4,20.666667,20.0,20.333333


In [9]:
total = total.dropna()

## Function Preperation--calculate absolute error

In [10]:
def cal_error_for_list(pred,actual):
    error_abs=[]
    for i in range(len(pred)):
        error_abs.append(abs(pred[i]-actual[i]))
    actual_sum = np.array(actual).sum()
    error_sum = np.array(error_abs).sum()
    print "Mape Error = ",float(error_sum)/float(actual_sum)
    return error_sum,actual_sum

## Random Forest

In [11]:
target = total[["Value"]]
feature = total.copy().drop("Value",axis=1)
Y = target.values.reshape(1,-1)[0]
X = feature.values
feature.head()

Unnamed: 0,Temp,Month,Weekday,Hour,Temp1,Temp2,Temp3
2012-01-01 03:00:00-07:00,20.666667,1,7,3,20.0,20.333333,20.666667
2012-01-01 04:00:00-07:00,19.0,1,7,4,20.666667,20.0,20.333333
2012-01-01 05:00:00-07:00,19.0,1,7,5,19.0,20.666667,20.0
2012-01-01 06:00:00-07:00,19.0,1,7,6,19.0,19.0,20.666667
2012-01-01 07:00:00-07:00,18.0,1,7,7,19.0,19.0,19.0


In [12]:
# cal insample error
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(n_estimators=500)
clf = clf.fit(X, Y)
pred = clf.predict(X)
cal_error_for_list(pred,Y)

Mape Error =  0.0227962863501


(6212.4943104425647, 272522.20888181822)

In [13]:
# see which feature is important
print "Importance of each feature\n"
for name,score in zip(feature.columns,clf.feature_importances_ ):
    print name,":",score

Importance of each feature

Temp : 0.0277043980194
Month : 0.0514023019892
Weekday : 0.0663192192687
Hour : 0.738053508303
Temp1 : 0.0235635741863
Temp2 : 0.0352346520052
Temp3 : 0.057722346228


In [14]:
# cal outsample error using random sample points 
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
ss = cross_validation.ShuffleSplit(len(X), 20, 0.05, random_state=0)
error_list = []
actual_list = []

for train,test in ss:
    clf = RandomForestRegressor(n_estimators=500)
    clf = clf.fit(X[train], Y[train])
    pred = clf.predict(X[test])
    error,actual = cal_error_for_list(pred, Y[test])
    error_list.append(error)
    actual_list.append(actual)
error_sum = np.array(error_list).sum()
actual_sum = np.array(actual_list).sum()
print "Total Mape Error = ",float(error_sum)/float(actual_sum)

Mape Error =  0.0637984285351
Mape Error =  0.0640163730192
Mape Error =  0.056468828173
Mape Error =  0.0598899363505
Mape Error =  0.0609153966851
Mape Error =  0.0606542033679
Mape Error =  0.0633497278109
Mape Error =  0.0651670193446
Mape Error =  0.066028074762
Mape Error =  0.0647033525819
Mape Error =  0.0682884924061
Mape Error =  0.0596424937861
Mape Error =  0.0620503825924
Mape Error =  0.0643065358911
Mape Error =  0.062975244471
Mape Error =  0.065511481739
Mape Error =  0.0660761037187
Mape Error =  0.0605120635414
Mape Error =  0.0597143570749
Mape Error =  0.054396850766
Total Mape Error =  0.062435266276


## XGBoost 

In [16]:
import xgboost as xgb

In [17]:
target = total[["Value"]]
feature = total.copy().drop("Value",axis=1)
Y = target.values.reshape(1,-1)[0]
X = feature.values
feature.head()

Unnamed: 0,Temp,Month,Weekday,Hour,Temp1,Temp2,Temp3
2012-01-01 03:00:00-07:00,20.666667,1,7,3,20.0,20.333333,20.666667
2012-01-01 04:00:00-07:00,19.0,1,7,4,20.666667,20.0,20.333333
2012-01-01 05:00:00-07:00,19.0,1,7,5,19.0,20.666667,20.0
2012-01-01 06:00:00-07:00,19.0,1,7,6,19.0,19.0,20.666667
2012-01-01 07:00:00-07:00,18.0,1,7,7,19.0,19.0,19.0


In [18]:
# config

n_estimators = 1000

# encounter overfitting directly
max_depth = 5
min_child_weight = 0.6
gamma =2

# encounter overfitting by add randomness
subsample = 0.3
seed = 3


In [19]:
# insample error
xgb_model = xgb.XGBRegressor(
    n_estimators = n_estimators,
    max_depth=max_depth,
    min_child_weight = min_child_weight,
    gamma=gamma).fit(X,Y)
pred = xgb_model.predict(X)
cal_error_for_list(pred,Y)

Mape Error =  0.0428014809218


(11664.354124217271, 272522.20888181822)

In [20]:
# cal outsample error using random sample
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
ss = cross_validation.ShuffleSplit(len(X), 20, 0.05, random_state=0)
error_list = []
actual_list = []

for train,test in ss:
    xgb_model = xgb.XGBRegressor(
        n_estimators = n_estimators,
        max_depth=max_depth,
        min_child_weight = min_child_weight
        ,gamma=gamma).fit(X[train], Y[train])
    pred = xgb_model.predict(X[test])
    error,actual = cal_error_for_list(pred, Y[test])
    error_list.append(error)
    actual_list.append(actual)
error_sum = np.array(error_list).sum()
actual_sum = np.array(actual_list).sum()
print "Total Mape Error = ",float(error_sum)/float(actual_sum)

Mape Error =  0.0656704654939
Mape Error =  0.0625974930604
Mape Error =  0.0600487899936
Mape Error =  0.0626963399666
Mape Error =  0.0616486078484
Mape Error =  0.0636688584054
Mape Error =  0.0658171422327
Mape Error =  0.0649343954459
Mape Error =  0.0665624503311
Mape Error =  0.0625891742535
Mape Error =  0.0695967593038
Mape Error =  0.0627668279621
Mape Error =  0.0640162909959
Mape Error =  0.0664627171903
Mape Error =  0.0641266409022
Mape Error =  0.064460331614
Mape Error =  0.0679552217653
Mape Error =  0.0614682337631
Mape Error =  0.0602538838426
Mape Error =  0.0553693212928
Total Mape Error =  0.0636387967911
