In [2]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from numpy import concatenate
from sklearn.metrics import mean_squared_error

In [3]:
# read in the data
df = pd.read_csv("./../data/changes.csv")

In [4]:
df.set_index('date', inplace=True)
print(df.shape)
df.head()

(7115050, 47)


Unnamed: 0_level_0,e5gas,latitude,longitude,dautobahn,autobahn,aral,esso,jet,shell,total,...,state_15,state_16,state_17,state_18,state_19,state_20,state_21,state_23,state_25,state_26
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-05-16,1.536647,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-17,1.564,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-18,1.569417,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-19,1.578167,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-20,1.599,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# try removing the averages so as to not include endogenous variables on the right hand side
df.drop(['eurusd', 'vehicles'], axis=1, inplace=True)
print(df.shape)
df.head()

(7115050, 45)


Unnamed: 0_level_0,e5gas,latitude,longitude,dautobahn,autobahn,aral,esso,jet,shell,total,...,state_15,state_16,state_17,state_18,state_19,state_20,state_21,state_23,state_25,state_26
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-05-16,1.536647,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-17,1.564,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-18,1.569417,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-19,1.578167,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-20,1.599,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# replace the oil prices for the last 30 days with the predictions
oil = pd.read_csv('./../data/linear_oil_yhat.csv')

last_30 = ['2015-11-12', '2015-11-13', '2015-11-14', '2015-11-15', '2015-11-16', '2015-11-17', '2015-11-18',
          '2015-11-19', '2015-11-20', '2015-11-21', '2015-11-22', '2015-11-23', '2015-11-24', '2015-11-25',
          '2015-11-26', '2015-11-27', '2015-11-28', '2015-11-29', '2015-11-30', '2015-12-01', '2015-12-02',
          '2015-12-03', '2015-12-04', '2015-12-05', '2015-12-06', '2015-12-07', '2015-12-08', '2015-12-09',
          '2015-12-10', '2015-12-10']

for index, date in enumerate(last_30):
    df.loc[date, 'rotterdam'] = oil['rot_yhat'][index]
    df.loc[date, 'brent'] = oil['brent_yhat'][index]
    df.loc[date, 'wti'] = oil['wti_yhat'][index]

In [7]:
# now order needs to be by day rather than by station -- reorder by num_days
df.sort_values(by=['num_days', 'station'], inplace=True)
df.head()

Unnamed: 0_level_0,e5gas,latitude,longitude,dautobahn,autobahn,aral,esso,jet,shell,total,...,state_15,state_16,state_17,state_18,state_19,state_20,state_21,state_23,state_25,state_26
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-05-16,1.536647,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-16,1.537824,53.519798,10.0004,538.870667,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-16,1.588412,48.8946,10.0005,12108.902344,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-16,1.607947,49.9118,10.0018,270.597382,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-16,1.54525,49.793301,10.0023,3399.334473,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# split into train and test sets
day_30 = df['num_days'].unique()[-30]
train = df.loc[df['num_days'] < day_30]
test = df.loc[df['num_days'] >= day_30]

In [9]:
train.head()

Unnamed: 0_level_0,e5gas,latitude,longitude,dautobahn,autobahn,aral,esso,jet,shell,total,...,state_15,state_16,state_17,state_18,state_19,state_20,state_21,state_23,state_25,state_26
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-05-16,1.536647,51.157501,10.0002,14850.392578,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-16,1.537824,53.519798,10.0004,538.870667,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-16,1.588412,48.8946,10.0005,12108.902344,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-16,1.607947,49.9118,10.0018,270.597382,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2014-05-16,1.54525,49.793301,10.0023,3399.334473,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# split into input and outputs
train_X, train_y = train[train.columns.difference(['e5gas', 'station'])], train['e5gas']
test_X, test_y = test[test.columns.difference(['e5gas', 'station'])], test['e5gas']
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(6743830, 43) (6743830,) (371220, 43) (371220,)


In [16]:
#fit random forest
model = RandomForestRegressor(max_depth=25, random_state=0, n_estimators=50, n_jobs=-1)

In [17]:
model.fit(train_X, train_y.ravel())

KeyboardInterrupt: 

In [14]:
# make a prediction
yhat = model.predict(test_X)

In [15]:
# calculate RMSE
rmse = sqrt(mean_squared_error(test_y, yhat))
print('Test RMSE: %.3f' % rmse)

Test RMSE: 0.027


With the changes!
* 25 depth, 10 est: 0.027

No lag
* 14 depth, 100 est: 0.034
* 15 depth, 100 est: 0.033

With 130-30 days training
* 15 depth, 100 est: 0.030
* 16 depth, 100 est: 0.030
* 16 depth, 150 est: 0.030
* 17 depth, 100 est: 0.029
* 18 depth, 100 est: 0.028
* 19 depth, 100 est: 0.028
* 20 depth, 100 est: 0.027
* 21 depth, 100 est: 0.027
* 22 depth, 100 est: 0.026
* 25 depth, 100 est: 0.025
* 30 depth, 100 est: 0.025
* 35 depth, 100 est: 0.025
* 50 depth, 100 est: 0.025
* 50 depth, 150 est: 0.025

In [15]:
# each day to make a graph of the error
ob = pd.DataFrame(inv_y)
pred = pd.DataFrame(inv_yhat)
pred[15] = round(pred[15])
pred[16] = round(pred[16])

In [16]:
rmse_days = []
days = []
for i in range(545,575):
    c_ob = ob[ob[16] == i][0]
    c_pred = pred[pred[16] == i][0]
    rmse_days.append(sqrt(mean_squared_error(c_ob, c_pred)))
    days.append(i)
    

In [17]:
rmse_days
days_df = pd.DataFrame()
days_df['rmse'] = rmse_days
days_df['date'] = last_30
days_df['date'] = pd.to_datetime(days_df['date'])

In [21]:
from bokeh.plotting import figure, output_file, show

output_file("line.html")

p = figure(plot_width=400, plot_height=400, x_axis_type='datetime')
p.yaxis.axis_label = 'Root Mean Squared Error'

# add a line renderer
p.line(days_df['date'], days_df['rmse'], line_width=2, color='purple')

show(p)

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 20, num = 2)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200]}


In [17]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=1, cv=2, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(train_X, train_y)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] n_estimators=52, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True 
[CV] n_estimators=52, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=52, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True, total= 4.3min
[CV]  n_estimators=52, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True, total= 4.4min


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  4.5min finished
  best_estimator.fit(X, y, **self.fit_params)


RandomizedSearchCV(cv=2, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=1, n_jobs=-1,
          param_distributions={'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring=None, verbose=2)

In [18]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 52}