In [125]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

In [126]:
# load the input CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="./../data/supervised_1_1.csv")

In [127]:
# format data
# take only the oil price
# predict the final 30 days
# save the test and train data

In [128]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of lagged and lead columns.

    Parameters
    ----------
    data : list, 1-D or 2-D array, pandas Series or DataFrame
        Observations ordered in time, one row per time step.
    n_in : int, default 1
        Number of lagged steps (t-n_in ... t-1) to add as columns.
    n_out : int, default 1
        Number of forecast steps (t ... t+n_out-1) to add as columns.
    dropnan : bool, default True
        Drop every row that contains a NaN (shifting introduces NaN at the edges).

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    frame = pd.DataFrame(data)
    # Count variables from the constructed frame rather than from `data` itself:
    # this also works for 1-D Series/arrays (which have no shape[1]) and for
    # lists of multi-variable rows.
    n_vars = frame.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(frame.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    # forecast sequence (t, t+1, ... t+n)
    for lead in range(n_out):
        cols.append(frame.shift(-lead))
        if lead == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, lead) for j in range(n_vars))
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [129]:
# Use the observation date as the index. The guard makes the cell safe to re-run:
# once 'date' has become the index it is no longer a column, and an unguarded
# set_index would raise KeyError.
if 'date' in df.columns:
    df.set_index('date', inplace=True)
print(df.shape)
df.head()

(7316300, 21)


Unnamed: 0_level_0,e5gas,weekday,latitude,longitude,dautobahn,autobahn,aral,esso,jet,shell,...,rotterdam,brent,wti,eurusd,vehicles,state,station,global_mean,state_mean,num_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-05-16,1.536647,5,51.157501,10.0002,14850.392578,0,0,0,0,0,...,0.622837,109.424042,101.258553,1.369886,15211.0,10,0,1.546131,1.561237,0
2014-05-17,1.564,6,51.157501,10.0002,14850.392578,0,0,0,0,0,...,0.622486,109.699997,101.669998,1.36958,9146.0,10,0,1.560708,1.578196,1
2014-05-18,1.569417,0,51.157501,10.0002,14850.392578,0,0,0,0,0,...,0.622136,109.699997,101.669998,1.36958,9843.0,10,0,1.564237,1.581731,2
2014-05-19,1.578167,1,51.157501,10.0002,14850.392578,0,0,0,0,0,...,0.621785,109.838928,102.008537,1.370161,13502.0,10,0,1.563935,1.581525,3
2014-05-20,1.599,2,51.157501,10.0002,14850.392578,0,0,0,0,0,...,0.620475,109.522926,102.058907,1.368282,12876.0,10,0,1.566228,1.58262,4


In [130]:
# try removing the averages (global_mean, state_mean) so as to not include endogenous
# variables on the right-hand side; eurusd and vehicles are dropped as well.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df.drop(columns=['global_mean', 'state_mean', 'eurusd', 'vehicles'], inplace=True, errors='ignore')
print(df.shape)
df.head()

(7316300, 17)


Unnamed: 0_level_0,e5gas,weekday,latitude,longitude,dautobahn,autobahn,aral,esso,jet,shell,total,rotterdam,brent,wti,state,station,num_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2014-05-16,1.536647,5,51.157501,10.0002,14850.392578,0,0,0,0,0,0,0.622837,109.424042,101.258553,10,0,0
2014-05-17,1.564,6,51.157501,10.0002,14850.392578,0,0,0,0,0,0,0.622486,109.699997,101.669998,10,0,1
2014-05-18,1.569417,0,51.157501,10.0002,14850.392578,0,0,0,0,0,0,0.622136,109.699997,101.669998,10,0,2
2014-05-19,1.578167,1,51.157501,10.0002,14850.392578,0,0,0,0,0,0,0.621785,109.838928,102.008537,10,0,3
2014-05-20,1.599,2,51.157501,10.0002,14850.392578,0,0,0,0,0,0,0.620475,109.522926,102.058907,10,0,4


In [131]:
# keep the first 575 rows (by position) of each oil-price column
rot, brent, wti = (df[name].iloc[:575] for name in ('rotterdam', 'brent', 'wti'))

Predict Rotterdam.

In [133]:
# One-day-lag regression for Rotterdam: rot_x[i] is the price on day i and
# rot_y[i] is the price on day i+1.
# Positional slicing drops the first observation from rot_y and the last from rot_x
# without hard-coding the boundary dates, so the cell keeps working if the window
# held in `rot` changes.
rot_y = rot.iloc[1:].copy()
rot_x = rot.iloc[:-1].copy()
# hold out the last 31 (x, y) pairs as the test set; the rest is training data
train_rot_x = rot_x.iloc[:-31]
train_rot_y = rot_y.iloc[:-31]
test_rot_x = rot_x.iloc[-31:]
test_rot_y = rot_y.iloc[-31:]
# grab the values
v_train_rot_x = train_rot_x.values
v_train_rot_y = train_rot_y.values
v_test_rot_x = test_rot_x.values
v_test_rot_y = test_rot_y.values
# fit model for rot; scikit-learn needs a 2-D feature matrix, and reshape(-1, 1)
# infers the row count instead of hard-coding it
rot_model = LinearRegression()
rot_model.fit(v_train_rot_x.reshape(-1, 1), v_train_rot_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [134]:
# one-step-ahead predictions for the held-out window; reshape(-1, 1) builds the
# single-feature 2-D input without hard-coding the number of test rows
rot_yhat = rot_model.predict(v_test_rot_x.reshape(-1, 1))

In [135]:
# root-mean-squared error of the Rotterdam predictions on the held-out window
rot_rmse = sqrt(mean_squared_error(y_true=test_rot_y, y_pred=rot_yhat))
rot_rmse

0.0126370523086607

Now, for Brent.

In [136]:
# One-day-lag regression for Brent: brent_x[i] is the price on day i and
# brent_y[i] is the price on day i+1.
# Positional slicing drops the first observation from brent_y and the last from
# brent_x without hard-coding the boundary dates, so the cell keeps working if the
# window held in `brent` changes.
brent_y = brent.iloc[1:].copy()
brent_x = brent.iloc[:-1].copy()
# hold out the last 31 (x, y) pairs as the test set; the rest is training data
train_brent_x = brent_x.iloc[:-31]
train_brent_y = brent_y.iloc[:-31]
test_brent_x = brent_x.iloc[-31:]
test_brent_y = brent_y.iloc[-31:]
# grab the values
v_train_brent_x = train_brent_x.values
v_train_brent_y = train_brent_y.values
v_test_brent_x = test_brent_x.values
v_test_brent_y = test_brent_y.values
# fit model for brent; scikit-learn needs a 2-D feature matrix, and reshape(-1, 1)
# infers the row count instead of hard-coding it
brent_model = LinearRegression()
brent_model.fit(v_train_brent_x.reshape(-1, 1), v_train_brent_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [137]:
# one-step-ahead predictions for the held-out window; reshape(-1, 1) builds the
# single-feature 2-D input without hard-coding the number of test rows
brent_yhat = brent_model.predict(v_test_brent_x.reshape(-1, 1))

In [138]:
# root-mean-squared error of the Brent predictions on the held-out window
brent_rmse = sqrt(mean_squared_error(y_true=test_brent_y, y_pred=brent_yhat))
brent_rmse

0.8439200281866971

Now, for WTI.

In [139]:
# One-day-lag regression for WTI: wti_x[i] is the price on day i and
# wti_y[i] is the price on day i+1.
# Positional slicing drops the first observation from wti_y and the last from wti_x
# without hard-coding the boundary dates, so the cell keeps working if the window
# held in `wti` changes.
wti_y = wti.iloc[1:].copy()
wti_x = wti.iloc[:-1].copy()
# hold out the last 31 (x, y) pairs as the test set; the rest is training data
train_wti_x = wti_x.iloc[:-31]
train_wti_y = wti_y.iloc[:-31]
test_wti_x = wti_x.iloc[-31:]
test_wti_y = wti_y.iloc[-31:]
# grab the values
v_train_wti_x = train_wti_x.values
v_train_wti_y = train_wti_y.values
v_test_wti_x = test_wti_x.values
v_test_wti_y = test_wti_y.values
# fit model for wti; scikit-learn needs a 2-D feature matrix, and reshape(-1, 1)
# infers the row count instead of hard-coding it
wti_model = LinearRegression()
wti_model.fit(v_train_wti_x.reshape(-1, 1), v_train_wti_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [140]:
# one-step-ahead predictions for the held-out window; reshape(-1, 1) builds the
# single-feature 2-D input without hard-coding the number of test rows
wti_yhat = wti_model.predict(v_test_wti_x.reshape(-1, 1))

In [141]:
# root-mean-squared error of the WTI predictions on the held-out window
wti_rmse = sqrt(mean_squared_error(y_true=test_wti_y, y_pred=wti_yhat))
wti_rmse

0.5823761770435268

Now prepare the data to be used for the linear model and save.

In [142]:
# collect the three prediction arrays under the column names used in the saved file
linear_oil_yhat = dict(rot_yhat=rot_yhat, brent_yhat=brent_yhat, wti_yhat=wti_yhat)

In [143]:
# one column per oil series, one row per predicted observation
linear_oil_yhat_df = pd.DataFrame(linear_oil_yhat)

In [144]:
# write the predictions (including the default integer index) to the data directory
linear_oil_yhat_df.to_csv(path_or_buf='./../data/linear_oil_yhat.csv')