In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [1]:
# read in the data
dataset = pd.read_csv('./../data/prepared_gasoline.csv')

# read in the geo data
geo = pd.read_csv('latlongeo.csv')
geo.drop(['Unnamed: 0', 'region'], axis=1, inplace=True)

# join them -- inner drops the lat/lon nan values
dataset = dataset.merge(geo, how='inner', on=['latitude', 'longitude'])

# add global mean of gas price for each observation
global_mean = dataset.groupby('date')['e5gas'].mean()
global_df = global_mean.to_frame()
global_df.rename(columns={"e5gas": "global_mean"}, inplace=True)
dataset = dataset.merge(global_df, right_index=True, left_on='date')

# add regional mean of gas price for each observation
state_mean = dataset.groupby(['date', 'state'])['e5gas'].mean()
state_df = state_mean.to_frame()
state_df.rename(columns={"e5gas": "state_mean"}, inplace=True)
dataset = dataset.merge(state_df, right_index=True, left_on=['date', 'state'])

NameError: name 'pd' is not defined

In [13]:
station_indices = dataset['station'].value_counts() == 575
full_series = pd.DataFrame()
full_series['T/F'] = station_indices[station_indices]
full_series['index'] = full_series.index
dataset_fullseries = dataset[dataset['station'].isin(full_series['index'])]

In [14]:
df = dataset_fullseries.copy()
df.drop(['latitude', 'longitude', 'dautobahn', 'autobahn', 'aral', 'esso', 'jet', 'shell', 'total', 'state'], axis=1, inplace=True)
df['e5gas'].fillna(0, inplace=True)
cols = list(df)
cols.insert(0, cols.pop(cols.index('e5gas')))
df = df.loc[:, cols]
df.head()

Unnamed: 0,e5gas,date,weekday,rotterdam,brent,wti,eurusd,vehicles,station,global_mean,state_mean
42953,1.41025,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,34005.0,8269,1.381133,1.383179
89210,1.413583,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,11479.0,11309,1.381133,1.383179
108893,1.428583,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,7804.0,10294,1.381133,1.383179
110043,1.430667,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,7804.0,9669,1.381133,1.383179
119652,1.428583,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,7804.0,10202,1.381133,1.383179


In [15]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [18]:
rmse = pd.read_csv('rmse_dict.csv', sep=' ', header=None, names=['index', 'rmse'])
used_index = list(rmse['index'])

In [19]:
index_list = full_series.index
index_list = [x for x in index_list if x not in used_index]
RMSE_dict = {}

In [None]:
for index in index_list:
    s = df[df['station'] == index]
    s.set_index('date', inplace=True)
    s.sort_index(inplace=True)
    s.drop('station', axis=1, inplace=True)
    values = s.values
    # ensure all data is float
    values = values.astype('float32')
    # normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)
    # frame as supervised learning
    reframed = series_to_supervised(scaled, 1, 1)
    # drop columns we don't want to predict
    reframed.drop(reframed.columns[[10, 11, 12, 13, 14, 15, 16, 17]], axis=1, inplace=True)
    
    # split into train and test sets
    values = reframed.values
    n_train_days = int(575*.8)
    train = values[:n_train_days, :]
    test = values[n_train_days:, :]
    # split into input and outputs
    train_X, train_y = train[:, :-1], train[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]
    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    
    # design network
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    # fit network
    history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=0, shuffle=False)
    
    # make a prediction
    yhat = model.predict(test_X)
    test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
    # invert scaling for forecast
    inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
    inv_yhat = scaler.inverse_transform(inv_yhat)
    inv_yhat = inv_yhat[:,0]
    # invert scaling for actual
    test_y = test_y.reshape((len(test_y), 1))
    inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
    inv_y = scaler.inverse_transform(inv_y)
    inv_y = inv_y[:,0]
    # calculate RMSE
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    
    # add to dict
    RMSE_dict[index] = rmse
    print(index, rmse, '\n\n')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


1322 0.013852819828836094 


5416 0.014153909744191832 


9517 0.04154190259613669 


5419 0.010687995149258273 


9516 0.009950440938448595 


5418 0.025348017779785046 


1320 0.019421444622459386 


1319 0.029461771117471842 


5413 0.012876047124079837 


9507 0.013019881424778517 


13601 0.014483744938002821 


5423 0.010375414861173089 


13608 0.013275470948912852 


5420 0.016570573346925688 


13623 0.01469498827390014 


1332 0.030118136298229695 


13618 0.01649597720970614 


9520 0.03125823553166619 


9527 0.0136110932279531 


13621 0.00904973834846426 


1331 0.008477607897572513 


5425 0.012699817009637047 


9526 0.02154348583651377 


1330 0.0207011716467714 


5424 0.014032961594282328 


9525 0.017530231995464798 


1326 0.013943355143946241 


5427 0.012052538878254429 


1329 0.01610526242998347 


13622 0.016240836952084607 


9524 0.01592945411479826 


5426 0.024808442496131622 


1328 0.019358142742415204 


1327 0.012045932131110526 


5421 0.0148629627893

In [None]:
import csv

with open('rmse_dict.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in rmse_dict.items():
       writer.writerow([key, value])