In [26]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [27]:
# read in the data
dataset = pd.read_csv('prepared_gasoline.csv')

# read in the geo data
geo = pd.read_csv('latlongeo.csv')
geo.drop(['Unnamed: 0', 'region'], axis=1, inplace=True)

# join them -- inner drops the lat/lon nan values
dataset = dataset.merge(geo, how='inner', on=['latitude', 'longitude'])

# add global mean of gas price for each observation
global_mean = dataset.groupby('date')['e5gas'].mean()
global_df = global_mean.to_frame()
global_df.rename(columns={"e5gas": "global_mean"}, inplace=True)
dataset = dataset.merge(global_df, right_index=True, left_on='date')

# add regional mean of gas price for each observation
state_mean = dataset.groupby(['date', 'state'])['e5gas'].mean()
state_df = state_mean.to_frame()
state_df.rename(columns={"e5gas": "state_mean"}, inplace=True)
dataset = dataset.merge(state_df, right_index=True, left_on=['date', 'state'])

In [28]:
station_indices = dataset['station'].value_counts() == 575
full_series = pd.DataFrame()
full_series['T/F'] = station_indices[station_indices]
full_series['index'] = full_series.index
dataset_fullseries = dataset[dataset['station'].isin(full_series['index'])]

In [29]:
df = dataset_fullseries.copy()
df.drop(['latitude', 'longitude', 'dautobahn', 'autobahn', 'aral', 'esso', 'jet', 'shell', 'total', 'state'], axis=1, inplace=True)
df['e5gas'].fillna(0, inplace=True)
cols = list(df)
cols.insert(0, cols.pop(cols.index('e5gas')))
df = df.loc[:, cols]
df.head()

Unnamed: 0,e5gas,date,weekday,rotterdam,brent,wti,eurusd,vehicles,station,global_mean,state_mean
42953,1.41025,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,34005.0,8269,1.381133,1.383179
89210,1.413583,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,11479.0,11309,1.381133,1.383179
108893,1.428583,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,7804.0,10294,1.381133,1.383179
110043,1.430667,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,7804.0,9669,1.381133,1.383179
119652,1.428583,2015-03-18,3,0.462206,53.718338,44.879154,1.06255,7804.0,10202,1.381133,1.383179


In [30]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [31]:
index_list = full_series.index
RMSE_dict = {}

In [None]:
for index in index_list:
    s = df[df['station'] == index]
    s.set_index('date', inplace=True)
    s.sort_index(inplace=True)
    s.drop('station', axis=1, inplace=True)
    values = s.values
    # ensure all data is float
    values = values.astype('float32')
    # normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)
    # frame as supervised learning
    reframed = series_to_supervised(scaled, 1, 1)
    # drop columns we don't want to predict
    reframed.drop(reframed.columns[[10, 11, 12, 13, 14, 15, 16, 17]], axis=1, inplace=True)
    
    # split into train and test sets
    values = reframed.values
    n_train_days = int(575*.8)
    train = values[:n_train_days, :]
    test = values[n_train_days:, :]
    # split into input and outputs
    train_X, train_y = train[:, :-1], train[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]
    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    
    # design network
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    # fit network
    history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=0, shuffle=False)
    
    # make a prediction
    yhat = model.predict(test_X)
    test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
    # invert scaling for forecast
    inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
    inv_yhat = scaler.inverse_transform(inv_yhat)
    inv_yhat = inv_yhat[:,0]
    # invert scaling for actual
    test_y = test_y.reshape((len(test_y), 1))
    inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
    inv_y = scaler.inverse_transform(inv_y)
    inv_y = inv_y[:,0]
    # calculate RMSE
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    
    # add to dict
    RMSE_dict[index] = rmse
    print(index, rmse, '\n\n')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


4094 0.01574812375470613 


1304 0.01375975528111553 


9498 0.01716794152848968 


13592 0.008334814105559068 


5407 0.03232081674842209 


1309 0.011330410581864027 


13595 0.019055123454333054 


5406 0.015154494050675739 


1308 0.010625671299114223 


13594 0.009154278780629568 


9496 0.02605907012633828 


9503 0.01528579325101723 


13597 0.016915390575624516 


1307 0.01828066055787264 


5401 0.01560238975519839 


9502 0.01793348217682062 


13596 0.03784800447801249 


1306 0.020039287710179134 


5400 0.02622686461117565 


13599 0.016086590581232997 


9501 0.027497741375498305 


5403 0.02438659314711813 


1305 0.019559322984032488 


13598 0.007851434710753242 


9500 0.014430819254912785 


5404 0.01464685640146835 


1310 0.011474942902526706 


13593 0.01954480014095819 


9510 0.020751265907128405 


1317 0.018284805801024498 


13603 0.011697109675916431 


9505 0.20345069376620253 


5414 0.018844236918582717 


1316 0.018613984073687282 


13602 0.011724384388

In [None]:
import csv

with open('rmse_dict.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in rmse_dict.items():
       writer.writerow([key, value])

In [5]:
RMSE_dict = {}

In [9]:
4094 0.01574812375470613 


1304 0.01375975528111553 


9498 0.01716794152848968 


13592 0.008334814105559068 


5407 0.03232081674842209 


1309 0.011330410581864027 


13595 0.019055123454333054 


5406 0.015154494050675739 


1308 0.010625671299114223 


13594 0.009154278780629568 


9496 0.02605907012633828 


9503 0.01528579325101723 


13597 0.016915390575624516 


1307 0.01828066055787264 


5401 0.01560238975519839 


9502 0.01793348217682062 


13596 0.03784800447801249 


1306 0.020039287710179134 


5400 0.02622686461117565 


13599 0.016086590581232997 


9501 0.027497741375498305 


5403 0.02438659314711813 


1305 0.019559322984032488 


13598 0.007851434710753242 


9500 0.014430819254912785 


5404 0.01464685640146835 


1310 0.011474942902526706 


13593 0.01954480014095819 


9510 0.020751265907128405 


1317 0.018284805801024498 


13603 0.011697109675916431 


9505 0.20345069376620253 


5414 0.018844236918582717 


1316 0.018613984073687282 


13602 0.011724384388829457 


9511 0.024083959996470432 


13605 0.008779784527565337 


1315 0.018867872907645796 


5409 0.016595913122149773 


13604 0.018829630875862396 


9499 0.1734942218886099 


1314 0.013979718940633315 


5408 0.012520803429303529 


13607 0.010826941156570815 


9509 0.019831322967321496 


5411 0.013869347979086474 


13606 0.01738985470172651 


5410 0.014764058758373303 


1312 0.023371267612485782 


1311 0.019229929401254443 


5405 0.008744687821839814 


5402 0.008701658932329524 


5397 0.037666457564036304 


9506 0.03429093139139663 


9491 0.037419983063418864 


9483 0.019871250541209792 


13577 0.01900282167425265 


1294 0.014781883977918528 


5388 0.012119754281787735 


9482 0.01926104982162624 


13576 0.02623647725033016 


5391 0.008256510616624064 


1293 0.011386644498524091 


13579 0.034586915781305586 


9481 0.021074880498107372 


5390 0.019688900584827043 


1292 0.028218891377770797 


13578 0.014039677550439026 


9487 0.01940935362731919 


13581 0.01371554389269703 


1291 0.013694326452235506 


5385 0.014323370411126553 


9486 0.01157189028569844 


13580 0.005008348494933111 


1290 0.020651115827464345 


5384 0.015381214633502189 


13583 0.036550612972826545 


9485 0.011576899172264633 


5389 0.021132449186184697 


1296 0.016735070382702374 


5394 0.013104139197847598 


13586 0.015492951380847522 


13585 0.013732498480666289 


5396 0.01664968136705912 


9490 0.017526149068652574 


13584 0.027235729928011064 


5399 0.015572696928971962 


1301 0.018419586400850164 


13587 0.015466946326779094 


9489 0.023153134650283617 


5398 0.012749418516805633 


1300 0.016764997747258297 


9495 0.009245783523595763 


9492 0.013127362126179645 


13589 0.011864549294265885 


1299 0.007626763366940786 


5393 0.014247513833641251 


9494 0.008665262220894298 


13588 0.019355278474919507 


1298 0.015281101126435657 


5392 0.010574877162503958 


9493 0.017182750716290857 


5395 0.015143425190280217 


1297 0.013165129318894254 


13600 0.027086660797022155 


5412 0.012121118174587319 


1289 0.01486350567506645 


13619 0.01423499303313132 


9528 0.013796857720514906 


9535 0.012660450362530912 


13629 0.014676994065849052 


1339 0.03848514352275813 


5433 0.011982552514844718 


9534 0.013862954168500032 


1338 0.014318773996305153 


5432 0.011446953097909702 


13631 0.025467075509698043 


9533 0.021742648045327422 


13630 0.018949905587173233 


9532 0.01092478348700186 


5434 0.018706323247827315 


1336 0.017391635331341658 


1335 0.013608974063759013 


5429 0.024982681599554465 


9523 0.022969839034164583 


13617 0.010128345592068048 


1334 0.01631836445013318 


5428 0.009358829345662604 


9522 0.0380627809332712 


13616 0.024305404621698237 


5431 0.01605788012361629 


13626 0.015826178128550673 


1340 0.0160424362057772 


5438 0.010513768649599268 


9540 0.03318443543689712 


13637 0.016778958823522247 


1347 0.016136345287474 


5441 0.012792526280235092 


13636 0.021523225131172746 


1346 0.015382175351415374 


5440 0.017494921164142432 


13639 0.02769395075848001 


9541 0.012118733663012152 


1345 0.019850851828770083 


13638 0.02299080920074011 


5442 0.034109435090384714 


9529 0.012221664360309581 


1344 0.1466157641851686 


1343 0.010452790491020212 


5437 0.008621013267635756 


9531 0.013848511198012997 


1342 0.016686160854924054 


5436 0.01894029499865577 


9530 0.01465733178641245 


5439 0.016885220437132385 


1341 0.016665255855041 


13627 0.024834609710089574 


1333 0.028314608377648216 


9521 0.017398739299920845 


1318 0.00979506183616501 


5430 0.010214244610586815 


1325 0.023636356789448407 


13611 0.007897820556423964 


9513 0.017617297896443184 


5422 0.01849094534724276 


1324 0.05036726574664623 


13610 0.013064824671993129 


9519 0.020794926354411453 


13613 0.0033952509376554245 


1323 0.011551761217396565 


5417 0.011902700296397447 


9518 0.02757883239674988 


13612 0.010319525360371722

SyntaxError: EOL while scanning string literal (<ipython-input-9-be70b07a3018>, line 1)