In [4]:
import numpy as np
import pandas as pd
import geohash
import gc
from geopy import distance
from tqdm import tqdm_notebook as tqdm

In [152]:
df = pd.read_csv('data/formatted_data.csv')

In [3]:
def get_distance_from_poi(gh):
    """Return the distances, in km, from a geohash to the four fixed POIs.

    ``gh`` is decoded with ``geohash.decode`` and the decoded coordinate is
    measured against each hard-coded point with ``distance.distance``.
    The result is a 4-tuple ordered POI1, POI2, POI3, POI4.
    """
    points_of_interest = (
        (-5.35308837890625, 90.8184814453125),  # POI1
        (-5.32012939453125, 90.6866455078125),  # POI2
        (-5.28167724609375, 90.9173583984375),  # POI3
        (-5.45745849609375, 90.6866455078125),  # POI4
    )
    decoded = geohash.decode(gh)
    return tuple(distance.distance(poi, decoded).km for poi in points_of_interest)

In [4]:
# Sliding-window configuration for the per-geohash demand series.
SLOTS_PER_DAY = 96  # four `order` slots per hour * 24 hours

MAX_ORDER = 5856
GEOHASH_LIST = df.geohash6.unique()

PREV_DAY = 7 # use previous 7 days data
CNN_OFFSET = 8 # slots taken on each side of the same slot on a previous day

RNN_OFFSET = 11 # 3 hours gap

LABELS_OFFSET = 5 # (T+5)

# Smallest order whose earliest look-back slot (order - CNN_OFFSET - PREV_DAY days) is still >= 0.
MIN_ORDER_VAL = CNN_OFFSET + PREV_DAY*SLOTS_PER_DAY

# Exclusive upper bound, so that order + LABELS_OFFSET stays below MAX_ORDER.
MAX_ORDER_VAL = MAX_ORDER - LABELS_OFFSET

In [5]:
# Build one flat feature row per (geohash, order) pair.
#
# Each row holds: the order, geohash and hour; the POI distances; for each of
# the PREV_DAY previous days a window of CNN_OFFSET slots either side of the
# same slot; the last RNN_OFFSET+1 slots of the current day; and the next
# LABELS_OFFSET slots as labels.  Key insertion order is significant because
# the CSV header is later taken from formatted_dict[0].keys().
formatted_dict = {}
index = 0
for gh in tqdm(GEOHASH_LIST):

    # set_index returns a new frame, so the filtered slice of df is never
    # mutated in place.
    temp_df = df[df.geohash6 == gh].set_index('order')
    # {order: {'hour': ..., 'demand': ...}} lookup for this geohash only.
    temp_dict = temp_df.to_dict('index')
    distance_poi1, distance_poi2, distance_poi3, distance_poi4 = get_distance_from_poi(gh)
    min_distance_from_poi = min(distance_poi1, distance_poi2, distance_poi3, distance_poi4)

    for order in range(MIN_ORDER_VAL, MAX_ORDER_VAL):
        row = {
            'order': order,
            'geohash6': gh,
            'hour': temp_dict[order]['hour'],  # please one-hot encode this
            'distance_poi1': distance_poi1,
            'distance_poi2': distance_poi2,
            'distance_poi3': distance_poi3,
            'distance_poi4': distance_poi4,
            'min_distance_poi': min_distance_from_poi,
        }

        for day in range(1, PREV_DAY + 1):
            # Same slot, `day` days earlier (96 slots per day).
            base = order - day*96

            for cnn_off in range(CNN_OFFSET, 0, -1):
                row['demand@({0}-{1})'.format(day, cnn_off)] = temp_dict[base - cnn_off]['demand']

            row['demand@({0}0)'.format(day)] = temp_dict[base]['demand']

            for cnn_off in range(1, CNN_OFFSET + 1):
                row['demand@({0}+{1})'.format(day, cnn_off)] = temp_dict[base + cnn_off]['demand']

        # Current day: recent history (oldest first, ending at `order` itself) ...
        day = 0
        for rnn_off in range(RNN_OFFSET, -1, -1):
            row['demand@({0}-{1})'.format(day, rnn_off)] = temp_dict[order - rnn_off]['demand']

        # ... followed by the future slots used as prediction targets.
        for out_off in range(1, LABELS_OFFSET+1):
            row['demand@({0}+{1})'.format(day, out_off)] = temp_dict[order + out_off]['demand']

        formatted_dict[index] = row
        index += 1

    # Release the per-geohash lookup before moving to the next geohash.
    del temp_df
    del temp_dict
    gc.collect()

HBox(children=(IntProgress(value=0, max=1329), HTML(value='')))




In [6]:
# Write every feature row to CSV, one line per row, with the column order
# taken from the first row.  The `with` block closes the file even if
# serialising a row raises part-way through.
with open('data/d2/123_formatted_time_series_data_with_order.csv', 'w') as csv_file:

    headers = formatted_dict[0].keys()

    csv_file.write(','.join(headers))
    csv_file.write('\n')

    for i in formatted_dict:
        # Progress marker every 100k rows.
        if i % 100_000 == 0:
            print(i)
        row = formatted_dict[i]
        csv_file.write(','.join(str(row[h]) for h in headers))
        csv_file.write('\n')

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000


In [7]:
del df
del formatted_dict
gc.collect()

47

In [2]:
df = pd.read_csv('data/d2/123_formatted_time_series_data_with_order.csv')

In [3]:
# Rows with order <= 50*96 are written to the training file; rows with a
# larger order are written to the validation file.
split_order = 50*96
df.loc[df.order <= split_order].to_csv('data/d2/123_train_formatted_time_series_data.csv', index=False)
df.loc[df.order > split_order].to_csv('data/d2/123_eval_formatted_time_series_data.csv', index=False)

In [None]:
del df
gc.collect()

In [5]:
train_df = pd.read_csv('data/d2/123_train_formatted_time_series_data.csv')
eval_df = pd.read_csv('data/d2/123_eval_formatted_time_series_data.csv')

In [6]:
# Replace the 'hour' column of the training frame with one indicator column
# per hour value, named hour_<value>.
one_hot = pd.get_dummies(train_df['hour'], prefix='hour')
train_df = train_df.drop(columns=['hour']).join(one_hot)

In [7]:
# Replace the 'hour' column of the validation frame with one indicator column
# per hour value, named hour_<value>.
one_hot = pd.get_dummies(eval_df['hour'], prefix='hour')
eval_df = eval_df.drop(columns=['hour']).join(one_hot)

In [12]:
# train_df.to_csv('data/d2/123_ohe_train_formatted_time_series_data.csv', index=False)
# eval_df.to_csv('data/d2/123_ohe_eval_formatted_time_series_data.csv', index=False)

In [8]:
# Sliding-window configuration, re-declared here against train_df.
SLOTS_PER_DAY = 96  # four `order` slots per hour * 24 hours

MAX_ORDER = 5856
GEOHASH_LIST = train_df.geohash6.unique()

PREV_DAY = 7 # use previous 7 days data
CNN_OFFSET = 8 # slots taken on each side of the same slot on a previous day

RNN_OFFSET = 11 # 3 hours gap

LABELS_OFFSET = 5 # (T+5)

# Smallest order whose earliest look-back slot (order - CNN_OFFSET - PREV_DAY days) is still >= 0.
MIN_ORDER_VAL = CNN_OFFSET + PREV_DAY*SLOTS_PER_DAY

# Exclusive upper bound, so that order + LABELS_OFFSET stays below MAX_ORDER.
MAX_ORDER_VAL = MAX_ORDER - LABELS_OFFSET

In [9]:
# Label columns: demand at T+1 .. T+LABELS_OFFSET on the current day (day 0).
day = 0
target_labels = [
    'demand@({0}+{1})'.format(day, out_off)
    for out_off in range(1, LABELS_OFFSET+1)
]
Y_train = train_df[target_labels]
Y_eval = eval_df[target_labels]

In [187]:
# Column names for the previous-days windows, grouped day by day: for each
# day, CNN_OFFSET slots before, the same slot, then CNN_OFFSET slots after.
cnn_features = []
for day in range(1, PREV_DAY+1):
    slots_before = ['demand@({0}-{1})'.format(day, cnn_off) for cnn_off in range(CNN_OFFSET, 0, -1)]
    same_slot = ['demand@({0}0)'.format(day)]
    slots_after = ['demand@({0}+{1})'.format(day, cnn_off) for cnn_off in range(1, CNN_OFFSET+1)]
    cnn_features.extend(slots_before + same_slot + slots_after)

X_cnn_train = train_df[cnn_features]
X_cnn_eval = eval_df[cnn_features]

In [11]:
# Static features: the POI distances plus the 24 one-hot hour columns.
dnn_features = ['distance_poi1', 'distance_poi2', 'distance_poi3', 'distance_poi4', 'min_distance_poi']
dnn_features += ['hour_{0}'.format(i) for i in range(24)]
X_dnn_train = train_df[dnn_features]
X_dnn_eval = eval_df[dnn_features]

In [12]:
# Current-day history columns, oldest first: T-RNN_OFFSET .. T-0.
day = 0
rnn_features = [
    'demand@({0}-{1})'.format(day, rnn_off)
    for rnn_off in range(RNN_OFFSET, -1, -1)
]
X_rnn_train = train_df[rnn_features]
X_rnn_eval = eval_df[rnn_features]

In [2]:
# import pickle
# with open('./data/d2/123_train.pickle', 'rb') as handle:
#     train_dict = pickle.load(handle)
    
# with open('./data/d2/123_eval.pickle', 'rb') as handle:
#     eval_dict = pickle.load(handle)

In [13]:
from keras.models import (
    Model,
    load_model,
)
from keras.layers import *
from keras.activations import *
from keras.optimizers import *
from keras.losses import (
    mean_absolute_percentage_error,
    mean_squared_error,
)
import tensorflow as tf
print(tf.__version__)

Using TensorFlow backend.


1.14.1-dev20190607


In [12]:
def clipped_relu(x):
    """Apply ``relu`` to ``x`` with ``max_value=1.0``, i.e. with the output capped at 1.0."""
    return relu(x, max_value=1.0)

In [207]:
del cnn_inputs, dnn_inputs, rnn_inputs
del cnn, dnn, rnn
del stacked
gc.collect()

383

In [208]:
# Three flat inputs, one per feature group (sizes follow the feature lists).
cnn_inputs = Input(shape=(len(cnn_features),))
dnn_inputs = Input(shape=(len(dnn_features),))
rnn_inputs = Input(shape=(len(rnn_features),))

# Convolutional branch: the flat previous-days vector is reshaped to one row
# per previous day with CNN_OFFSET*2+1 demand values each.
cnn = Reshape((PREV_DAY, CNN_OFFSET*2+1, ))(cnn_inputs)
'''depending on how you view your data, i prefer it to be (batch, steps, features)'''
cnn = Conv1D(16, 5, activation=relu, padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Conv1D(8, 3, activation=relu, padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Flatten()(cnn)
cnn = Model(inputs=cnn_inputs, outputs=cnn)

# Second recurrent branch: reads the same cnn_inputs tensor, reshaped the same
# way, through two stacked GRUs.
rnn2 = Reshape((PREV_DAY, CNN_OFFSET*2 + 1, ))(cnn_inputs)
rnn2 = GRU(32, return_sequences=True)(rnn2)
rnn2 = GRU(16)(rnn2)
rnn2 = Model(inputs=cnn_inputs, outputs=rnn2)

# Dense branch over the static features.
dnn = Dense(32, activation=relu)(dnn_inputs)
# dnn = Dropout(rate=0.2)(dnn)
dnn = Dense(16, activation=relu)(dnn)
dnn = Model(inputs=dnn_inputs, outputs=dnn)

# Recurrent branch over the current-day history, one value per time step.
rnn = Reshape((len(rnn_features), 1, ))(rnn_inputs) # i have only one feature to feed into RNN
rnn = GRU(32, return_sequences=True)(rnn)
rnn = GRU(16)(rnn)
rnn = Model(inputs=rnn_inputs, outputs=rnn)

# Merge the four branch outputs and regress through a dense head.
stacked = concatenate([cnn.output, dnn.output, rnn.output, rnn2.output])

# stacked = BatchNormalization()(stacked)
stacked = Dense(64, activation=relu)(stacked)
# stacked = Dropout(rate=0.4)(stacked)
stacked = Dense(32, activation=relu)(stacked)
# stacked = Dropout(rate=0.3)(stacked)
stacked = Dense(16, activation=relu)(stacked)

'''Linear vs clipped relu vs sigmoid vs hard_sigmoid'''
# stacked = Dense(5, activation=linear)(stacked)
# this is nice, it is linear clipped, but i don't know how to make it work
# stacked = Dense(5, activation=clipped_relu)(stacked)
# stacked = Dense(5, activation=sigmoid)(stacked)
# Five outputs, one per label column.
stacked = Dense(5, activation=hard_sigmoid)(stacked)

# Only three inputs are listed: rnn2 shares cnn.input with the CNN branch.
stacked = Model(inputs=[cnn.input, dnn.input, rnn.input], outputs=stacked)

In [209]:
stacked.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_80 (InputLayer)           (None, 119)          0                                            
__________________________________________________________________________________________________
reshape_64 (Reshape)            (None, 7, 17)        0           input_80[0][0]                   
__________________________________________________________________________________________________
conv1d_57 (Conv1D)              (None, 7, 16)        1376        reshape_64[0][0]                 
__________________________________________________________________________________________________
max_pooling1d_33 (MaxPooling1D) (None, 4, 16)        0           conv1d_57[0][0]                  
__________________________________________________________________________________________________
input_82 (

In [210]:
# Manual training schedule: each entry is (learning rate, number of epochs),
# run in the order listed.
hp = [
    (0.01, 1),
    (0.001, 1),
    (0.005, 2),
    (0.001, 4),
    (0.0005, 2),
    (0.0001, 2),
]

for lr, ep in hp:
    # NOTE(review): the model is re-compiled with a new Adagrad instance for
    # every stage, so optimizer state presumably restarts each time - confirm
    # this is intended.
    stacked.compile(
        optimizer=Adagrad(lr=lr),
        loss=mean_squared_error,
        metrics=[mean_absolute_percentage_error],
    )
    
    # Inputs are passed in the same order as the model's input list.
    stacked.fit(
        [X_cnn_train, X_dnn_train, X_rnn_train],
        Y_train,
        batch_size=256,
        epochs=ep,
        validation_data=([X_cnn_eval, X_dnn_eval, X_rnn_eval], Y_eval),
    )

Train on 5476809 samples, validate on 1395450 samples
Epoch 1/1
Train on 5476809 samples, validate on 1395450 samples
Epoch 1/1
Train on 5476809 samples, validate on 1395450 samples
Epoch 1/2
Epoch 2/2
Train on 5476809 samples, validate on 1395450 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Train on 5476809 samples, validate on 1395450 samples
Epoch 1/2
Epoch 2/2
Train on 5476809 samples, validate on 1395450 samples
Epoch 1/2
Epoch 2/2


In [109]:
stacked.save("cnn_rnn_dnn_rnn_stacked_10_06_2019.h5")

In [63]:
# Re-compile the stacked model with Adagrad at lr=0.03, MSE loss and MAPE as
# the reported metric.
stacked.compile(
    optimizer=Adagrad(lr=0.03),
    loss=mean_squared_error,
    metrics=[mean_absolute_percentage_error],
)

In [60]:
# One further training pass over the full training set, validated on the eval
# set.  `epochs` was left without a value (a syntax error); the output recorded
# for this cell shows "Epoch 1/1", so a single epoch is restored.
stacked.fit(
    [X_cnn_train, X_dnn_train, X_rnn_train],
    Y_train,
    batch_size=256,
    epochs=1,
    validation_data=([X_cnn_eval, X_dnn_eval, X_rnn_eval], Y_eval)
)

Train on 5476809 samples, validate on 1395450 samples
Epoch 1/1


<keras.callbacks.History at 0x1637eccc0>

In [None]:
stacked.save("cnn_dnn_rnn_stacked_09_06_2019_4.h5")

In [20]:
loss_and_metrics = stacked.evaluate([X_cnn_eval, X_dnn_eval, X_rnn_eval], Y_eval, batch_size=256)
loss_and_metrics



[0.0008501009462406551, 1172591.7531496603]

In [12]:
# `pickle` is not imported anywhere else in the notebook (its only import is
# commented out), so it is imported here to keep this cell runnable.
import pickle

# Bundle the per-branch feature frames and labels so they can be reloaded
# without rebuilding them from the CSV files.
train_dict = {
    'cnn': X_cnn_train,
    'dnn': X_dnn_train,
    'rnn': X_rnn_train,
    'label': Y_train,
}

eval_dict = {
    'cnn': X_cnn_eval,
    'dnn': X_dnn_eval,
    'rnn': X_rnn_eval,
    'label': Y_eval,
}

with open('./data/d2/123_train.pickle', 'wb') as handle:
    pickle.dump(train_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('./data/d2/123_eval.pickle', 'wb') as handle:
    pickle.dump(eval_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [102]:
stacked = load_model('cnn_dnn_rnn_stacked_09_06_2019_3.h5')

In [211]:
stacked.predict([X_cnn_eval[300:310], X_dnn_eval[300:310], X_rnn_eval[300:310]])

array([[0.2541865 , 0.2676375 , 0.27181014, 0.28884757, 0.30760366],
       [0.27643868, 0.2872815 , 0.29412508, 0.30842525, 0.3240075 ],
       [0.24497917, 0.25971454, 0.2709876 , 0.29159304, 0.31121558],
       [0.29200315, 0.30243778, 0.3147701 , 0.32960176, 0.3446563 ],
       [0.30414212, 0.31174782, 0.32292545, 0.33544898, 0.34540173],
       [0.32780027, 0.3327244 , 0.34020096, 0.34676278, 0.35551935],
       [0.38001123, 0.38249725, 0.3894377 , 0.3905192 , 0.39882237],
       [0.3552199 , 0.36558527, 0.37793422, 0.390386  , 0.40778542],
       [0.3427139 , 0.35851282, 0.37506768, 0.39656383, 0.4190245 ],
       [0.37691975, 0.39161086, 0.41168392, 0.4321497 , 0.453538  ]],
      dtype=float32)

In [212]:
Y_eval.iloc[300:310]

Unnamed: 0,demand@(0+1),demand@(0+2),demand@(0+3),demand@(0+4),demand@(0+5)
300,0.269682,0.189206,0.29367,0.292639,0.326831
301,0.189206,0.29367,0.292639,0.326831,0.406706
302,0.29367,0.292639,0.326831,0.406706,0.352528
303,0.292639,0.326831,0.406706,0.352528,0.322895
304,0.326831,0.406706,0.352528,0.322895,0.370209
305,0.406706,0.352528,0.322895,0.370209,0.381067
306,0.352528,0.322895,0.370209,0.381067,0.459268
307,0.322895,0.370209,0.381067,0.459268,0.465589
308,0.370209,0.381067,0.459268,0.465589,0.487954
309,0.381067,0.459268,0.465589,0.487954,0.493344


In [213]:
X_rnn_eval.iloc[29]

demand@(0-11)    0.318613
demand@(0-10)    0.293713
demand@(0-9)     0.277147
demand@(0-8)     0.329102
demand@(0-7)     0.308377
demand@(0-6)     0.368274
demand@(0-5)     0.387573
demand@(0-4)     0.332232
demand@(0-3)     0.307907
demand@(0-2)     0.389075
demand@(0-1)     0.345528
demand@(0-0)     0.411676
Name: 29, dtype: float64

In [214]:
X_cnn_eval[25:30]

Unnamed: 0,demand@(1-8),demand@(1-7),demand@(1-6),demand@(1-5),demand@(1-4),demand@(1-3),demand@(1-2),demand@(1-1),demand@(10),demand@(1+1),...,demand@(7-1),demand@(70),demand@(7+1),demand@(7+2),demand@(7+3),demand@(7+4),demand@(7+5),demand@(7+6),demand@(7+7),demand@(7+8)
25,0.261218,0.280608,0.309196,0.375059,0.347046,0.322985,0.340132,0.392787,0.427632,0.419994,...,0.296093,0.31459,0.37396,0.348863,0.312148,0.311417,0.308947,0.406285,0.37367,0.406333
26,0.280608,0.309196,0.375059,0.347046,0.322985,0.340132,0.392787,0.427632,0.419994,0.399908,...,0.31459,0.37396,0.348863,0.312148,0.311417,0.308947,0.406285,0.37367,0.406333,0.549123
27,0.309196,0.375059,0.347046,0.322985,0.340132,0.392787,0.427632,0.419994,0.399908,0.363887,...,0.37396,0.348863,0.312148,0.311417,0.308947,0.406285,0.37367,0.406333,0.549123,0.661163
28,0.375059,0.347046,0.322985,0.340132,0.392787,0.427632,0.419994,0.399908,0.363887,0.376743,...,0.348863,0.312148,0.311417,0.308947,0.406285,0.37367,0.406333,0.549123,0.661163,0.671647
29,0.347046,0.322985,0.340132,0.392787,0.427632,0.419994,0.399908,0.363887,0.376743,0.411826,...,0.312148,0.311417,0.308947,0.406285,0.37367,0.406333,0.549123,0.661163,0.671647,0.646513


In [215]:
stacked.predict([X_cnn_eval[25:30], X_dnn_eval[25:30], X_rnn_eval[25:30]])

array([[0.3739043 , 0.37927967, 0.39023626, 0.3994818 , 0.40683833],
       [0.35253328, 0.3610596 , 0.37091228, 0.38203496, 0.39531517],
       [0.39285436, 0.40234578, 0.41778904, 0.43090335, 0.44566423],
       [0.37204707, 0.38743484, 0.4099395 , 0.43080065, 0.45383987],
       [0.41780695, 0.43502188, 0.46457615, 0.4901215 , 0.5133593 ]],
      dtype=float32)

In [216]:
Y_eval.iloc[25:30]

Unnamed: 0,demand@(0+1),demand@(0+2),demand@(0+3),demand@(0+4),demand@(0+5)
25,0.307907,0.389075,0.345528,0.411676,0.494835
26,0.389075,0.345528,0.411676,0.494835,0.483818
27,0.345528,0.411676,0.494835,0.483818,0.658671
28,0.411676,0.494835,0.483818,0.658671,0.740603
29,0.494835,0.483818,0.658671,0.740603,0.843992


In [217]:
stacked.predict([X_cnn_eval[125:135], X_dnn_eval[125:135], X_rnn_eval[125:135]])

array([[0.5240963 , 0.54302776, 0.5796855 , 0.6072405 , 0.62893593],
       [0.5523381 , 0.5709355 , 0.60726094, 0.63434434, 0.65402985],
       [0.56316113, 0.5815647 , 0.6155863 , 0.64071435, 0.6612387 ],
       [0.5930171 , 0.6110299 , 0.64555275, 0.6690184 , 0.68900305],
       [0.62114555, 0.63863915, 0.66990227, 0.6903484 , 0.71029437],
       [0.67731756, 0.69916046, 0.7271508 , 0.7510941 , 0.7744848 ],
       [0.74033266, 0.7661967 , 0.79947734, 0.828082  , 0.85329926],
       [0.8318206 , 0.8489542 , 0.88036513, 0.8959981 , 0.9085451 ],
       [0.96503365, 0.96415144, 1.        , 0.99633217, 0.9833094 ],
       [1.        , 0.99386895, 1.        , 1.        , 1.        ]],
      dtype=float32)

In [218]:
Y_eval.iloc[125:135]

Unnamed: 0,demand@(0+1),demand@(0+2),demand@(0+3),demand@(0+4),demand@(0+5)
125,0.540479,0.523461,0.548006,0.588167,0.663514
126,0.523461,0.548006,0.588167,0.663514,0.739084
127,0.548006,0.588167,0.663514,0.739084,0.840679
128,0.588167,0.663514,0.739084,0.840679,1.0
129,0.663514,0.739084,0.840679,1.0,1.0
130,0.739084,0.840679,1.0,1.0,1.0
131,0.840679,1.0,1.0,1.0,1.0
132,1.0,1.0,1.0,1.0,1.0
133,1.0,1.0,1.0,1.0,1.0
134,1.0,1.0,1.0,1.0,1.0


In [219]:
stacked.predict([X_cnn_eval[-10:], X_dnn_eval[-10:], X_rnn_eval[-10:]])

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)

In [220]:
Y_eval.tail(10)

Unnamed: 0,demand@(0+1),demand@(0+2),demand@(0+3),demand@(0+4),demand@(0+5)
1395440,0.0,0.0,0.0,0.0,0.0
1395441,0.0,0.0,0.0,0.0,0.0
1395442,0.0,0.0,0.0,0.0,0.0
1395443,0.0,0.0,0.0,0.0,0.0
1395444,0.0,0.0,0.0,0.0,0.0
1395445,0.0,0.0,0.0,0.0,0.0
1395446,0.0,0.0,0.0,0.0,0.0
1395447,0.0,0.0,0.0,0.0,0.0
1395448,0.0,0.0,0.0,0.0,0.0
1395449,0.0,0.0,0.0,0.0,0.0


In [221]:
stacked.predict([X_cnn_eval[:5], X_dnn_eval[:5], X_rnn_eval[:5]])

array([[0.2184219 , 0.22323072, 0.24155346, 0.25602198, 0.25786874],
       [0.21673015, 0.2197353 , 0.23252597, 0.24192938, 0.24345204],
       [0.22830278, 0.22634163, 0.23209286, 0.2343044 , 0.2292926 ],
       [0.23349893, 0.23066747, 0.23215142, 0.23120135, 0.22652161],
       [0.24018976, 0.23645833, 0.23218372, 0.22796145, 0.22334662]],
      dtype=float32)

In [222]:
Y_eval.head()

Unnamed: 0,demand@(0+1),demand@(0+2),demand@(0+3),demand@(0+4),demand@(0+5)
0,0.184645,0.206627,0.232386,0.255655,0.24796
1,0.206627,0.232386,0.255655,0.24796,0.275323
2,0.232386,0.255655,0.24796,0.275323,0.238671
3,0.255655,0.24796,0.275323,0.238671,0.179991
4,0.24796,0.275323,0.238671,0.179991,0.191984


In [223]:
preds = stacked.predict([X_cnn_eval, X_dnn_eval, X_rnn_eval])
count_rmse(preds, Y_eval)

demand@(0+1)    0.024055
demand@(0+2)    0.027821
demand@(0+3)    0.030030
demand@(0+4)    0.031813
demand@(0+5)    0.033351
dtype: float64

In [52]:
# qp03wc
# qp03wc_train_df = train_df[train_df.geohash6 == 'qp03wc']
qp03wc_eval_df = eval_df[eval_df.geohash6 == 'qp03wc']

In [54]:
qp03wc_eval_df.columns

Index(['order', 'geohash6', 'distance_poi1', 'distance_poi2', 'distance_poi3',
       'distance_poi4', 'min_distance_poi', 'demand@(1-8)', 'demand@(1-7)',
       'demand@(1-6)',
       ...
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object', length=167)

In [115]:
cnn_features, dnn_features, rnn_features

(['demand@(1-8)',
  'demand@(1-7)',
  'demand@(1-6)',
  'demand@(1-5)',
  'demand@(1-4)',
  'demand@(1-3)',
  'demand@(1-2)',
  'demand@(1-1)',
  'demand@(10)',
  'demand@(1+8)',
  'demand@(1+7)',
  'demand@(1+6)',
  'demand@(1+5)',
  'demand@(1+4)',
  'demand@(1+3)',
  'demand@(1+2)',
  'demand@(1+1)',
  'demand@(2-8)',
  'demand@(2-7)',
  'demand@(2-6)',
  'demand@(2-5)',
  'demand@(2-4)',
  'demand@(2-3)',
  'demand@(2-2)',
  'demand@(2-1)',
  'demand@(20)',
  'demand@(2+8)',
  'demand@(2+7)',
  'demand@(2+6)',
  'demand@(2+5)',
  'demand@(2+4)',
  'demand@(2+3)',
  'demand@(2+2)',
  'demand@(2+1)',
  'demand@(3-8)',
  'demand@(3-7)',
  'demand@(3-6)',
  'demand@(3-5)',
  'demand@(3-4)',
  'demand@(3-3)',
  'demand@(3-2)',
  'demand@(3-1)',
  'demand@(30)',
  'demand@(3+8)',
  'demand@(3+7)',
  'demand@(3+6)',
  'demand@(3+5)',
  'demand@(3+4)',
  'demand@(3+3)',
  'demand@(3+2)',
  'demand@(3+1)',
  'demand@(4-8)',
  'demand@(4-7)',
  'demand@(4-6)',
  'demand@(4-5)',
  'demand@(4-

In [117]:
# Larger variant of the stacked model: deeper convolution stack, and every
# branch ends in Dropout -> Dense(128) -> Dense(16) before the merge.
cnn_inputs = Input(shape=(len(cnn_features),))
dnn_inputs = Input(shape=(len(dnn_features),))
rnn_inputs = Input(shape=(len(rnn_features),))

# Convolutional branch over the previous-days windows, reshaped to one row
# per previous day with CNN_OFFSET*2+1 demand values each.
cnn = Reshape((PREV_DAY, CNN_OFFSET*2+1, ))(cnn_inputs)
'''depending on how you view your data, i prefer it to be (batch, steps, channels/features)'''
cnn = Conv1D(16, 5, activation=relu, padding='same')(cnn)
cnn = Conv1D(16, 3, activation=relu, padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Conv1D(32, 3, activation=relu, padding='same')(cnn)
cnn = Conv1D(32, 2, activation=relu, padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Conv1D(32, 3, activation=relu, padding='same')(cnn)
cnn = Conv1D(32, 2, activation=relu, padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Flatten()(cnn)
cnn = Dropout(0.4)(cnn)
cnn = Dense(128, activation=relu)(cnn)
cnn = Dense(16, activation=relu)(cnn)
cnn = Model(inputs=cnn_inputs, outputs=cnn)

# Second recurrent branch: reads the same cnn_inputs tensor through two GRUs.
rnn2 = Reshape((PREV_DAY, CNN_OFFSET*2 + 1, ))(cnn_inputs)
rnn2 = GRU(32, return_sequences=True)(rnn2)
rnn2 = GRU(16)(rnn2)
rnn2 = Dropout(0.4)(rnn2)
rnn2 = Dense(128, activation=relu)(rnn2)
rnn2 = Dense(16, activation=relu)(rnn2)
rnn2 = Model(inputs=cnn_inputs, outputs=rnn2)

# Dense branch over the static features.
dnn = Dense(128, activation=relu)(dnn_inputs)
dnn = Dropout(rate=0.2)(dnn)
dnn = Dense(16, activation=relu)(dnn)
dnn = Model(inputs=dnn_inputs, outputs=dnn)

# Recurrent branch over the current-day history, one value per time step.
rnn = Reshape((len(rnn_features), 1, ))(rnn_inputs) # i have only one feature to feed into RNN
rnn = GRU(32, return_sequences=True)(rnn)
rnn = GRU(16)(rnn)
rnn = Dropout(0.4)(rnn)
rnn = Dense(128, activation=relu)(rnn)
rnn = Dense(16, activation=relu)(rnn)
rnn = Model(inputs=rnn_inputs, outputs=rnn)

# Merge the four 16-unit branch outputs.
stacked = concatenate([cnn.output, dnn.output, rnn.output, rnn2.output])

# Regression head with batch normalisation and dropout ahead of the dense layers.
stacked = BatchNormalization()(stacked)
stacked = Dropout(0.5)(stacked)
stacked = Dense(512, activation=relu)(stacked)
stacked = Dropout(rate=0.4)(stacked)
stacked = Dense(128, activation=relu)(stacked)
# stacked = Dropout(rate=0.3)(stacked)
stacked = Dense(16, activation=relu)(stacked)

'''Linear vs clipped relu vs sigmoid vs hard_sigmoid'''
# stacked = Dense(5, activation=linear)(stacked)
# this is nice, it is linear clipped, but i don't know how to make it work
# stacked = Dense(5, activation=clipped_relu)(stacked)
# stacked = Dense(5, activation=sigmoid)(stacked)
# Five outputs, one per label column.
stacked = Dense(5, activation=hard_sigmoid)(stacked)

# Only three inputs are listed: rnn2 shares cnn.input with the CNN branch.
stacked = Model(inputs=[cnn.input, dnn.input, rnn.input], outputs=stacked)

In [118]:
stacked.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_49 (InputLayer)           (None, 119)          0                                            
__________________________________________________________________________________________________
reshape_33 (Reshape)            (None, 7, 17)        0           input_49[0][0]                   
__________________________________________________________________________________________________
conv1d_25 (Conv1D)              (None, 7, 16)        1376        reshape_33[0][0]                 
__________________________________________________________________________________________________
conv1d_26 (Conv1D)              (None, 7, 16)        784         conv1d_25[0][0]                  
__________________________________________________________________________________________________
max_poolin

In [119]:
# BEST LEARNING RATE  ABOUT 0.01 to 0.0001
# Manual training schedule: each entry is (learning rate, number of epochs),
# run in the order listed.
# NOTE(review): the first stage uses 0.03, which is above the range quoted above.
hp = [
    (0.03, 1),
    (0.001, 1),
    (0.005, 2),
    (0.001, 4),
    (0.0005, 2),
    (0.0001, 2),
]

for lr, ep in hp:
    # A new Adagrad instance is created and the model re-compiled for every stage.
    stacked.compile(
        optimizer=Adagrad(lr=lr),
        loss=mean_squared_error,
        metrics=[mean_absolute_percentage_error],
    )
    
    # Inputs are passed in the same order as the model's input list.
    stacked.fit(
        [X_cnn_train, X_dnn_train, X_rnn_train],
        Y_train,
        batch_size=256,
        epochs=ep,
        validation_data=([X_cnn_eval, X_dnn_eval, X_rnn_eval], Y_eval),
    )

Train on 5476809 samples, validate on 1395450 samples
Epoch 1/1
 405248/5476809 [=>............................] - ETA: 25:48 - loss: 0.0190 - mean_absolute_percentage_error: 134576.8789

E0610 20:12:10.476016 4441658816 ultratb.py:149] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/anaconda3/envs/grab/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-119-c88c95d98cf3>", line 23, in <module>
    validation_data=([X_cnn_eval, X_dnn_eval, X_rnn_eval], Y_eval),
  File "/anaconda3/envs/grab/lib/python3.7/site-packages/keras/engine/training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "/anaconda3/envs/grab/lib/python3.7/site-packages/keras/engine/training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "/anaconda3/envs/grab/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "/anaconda3/envs/grab/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
  File "/anaconda3/envs/grab/lib/python3.7/site-packages/tensorflow/python/cli

KeyboardInterrupt: 

In [None]:
del cnn_inputs, dnn_inputs, rnn_inputs
del cnn, dnn, rnn
del stacked
gc.collect()

In [142]:
# Rebuild of the small stacked model (CNN + dense + two GRU branches); the
# Conv1D layers here name their activation by string ('relu').
cnn_inputs = Input(shape=(len(cnn_features),))
dnn_inputs = Input(shape=(len(dnn_features),))
rnn_inputs = Input(shape=(len(rnn_features),))

# Convolutional branch over the previous-days windows, reshaped to one row
# per previous day with CNN_OFFSET*2+1 demand values each.
cnn = Reshape((PREV_DAY, CNN_OFFSET*2+1, ))(cnn_inputs)
'''depending on how you view your data, i prefer it to be (batch, steps, features)'''
cnn = Conv1D(16, 5, activation='relu', padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Conv1D(8, 3, activation='relu', padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Flatten()(cnn)
cnn = Model(inputs=cnn_inputs, outputs=cnn)

# Second recurrent branch: reads the same cnn_inputs tensor through two GRUs.
rnn2 = Reshape((PREV_DAY, CNN_OFFSET*2 + 1, ))(cnn_inputs)
rnn2 = GRU(32, return_sequences=True)(rnn2)
rnn2 = GRU(16)(rnn2)
rnn2 = Model(inputs=cnn_inputs, outputs=rnn2)

# Dense branch over the static features.
dnn = Dense(32, activation=relu)(dnn_inputs)
# dnn = Dropout(rate=0.2)(dnn)
dnn = Dense(16, activation=relu)(dnn)
dnn = Model(inputs=dnn_inputs, outputs=dnn)

# Recurrent branch over the current-day history, one value per time step.
rnn = Reshape((len(rnn_features), 1, ))(rnn_inputs) # i have only one feature to feed into RNN
rnn = GRU(32, return_sequences=True)(rnn)
rnn = GRU(16)(rnn)
rnn = Model(inputs=rnn_inputs, outputs=rnn)

# Merge the four branch outputs and regress through a dense head.
stacked = concatenate([cnn.output, dnn.output, rnn.output, rnn2.output])

# stacked = BatchNormalization()(stacked)
stacked = Dense(64, activation=relu)(stacked)
# stacked = Dropout(rate=0.4)(stacked)
stacked = Dense(32, activation=relu)(stacked)
# stacked = Dropout(rate=0.3)(stacked)
stacked = Dense(16, activation=relu)(stacked)

'''Linear vs clipped relu vs sigmoid vs hard_sigmoid'''
# stacked = Dense(5, activation=linear)(stacked)
# this is nice, it is linear clipped, but i don't know how to make it work
# stacked = Dense(5, activation=clipped_relu)(stacked)
# stacked = Dense(5, activation=sigmoid)(stacked)
# Five outputs, one per label column.
stacked = Dense(5, activation=hard_sigmoid)(stacked)

# Only three inputs are listed: rnn2 shares cnn.input with the CNN branch.
stacked = Model(inputs=[cnn.input, dnn.input, rnn.input], outputs=stacked)

In [143]:
stacked.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_55 (InputLayer)           (None, 119)          0                                            
__________________________________________________________________________________________________
reshape_39 (Reshape)            (None, 7, 17)        0           input_55[0][0]                   
__________________________________________________________________________________________________
conv1d_33 (Conv1D)              (None, 7, 16)        1376        reshape_39[0][0]                 
__________________________________________________________________________________________________
max_pooling1d_9 (MaxPooling1D)  (None, 4, 16)        0           conv1d_33[0][0]                  
__________________________________________________________________________________________________
input_57 (

In [144]:
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    """Keras loss: square root of the mean squared difference between ``y_pred`` and ``y_true``.

    ``K.mean`` is called without an axis, so the mean is taken over every
    element of the squared-difference tensor before the square root.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

In [145]:
# BEST LEARNING RATE  ABOUT 0.01 to 0.0001
# Manual training schedule: each entry is (learning rate, number of epochs),
# run in the order listed.
# NOTE(review): the first stage uses 0.03, which is above the range quoted above.
hp = [
    (0.03, 1),
    (0.001, 1),
    (0.005, 2),
    (0.001, 4),
    (0.0005, 2),
    (0.0001, 2),
]

for lr, ep in hp:
    # This run uses RMSprop with the RMSE loss; a new optimizer instance is
    # created and the model re-compiled for every stage.
    stacked.compile(
        optimizer=RMSprop(lr=lr),
        loss=root_mean_squared_error,
        metrics=[mean_absolute_percentage_error],
    )
    
    # Inputs are passed in the same order as the model's input list.
    stacked.fit(
        [X_cnn_train, X_dnn_train, X_rnn_train],
        Y_train,
        batch_size=256,
        epochs=ep,
        validation_data=([X_cnn_eval, X_dnn_eval, X_rnn_eval], Y_eval),
    )

Train on 5476809 samples, validate on 1395450 samples
Epoch 1/1
 703232/5476809 [==>...........................] - ETA: 10:15 - loss: 0.1357 - mean_absolute_percentage_error: 167736.5576

KeyboardInterrupt: 

In [149]:
stacked = load_model('./cnn_rnn_dnn_rnn_stacked_10_06_2019.h5')

In [124]:
def count_rmse(predictions, targets):
    """Root-mean-squared error between ``predictions`` and ``targets``.

    When the element-wise squared error is a DataFrame (for example ndarray
    predictions against DataFrame targets) the mean is taken down each column
    and a Series with one RMSE per column is returned.  The axis is given
    explicitly because ``np.mean`` on a DataFrame forwards ``axis=None`` to
    ``DataFrame.mean``, whose result for ``axis=None`` differs between pandas
    versions.

    For any other input the result is ``np.sqrt(np.mean(...))`` of the squared
    error, exactly as before.
    """
    squared_error = (predictions - targets) ** 2
    if isinstance(squared_error, pd.DataFrame):
        return np.sqrt(squared_error.mean(axis=0))
    return np.sqrt(np.mean(squared_error))

In [150]:
preds = stacked.predict([X_cnn_eval, X_dnn_eval, X_rnn_eval])

In [151]:
count_rmse(preds, Y_eval)

demand@(0+1)    0.023942
demand@(0+2)    0.027729
demand@(0+3)    0.029978
demand@(0+4)    0.031624
demand@(0+5)    0.033216
dtype: float64

In [157]:
len(cnn_features),len(dnn_features),len(rnn_features)

(119, 29, 12)

In [167]:
del cnn
gc.collect()

171

In [174]:
# Stand-alone model over the previous-days windows only.
# NOTE: despite the `cnn` name, this model has no convolution layers; it is two
# stacked GRUs followed by a dense head.
cnn_inputs = Input(shape=(len(cnn_features),))

# One row per previous day with CNN_OFFSET*2 + 1 demand values each.
cnn = Reshape((PREV_DAY, CNN_OFFSET*2 + 1, ))(cnn_inputs)
cnn = GRU(32, return_sequences=True)(cnn)
cnn = GRU(16)(cnn)

cnn = Dense(64, activation=relu)(cnn)
cnn = Dense(32, activation=relu)(cnn)
cnn = Dense(16, activation=relu)(cnn)
# Five outputs, one per label column.
cnn = Dense(5, activation=hard_sigmoid)(cnn)

cnn = Model(inputs=cnn_inputs, outputs=cnn)

In [175]:
# Compile the stand-alone model with Adagrad at lr=0.0003, MSE loss and MAPE
# as the reported metric.
cnn.compile(
    optimizer=Adagrad(lr=0.0003),
    loss=mean_squared_error,
    metrics=[mean_absolute_percentage_error],
)

# Single epoch on the previous-days windows only, validated on the eval set.
cnn.fit(
    X_cnn_train,
    Y_train,
    batch_size=256,
    epochs=1,
    validation_data=(X_cnn_eval, Y_eval),
)

Train on 5476809 samples, validate on 1395450 samples
Epoch 1/1
 736256/5476809 [===>..........................] - ETA: 5:37 - loss: 0.1253 - mean_absolute_percentage_error: 174960898.5007

KeyboardInterrupt: 

In [176]:
preds = cnn.predict(X_cnn_eval.iloc[:5000])

In [177]:
preds

array([[0.        , 0.        , 0.01658908, 0.        , 0.        ],
       [0.        , 0.        , 0.01594695, 0.        , 0.        ],
       [0.        , 0.        , 0.01508364, 0.        , 0.        ],
       ...,
       [0.06107077, 0.08459067, 0.11330804, 0.        , 0.        ],
       [0.05485138, 0.07905954, 0.10699213, 0.        , 0.        ],
       [0.0517309 , 0.07629088, 0.10326871, 0.        , 0.        ]],
      dtype=float32)

In [153]:
df.head()

Unnamed: 0,order,geohash6,hour,demand
0,0,qp03wc,0,0.054858
1,1,qp03wc,0,0.086209
2,2,qp03wc,0,0.050739
3,3,qp03wc,0,0.075174
4,4,qp03wc,1,0.062867


In [155]:
# Export the raw rows with order > 50*96, without the 'order' column.
(
    df.loc[df.order > 50*96]
    .drop(columns=['order'])
    .to_csv('data/test_raw.csv', index=False)
)