In [32]:
import numpy as np
import pandas as pd
import geohash
import gc
from geopy import distance

from keras.models import *
from keras.layers import *
from keras.activations import *
from keras.optimizers import *
from keras.losses import *
import tensorflow as tf
print(tf.__version__)

1.14.1-dev20190607


In [6]:
df = pd.read_csv('./data/formatted_time_series_2.csv')
df.to_pickle('./data/raw_df.pkl')
train_df = df[df.order <= 50*96]
train_df.to_pickle('./data/train_df.pkl')
eval_df = df[df.order > 50*96]
eval_df.to_pickle('./data/eval_df.pkl')

In [6]:
PREV_DAY = 7 # use previous 7 days data
PREV_DAY_PRE_OFFSET = 7 # 2 hours
PREV_DAY_POST_OFFSET = 8 # 2 hours
RNN_OFFSET = 11 # 3 hours gap
LABELS_OFFSET = 5 # (T+5)
MIN_ORDER_VAL = PREV_DAY_PRE_OFFSET + PREV_DAY*96
MAX_ORDER_VAL = MAX_ORDER - LABELS_OFFSET
POI_GEOHASH = ['qp09d8', 'qp03xx', 'qp03wf']

In [17]:
def get_distance_by_gh(gh1, gh2):
    coor1 = geohash.decode(gh1)
    coor2 = geohash.decode(gh2)
    return distance.distance(coor1, coor2).km

In [27]:
train_df['distance_from_qp09d8'] = train_df.geohash6.apply(lambda x: get_distance_by_gh(x, 'qp09d8'))
train_df['distance_from_qp03xx'] = train_df.geohash6.apply(lambda x: get_distance_by_gh(x, 'qp03xx'))
train_df['distance_from_qp03wf'] = train_df.geohash6.apply(lambda x: get_distance_by_gh(x, 'qp03wf'))

In [28]:
eval_df['distance_from_qp09d8'] = eval_df.geohash6.apply(lambda x: get_distance_by_gh(x, 'qp09d8'))
eval_df['distance_from_qp03xx'] = eval_df.geohash6.apply(lambda x: get_distance_by_gh(x, 'qp03xx'))
eval_df['distance_from_qp03wf'] = eval_df.geohash6.apply(lambda x: get_distance_by_gh(x, 'qp03wf'))

In [38]:
train_df.to_pickle('./data/train_df_with_poi_distance.pkl')
eval_df.to_pickle('./data/eval_df_with_poi_distance.pkl')

In [39]:
del train_df
del eval_df
gc.collect()

185

In [2]:
train_df = pd.read_pickle('./data/train_df_with_poi_distance.pkl')
eval_df = pd.read_pickle('./data/eval_df_with_poi_distance.pkl')

In [13]:
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

def count_rmse(predictions, targets):
    return np.sqrt(np.mean((predictions-targets)**2))

In [14]:
# RAW DNN
target_labels = []
day = 0
for out_off in range(1, LABELS_OFFSET+1):
    target_labels.append('demand@({0}+{1})'.format(day, out_off))

to_drop = target_labels + ['geohash6'] + ['order']
X_train = train_df.drop(columns=to_drop)
X_eval = eval_df.drop(columns=to_drop)
Y_train = train_df[target_labels]
Y_eval = eval_df[target_labels]

dnn_features = X_train.columns

In [23]:
dnn_inputs = Input(shape=(len(dnn_features),))

dnn = Dense(256, activation=relu)(dnn_inputs)
dnn = Dense(128, activation=relu)(dnn)
dnn = Dense(64, activation=relu)(dnn)
dnn = Dense(32, activation=relu)(dnn)
dnn = Dense(5, activation=hard_sigmoid)(dnn)

dnn = Model(inputs=dnn_inputs, outputs=dnn)

dnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 199)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 256)               51200     
_________________________________________________________________
dense_17 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_18 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_19 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_20 (Dense)             (None, 5)                 165       
Total params: 94,597
Trainable params: 94,597
Non-trainable params: 0
_________________________________________________________________


In [24]:
hp = [(0.003, 1),(0.001, 1),(0.005, 2),(0.001, 1),(0.0005, 2),(0.0001, 2)]

for lr, ep in hp:
    dnn.compile(optimizer=Adagrad(lr=lr),loss=mean_squared_error,metrics=[root_mean_squared_error],)
    dnn.fit(X_train,Y_train,batch_size=512,epochs=ep,validation_data=(X_eval, Y_eval))

Train on 5478138 samples, validate on 1395450 samples
Epoch 1/1
Train on 5478138 samples, validate on 1395450 samples
Epoch 1/1
Train on 5478138 samples, validate on 1395450 samples
Epoch 1/2
Epoch 2/2
Train on 5478138 samples, validate on 1395450 samples
Epoch 1/1
Train on 5478138 samples, validate on 1395450 samples
Epoch 1/2
Epoch 2/2
Train on 5478138 samples, validate on 1395450 samples
Epoch 1/2
Epoch 2/2


In [25]:
preds = dnn.predict(X_eval)

In [26]:
count_rmse(preds, Y_eval)

demand@(0+1)    0.024521
demand@(0+2)    0.028006
demand@(0+3)    0.030147
demand@(0+4)    0.031731
demand@(0+5)    0.033152
dtype: float64

In [28]:
prev_six_days_feat = []
for day in range(1, PREV_DAY):
    for cnn_off in range(PREV_DAY_PRE_OFFSET, -1, -1):
        prev_six_days_feat.append('demand@({0}-{1})'.format(day, cnn_off))
    for cnn_off in range(1, LABELS_OFFSET + PREV_DAY_POST_OFFSET + 1):
        prev_six_days_feat.append('demand@({0}+{1})'.format(day, cnn_off))
X_train_six_day = train_df[prev_six_days_feat]
X_eval_six_day = eval_df[prev_six_days_feat]

In [29]:
last_week_feat = [] # this also acts as a `attention` features
for day in range(PREV_DAY, PREV_DAY+1):
    for cnn_off in range(PREV_DAY_PRE_OFFSET, -1, -1):
        last_week_feat.append('demand@({0}-{1})'.format(day, cnn_off))
    for cnn_off in range(1, LABELS_OFFSET + PREV_DAY_POST_OFFSET + 1):
        last_week_feat.append('demand@({0}+{1})'.format(day, cnn_off))
X_train_last_week = train_df[last_week_feat]
X_eval_last_week = eval_df[last_week_feat]

In [30]:
prev_two_hours_feat = []
for day in range(1): # day = 0
    for rnn_off in range(RNN_OFFSET, -1, -1):
        prev_two_hours_feat.append('demand@({0}-{1})'.format(day, rnn_off))
        for poi_gh in POI_GEOHASH:
            prev_two_hours_feat.append('demand@({0}-{1}-{2})'.format(day, rnn_off, poi_gh))
X_train_two_hour = train_df[prev_two_hours_feat]
X_eval_two_hour = eval_df[prev_two_hours_feat]

In [31]:
time_independent_feat = ['hour', 'distance_from_qp09d8', 'distance_from_qp03xx', 'distance_from_qp03wf']
X_train_time_independent = train_df[time_independent_feat]
X_eval_time_independent = eval_df[time_independent_feat]

In [35]:
len(prev_six_days_feat) + len(last_week_feat)

147

In [48]:
six_day_inputs = Input(shape=(len(prev_six_days_feat),))
last_week_inputs = Input(shape=(len(last_week_feat),))
two_hour_inputs = Input(shape=(len(prev_two_hours_feat),))
time_independent_inputs = Input(shape=(len(time_independent_feat),))

daily_cnn = concatenate([six_day_inputs, last_week_inputs])
daily_cnn = Reshape((PREV_DAY, PREV_DAY_PRE_OFFSET + LABELS_OFFSET + PREV_DAY_POST_OFFSET + 1))(daily_cnn)
daily_cnn = Conv1D(16, 5, activation=relu, padding='same')(daily_cnn)
daily_cnn = MaxPool1D(2, 2, padding='same')(daily_cnn)
daily_cnn = Conv1D(16, 3, activation=relu, padding='same')(daily_cnn)
daily_cnn = Flatten()(daily_cnn)
daily_cnn = Model(inputs=[six_day_inputs, last_week_inputs], outputs=daily_cnn)

hourly_cnn = Reshape((RNN_OFFSET+1, len(POI_GEOHASH)+1))(two_hour_inputs)
hourly_cnn = Conv1D(8, 3, activation=relu, padding='same')(hourly_cnn)
hourly_cnn = MaxPool1D(2, 2, padding='same')(hourly_cnn)
hourly_cnn = Conv1D(16, 2, activation=relu, padding='same')(hourly_cnn)
hourly_cnn = Flatten()(hourly_cnn)
hourly_cnn = Model(inputs=two_hour_inputs, outputs=hourly_cnn)

stacked = concatenate([daily_cnn.output, hourly_cnn.output, last_week_inputs, time_independent_inputs])

stacked = Dense(256, activation=relu)(stacked)
stacked = Dense(128, activation=relu)(stacked)
stacked = Dense(32, activation=relu)(stacked)
stacked = Dense(5, activation=relu)(stacked)
stacked = Model(inputs=[last_week_inputs, six_day_inputs, two_hour_inputs, time_independent_inputs], outputs=stacked)

stacked.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, 126)          0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           (None, 21)           0                                            
__________________________________________________________________________________________________
concatenate_12 (Concatenate)    (None, 147)          0           input_33[0][0]                   
                                                                 input_34[0][0]                   
__________________________________________________________________________________________________
input_35 (InputLayer)           (None, 48)           0                                            
__________

In [49]:
lr = 0.0005
ep = 1
stacked.compile(
    optimizer=Adagrad(lr=lr),
    loss=mean_squared_error,
    metrics=[root_mean_squared_error],
)

stacked.fit(
    [X_train_last_week, X_train_six_day, X_train_two_hour, X_train_time_independent],
    Y_train,
    batch_size=512,
    epochs=ep,
    validation_data=([X_eval_last_week, X_eval_six_day, X_eval_two_hour, X_eval_time_independent], Y_eval),
)

Train on 5478138 samples, validate on 1395450 samples
Epoch 1/1

KeyboardInterrupt: 

In [45]:
preds = stacked.predict([X_eval_last_week, X_eval_six_day, X_eval_two_hour, X_eval_time_independent])
count_rmse(preds, Y_eval)

demand@(0+1)    0.038151
demand@(0+2)    0.038989
demand@(0+3)    0.040197
demand@(0+4)    0.040909
demand@(0+5)    0.041560
dtype: float64

In [60]:
x = X_eval_two_hour['demand@(0-0)'].values
y1 = Y_eval['demand@(0+1)'].values
y2 = Y_eval['demand@(0+2)'].values
y3 = Y_eval['demand@(0+3)'].values
y4 = Y_eval['demand@(0+4)'].values
y5 = Y_eval['demand@(0+5)'].values

In [57]:
x.shape

(1395450,)

In [61]:
def custom_rmse(x,y):
    return np.sum(np.absolute(x - y))/x.shape[0]

In [62]:
custom_rmse(x, y1), custom_rmse(x, y2), custom_rmse(x, y3), custom_rmse(x, y4), custom_rmse(x, y5)

(0.01383062145355777,
 0.01687970719439127,
 0.01896614094473233,
 0.020823411228895264,
 0.02271628480319483)

In [21]:
preds[:20]

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)

In [22]:
Y_eval.iloc[:20]

Unnamed: 0,demand@(0+1),demand@(0+2),demand@(0+3),demand@(0+4),demand@(0+5)
4122,0.184645,0.206627,0.232386,0.255655,0.24796
4123,0.206627,0.232386,0.255655,0.24796,0.275323
4124,0.232386,0.255655,0.24796,0.275323,0.238671
4125,0.255655,0.24796,0.275323,0.238671,0.179991
4126,0.24796,0.275323,0.238671,0.179991,0.191984
4127,0.275323,0.238671,0.179991,0.191984,0.243441
4128,0.238671,0.179991,0.191984,0.243441,0.269081
4129,0.179991,0.191984,0.243441,0.269081,0.207029
4130,0.191984,0.243441,0.269081,0.207029,0.245731
4131,0.243441,0.269081,0.207029,0.245731,0.256975


In [None]:
# train_df = df[df.order <= 50*96]
# eval_df = df[df.order > 50*96]

In [None]:
cnn_inputs = Input(shape=(len(cnn_features),))
dnn_inputs = Input(shape=(len(dnn_features),))
rnn_inputs = Input(shape=(len(rnn_features),))

cnn = Reshape((PREV_DAY, CNN_OFFSET*2+1, ))(cnn_inputs)
'''depending on how you view your data, i prefer it to be (batch, steps, features)'''
cnn = Conv1D(16, 5, activation=relu, padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Conv1D(8, 3, activation=relu, padding='same')(cnn)
cnn = MaxPool1D(2, 2, padding='same')(cnn)
cnn = Flatten()(cnn)
cnn = Model(inputs=cnn_inputs, outputs=cnn)

rnn2 = Reshape((PREV_DAY, CNN_OFFSET*2 + 1, ))(cnn_inputs)
rnn2 = GRU(32, return_sequences=True)(rnn2)
rnn2 = GRU(16)(rnn2)
rnn2 = Model(inputs=cnn_inputs, outputs=rnn2)

dnn = Dense(32, activation=relu)(dnn_inputs)
# dnn = Dropout(rate=0.2)(dnn)
dnn = Dense(16, activation=relu)(dnn)
dnn = Model(inputs=dnn_inputs, outputs=dnn)

rnn = Reshape((len(rnn_features), 1, ))(rnn_inputs) # i have only one feature to feed into RNN
rnn = GRU(32, return_sequences=True)(rnn)
rnn = GRU(16)(rnn)
rnn = Model(inputs=rnn_inputs, outputs=rnn)

stacked = concatenate([cnn.output, dnn.output, rnn.output, rnn2.output])

# stacked = BatchNormalization()(stacked)
stacked = Dense(64, activation=relu)(stacked)
# stacked = Dropout(rate=0.4)(stacked)
stacked = Dense(32, activation=relu)(stacked)
# stacked = Dropout(rate=0.3)(stacked)
stacked = Dense(16, activation=relu)(stacked)

'''Linear vs clipped relu vs sigmoid vs hard_sigmoid'''
# stacked = Dense(5, activation=linear)(stacked)
# this is nice, it is linear clipped, but i don't know how to make it work
# stacked = Dense(5, activation=clipped_relu)(stacked)
# stacked = Dense(5, activation=sigmoid)(stacked)
stacked = Dense(5, activation=hard_sigmoid)(stacked)

stacked = Model(inputs=[cnn.input, dnn.input, rnn.input], outputs=stacked)