# Load Data

In [1]:
import numpy as np
data = np.load(open('./TaxiBJ21.npy', 'rb'))  # the data file is in the same folder

In [2]:
data.shape

(4272, 2, 32, 32)

Here we simply use the historical data as the input features. External factors, e.g., weather, calendar info, are not used yet. 

In [3]:
# history_length is timestep of the historical window size
# predict_length is the prediction horizon
def generate_data(data, history_length, predict_length):
    history_data = []
    predict_data = []
    total_length = data.shape[0]
    for end_idx in range(history_length + predict_length, total_length):
        predict_frames = data[end_idx-predict_length:end_idx]
        history_frames = data[end_idx-predict_length-history_length:end_idx-predict_length]
        history_data.append(history_frames)
        predict_data.append(predict_frames)
    history_data = np.stack(history_data)
    predict_data = np.stack(predict_data)
    return history_data, predict_data

In [4]:
history_length = 4 * 2  # use the last 4 hours data
predict_length = 1  # predict the next 30 minutes
history_data, predict_data = generate_data(data, history_length, predict_length)
print(history_data.shape)
print(predict_data.shape)

(4263, 8, 2, 32, 32)
(4263, 1, 2, 32, 32)


Split the data with train:valid:test = 8:1:1. Simply, both the validation and testing sets have a data range of 9 days. The other data are used as training set.

In [5]:
history_data_train = history_data[:-18 * 48]
predict_data_train = predict_data[:-18 * 48]

history_data_valid = history_data[-18 * 48:-9 * 48]
predict_data_valid = predict_data[-18 * 48:-9 * 48]

history_data_test = history_data[-9 * 48:]
predict_data_test = predict_data[-9 * 48:]

# A Baseline MLP model

Reshape the data format:

In [6]:
history_data_train = history_data_train.reshape(history_data_train.shape[0], -1)
history_data_train.shape

(3399, 16384)

In [7]:
predict_data_train = predict_data_train.reshape(predict_data_train.shape[0], -1)
predict_data_train.shape

(3399, 2048)

In [8]:
history_data_valid = history_data_valid.reshape(history_data_valid.shape[0], -1)
history_data_valid.shape

(432, 16384)

In [9]:
predict_data_valid = predict_data_valid.reshape(predict_data_valid.shape[0], -1)
predict_data_valid.shape

(432, 2048)

In [10]:
history_data_test = history_data_test.reshape(history_data_test.shape[0], -1)
history_data_test.shape

(432, 16384)

In [11]:
predict_data_test = predict_data_test.reshape(predict_data_test.shape[0], -1)
predict_data_test.shape

(432, 2048)

Build the model:

In [12]:
import tensorflow as tf


model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_dim=history_data_train.shape[-1]),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(predict_data_train.shape[-1], activation='sigmoid')
])

In [13]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               8389120   
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_4 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_5 (Dense)              (None, 2048)              1050624   
Total params: 10,490,368
Trainable params: 10,490,368
Non-trainable params: 0
____________________________________________

In [14]:
model.compile(loss="mse", optimizer="adam")

In [15]:
model.fit(
    history_data_train,
    predict_data_train,
    batch_size=10,
    epochs=100,
    validation_data = (history_data_valid, predict_data_valid),
    # callbacks=tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss'),
    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x15b8394a6a0>

In [16]:
predict_results = model.predict(history_data_test)

In [17]:
predict_results.shape

(432, 2048)

In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

Evalute RMSE:

In [19]:
np.sqrt(mean_squared_error(predict_results.flatten(), predict_data_test.flatten()))

0.02654992471867174

Evaluate MAE:

In [20]:
mean_absolute_error(predict_results.flatten(), predict_data_test.flatten())

0.011686256842014228