# LSTM Model with Community Clustered Data
Reservations are aggregated by grid cells (200m x 200m) and clustered into communities. 
The reservation data includes weather data (temperature and precipitation).

Inspired by: https://github.com/HAKO411/Deep-Learning-for-Short-term-bike-sharing-demand-prediction

In [None]:
# imports
import numpy as np
import pandas as pd
import tensorflow as tf

## Data Preparation
Preparing the data used to train the LSTM models

In [None]:
# variables: input and output file locations
# pickled reservation records used to fit the models
training_data_filepath = '../../pickles/reservations_training.pickle'
# pickled reservation records used as validation data while fitting
validation_data_filepath = '../../pickles/reservations_validation.pickle'

# destination pickle for the results frame (community, its grid cells, trained model)
results_filepath = '../../pickles/results/lstm_cc_weather_results.pickle'

In [None]:
# training variables (shared by every community model)
num_units = 50  # units of each LSTM layer
batch_size = 32  # batch size passed to model.fit
epochs = 20  # number of epochs each model is fitted for
learning_rate = 0.001  # learning rate of the Adam optimizer
loss_function = 'mean_squared_error'  # loss passed to model.compile
seed = 42  # seed handed to the Dropout layers
dropout = 0.1  # dropout rate applied after each LSTM layer

# number of leading feature columns (month, day, hour, temperature, precipitation)
# used as model input; all columns after them are treated as targets
num_input_features = 5

In [None]:
# load the training reservations and index them by their start timestamp
training_input_data = pd.read_pickle(training_data_filepath).set_index('startTime')
training_input_data.index = pd.to_datetime(training_input_data.index)
training_input_data.head()

In [None]:
# load the validation reservations and index them by their start timestamp
validation_input_data = pd.read_pickle(validation_data_filepath).set_index('startTime')
validation_input_data.index = pd.to_datetime(validation_input_data.index)
validation_input_data.head()

In [None]:
# transform data
# hourly demand: one row per hour, one column per small grid cell,
# 0 for cell/hour combinations without a reservation
training_demand_data = (
    training_input_data
    .resample('h')
    .small_grid_id
    .value_counts()
    .unstack()
    .fillna(0)
)
# hourly means of the columns that remain once the grid, location and
# end-time columns are dropped
training_weather_data = training_input_data.resample('h').mean().drop(
    columns=[
        'large_grid_id',
        'small_grid_id',
        'community_small_grid_id',
        'community_voronoi_grid_id',
        'voronoi_grid_id',
        'endLat',
        'endLon',
        'startLat',
        'startLon',
        'endTime',
    ]
)
# keep the hours present in both frames and order them chronologically
training_data = training_weather_data.merge(
    training_demand_data,
    left_index=True,
    right_index=True,
    how='inner',
)
training_data = training_data.set_index(pd.to_datetime(training_data.index)).sort_index()
training_data.head()

In [None]:
# transform data
# hourly demand: one row per hour, one column per small grid cell,
# 0 for cell/hour combinations without a reservation
validation_demand_data = (
    validation_input_data
    .resample('h')
    .small_grid_id
    .value_counts()
    .unstack()
    .fillna(0)
)
# hourly means of the columns that remain once the grid, location and
# end-time columns are dropped
validation_weather_data = validation_input_data.resample('h').mean().drop(
    columns=[
        'large_grid_id',
        'small_grid_id',
        'community_small_grid_id',
        'community_voronoi_grid_id',
        'voronoi_grid_id',
        'endLat',
        'endLon',
        'startLat',
        'startLon',
        'endTime',
    ]
)
# keep the hours present in both frames and order them chronologically
validation_data = validation_weather_data.merge(
    validation_demand_data,
    left_index=True,
    right_index=True,
    how='inner',
)
validation_data = validation_data.set_index(pd.to_datetime(validation_data.index)).sort_index()
validation_data.head()

In [None]:
# decompose timestamp into calendar features (hour of day, day of week, month)
training_data = training_data.assign(
    hour=training_data.index.hour,
    day=training_data.index.dayofweek,
    month=training_data.index.month,
)
# cast every column label to str so that all labels share one type
training_data.columns = training_data.columns.astype(str)
# arrange the columns by label in descending order
training_data = training_data.reindex(columns=sorted(training_data.columns, reverse=True))
training_data.head()

In [None]:
# decompose timestamp into calendar features (hour of day, day of week, month)
validation_data = validation_data.assign(
    hour=validation_data.index.hour,
    day=validation_data.index.dayofweek,
    month=validation_data.index.month,
)
# cast every column label to str so that all labels share one type
validation_data.columns = validation_data.columns.astype(str)
# arrange the columns by label in descending order
validation_data = validation_data.reindex(columns=sorted(validation_data.columns, reverse=True))
validation_data.head()

In [None]:
# full_demand is a second name for training_data (no copy is made)
# index is timestamp (hourly)
# columns are the weather and calendar features plus one column per grid cell
# grid-cell values are the demand e.g. pickup in the timeframe
full_demand = training_data
full_demand.head()

In [None]:
# collect, for every community, the distinct small grid cells that belong to it
cells_by_community = training_input_data.groupby('community_small_grid_id')['small_grid_id']
community_lists = cells_by_community.apply(lambda cell_ids: list(set(cell_ids)))

# plain dictionary {community id: [grid cell ids]} for easier access
community_dict = community_lists.to_dict()
community_dict

In [None]:
# select the model input features together with the demand columns of the given grid cells
def get_value_name(all_cells_demand, cells):
    """Return the feature columns followed by the demand columns of ``cells``.

    Parameters
    ----------
    all_cells_demand : pandas.DataFrame
        Frame containing the columns ``month``, ``day``, ``hour``,
        ``temperature`` and ``precipitation`` plus one column per grid cell.
    cells : iterable
        Column labels of the grid cells to keep. Any iterable is accepted
        (list, tuple, pandas Index, ...), not only a list.

    Returns
    -------
    pandas.DataFrame
        The five feature columns, in the order listed above, followed by the
        requested cell columns in the order given. Rows are returned as they
        are; nothing is filtered or dropped.

    Raises
    ------
    KeyError
        If a feature column or a requested cell column is missing.
    """
    feature_columns = ['month', 'day', 'hour', 'temperature', 'precipitation']
    # list(...) allows non-list iterables, which ``list + cells`` would reject
    return all_cells_demand[feature_columns + list(cells)]

In [None]:
def prepare_data_for_model(demand_at_cell):
    """Split a feature/demand frame into model inputs and targets.

    The first ``num_input_features`` columns are the inputs; every column
    after them is a target.

    Returns a tuple ``(inputs, targets)`` where ``inputs`` has a middle axis
    of length one inserted, i.e. shape ``(rows, 1, input columns)``, and
    ``targets`` has shape ``(rows, target columns)``.
    """
    inputs = demand_at_cell.iloc[:, :num_input_features].values
    targets = demand_at_cell.iloc[:, num_input_features:].values

    n_rows, n_inputs = inputs.shape
    return inputs.reshape((n_rows, 1, n_inputs)), targets

In [None]:
def lstm_model(name, units, training_x, validation_x, training_y, validation_y, num_output_features):
    """Build, compile and fit a stacked LSTM model and return it.

    The network consists of an input of shape ``(1, num_input_features)``,
    five LSTM layers with ``units`` units each (every one followed by a
    Dropout layer) and a Dense output layer with ``num_output_features``
    units. It is compiled with Adam and the module-level ``loss_function``
    and fitted silently on the training arrays, with the validation arrays
    passed as ``validation_data``.

    Parameters
    ----------
    name : str
        Name given to the Sequential model.
    units : int
        Number of units of every LSTM layer.
    training_x, training_y
        Inputs and targets used for fitting.
    validation_x, validation_y
        Inputs and targets passed to ``fit`` as validation data.
    num_output_features : int
        Number of units of the final Dense layer.

    Returns
    -------
    The fitted model.
    """
    layers = tf.keras.layers

    model = tf.keras.models.Sequential(name=name)
    model.add(layers.Input(shape=(1, num_input_features)))

    # the first four LSTM layers return the full sequence, the last one does not
    for keep_sequence in (True, True, True, True, False):
        model.add(layers.LSTM(units, return_sequences=keep_sequence))
        model.add(layers.Dropout(dropout, seed=seed))

    model.add(layers.Dense(num_output_features))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss_function)

    model.fit(
        training_x,
        training_y,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(validation_x, validation_y),
        verbose=0,
    )
    return model

In [None]:
# Input data
all_grid_cells_temp = full_demand
val_data_temp = validation_data.copy()

# one entry per trained community, appended in training order
community_clusters = []
cluster_grid_cells = []
models = []

# Train one model per community.
# The dictionary is iterated directly instead of being indexed with
# range(len(community_dict)): indexing by position only works when the
# community ids are exactly 0..n-1 and raises a KeyError otherwise.
for community_id, grid_cells in community_dict.items():
    # the column labels of the demand frames are strings, so the cell ids
    # are converted before they are used for column selection
    cell_columns = [str(cell) for cell in grid_cells]

    # preprocessing
    grid_values = get_value_name(all_grid_cells_temp, cell_columns)
    val_grid_values = get_value_name(val_data_temp, cell_columns)

    train_x, train_y = prepare_data_for_model(grid_values)
    val_x, val_y = prepare_data_for_model(val_grid_values)

    model_name = f'LSTM_CC_weather_{community_id}'
    print(f'Training model {model_name}')

    # LSTM modelling: one output per grid cell of the community
    current_model = lstm_model(model_name, num_units, train_x, val_x, train_y, val_y, len(grid_cells))

    # save result
    community_clusters.append(community_id)
    models.append(current_model)
    cluster_grid_cells.append(grid_cells)

# one row per community: its id, its grid cells and the fitted model
results = pd.DataFrame({'community_clusters': community_clusters, 'cluster_grid_cells': cluster_grid_cells, 'model': models})

results.head()

In [None]:
# save results: pickle the results frame (community id, grid cells, fitted model) to results_filepath
# NOTE(review): this pickles the model objects held in the 'model' column — confirm the
# installed Keras version supports pickling models
results.to_pickle(results_filepath)