In [None]:
# wrangling
import numpy as np
import pandas as pd
import feather
pd.options.display.max_columns = 200

# preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit

# model
from keras.optimizers import Adam
from keras.models import Sequential  
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping
import tensorflow as tf
from keras import backend as K

# metrics
from sklearn.metrics import accuracy_score, log_loss

# others
import os
import random as rn
from datetime import datetime
from tqdm import tqdm_notebook as tqdm
from copy import deepcopy
import warnings
warnings.filterwarnings("ignore")

# load data

In [None]:
def load_rnn_data(path, window, predict_ts, isdim3=True, geo_col=["geoid10_tract"], y_cols=["crime"]):
    """
    Build sliding-window input/target arrays for an RNN from a feather feature table.

    The table must contain a "datetime" column plus the columns named in geo_col
    and y_cols; every remaining column is used as an input feature.  One sample
    stacks `window` consecutive rows of every geo unit and is paired with the
    target row that lies `predict_ts` steps after the end of that window.

    :param path: path of the feather file to read
    :param window: number of consecutive timesteps in one input sample
    :param predict_ts: how many timesteps after the window the target lies (1 = next step)
    :param isdim3: if True, merge the geo and feature axes (x becomes 3-D, y becomes 2-D)
    :param geo_col: ["geoid10_tract"] or ["geoid10_block"]
    :param y_cols: ["crime"] or ["incident_type_0", "incident_type_1", "incident_type_2"]
    :return: x_all, y_all, geo_ids, y_datetime
        x_all: (n_samples, window, n_geo, n_features), or
               (n_samples, window, n_geo * n_features) if isdim3;
               the target columns come first on the feature axis
        y_all: (n_samples, n_geo, n_targets), or (n_samples, n_geo * n_targets) if isdim3
        geo_ids: group keys, in the order used along the geo axis
        y_datetime: datetime of each row of y_all
    :raises ValueError: if the geo units do not all have the same number of rows
    """
    # accept a bare column name as well as a list, and work on copies of the defaults
    geo_col = [geo_col] if isinstance(geo_col, str) else list(geo_col)
    y_cols = [y_cols] if isinstance(y_cols, str) else list(y_cols)

    # load data, ordered by time and then by the geo column(s) actually requested
    df = feather.read_dataframe(path)
    df = df.sort_values(by=["datetime"] + geo_col).set_index("datetime")

    # input columns: everything that is neither a target nor a geo id
    x_cols = list(df.drop(y_cols + geo_col, axis=1).columns)
    feature_cols = y_cols + x_cols  # targets first: past targets are model inputs too

    # group by geoid
    geo_grs = df.groupby(by=geo_col)
    n_geo = len(geo_grs)

    # the arrays below are allocated uninitialised and filled group by group,
    # which is only complete (and in bounds) when all groups are equally long
    group_sizes = geo_grs.size()
    if group_sizes.nunique() > 1:
        raise ValueError(
            "every geo unit needs the same number of rows, got between {0} and {1}".format(
                group_sizes.min(), group_sizes.max()))

    # arrays to store x and y
    # x: (no of samples, window size, no of geo units, no of features)
    n_timesteps = len(df) // n_geo - window - predict_ts + 1
    x_all = np.empty(shape=(n_timesteps, window, n_geo, len(feature_cols)))

    # y: (no of samples, no of geo units, no of outputs)
    y_all = np.empty(shape=(n_timesteps, n_geo, len(y_cols)))

    # to store geo_ids and y_all's datetime
    geo_ids = []
    y_datetime = df.index.unique()[window + predict_ts - 1:]

    for i, (geo_id, gr) in enumerate(tqdm(geo_grs)):
        # grouping by a one-element list yields 1-tuples on newer pandas; unwrap them
        if isinstance(geo_id, tuple) and len(geo_id) == 1:
            geo_id = geo_id[0]
        geo_ids.append(geo_id)

        x_values = gr[feature_cols].values
        y_values = gr[y_cols].values

        for j in range(window, len(gr) - predict_ts + 1):
            # rows [j - window, j) are the input, row j + predict_ts - 1 is the target
            x_all[j - window, :, i, :] = x_values[j - window:j, :]
            y_all[j - window, i, :] = y_values[j + predict_ts - 1, :]

    if isdim3:
        x_all = np.reshape(x_all,
                           newshape=(x_all.shape[0], x_all.shape[1], x_all.shape[2] * x_all.shape[3]))
        y_all = np.reshape(y_all,
                           newshape=(y_all.shape[0], y_all.shape[1] * y_all.shape[2]))

    return x_all, y_all, geo_ids, y_datetime

In [None]:
# run configuration
predict_ts = 1  # forecast horizon in timesteps (1 = the step right after the window)
window = 12  # number of consecutive timesteps the model sees per sample
path = "./features/features_binary_tract_2H.feather"  # feature table in feather format

In [None]:
# build the model inputs (x) and targets (y) from the feature file
x_all, y_all, geo_ids, y_datetime = load_rnn_data(
    path, window, predict_ts,
    isdim3=True, geo_col=["geoid10_tract"], y_cols=["crime"],
)

In [None]:
# shapes of x and y, then the number of geo ids and target datetimes
# (the latter two are needed later to label the model output)
print(x_all.shape, y_all.shape, len(geo_ids), len(y_datetime), sep="\n")

# preprocessing

In [None]:
# # scaling
# scaler = MinMaxScaler()
# x_all = scaler.fit_transform(x_all)

# modeling

In [None]:
def time_series_cv(x_all, y_all, n_splits=5, model=None, fit_params=None, baseline=False,
                   threshold=0.5):
    """
    Score a model (or an all-zero baseline) with time series cross validation.

    For every split produced by TimeSeriesSplit the model is fitted on the train
    rows and evaluated on both the train and the test rows.

    :param x_all: input samples, ordered in time along axis 0
    :param y_all: binary targets aligned with x_all along axis 0
    :param n_splits: number of splits passed to TimeSeriesSplit
    :param model: object with fit(x, y, **fit_params) and predict(x);
                  required unless baseline is True
    :param fit_params: keyword arguments forwarded to model.fit (default: none)
    :param baseline: True or False (default: False); if True, an all-zero
                     prediction is scored instead of a model
    :param threshold: a predicted probability above this value counts as class 1
    :return: train_scores_df and test_scores_df (one row per split, columns
             "acc" and "log_loss") and y_test_probs, a float array shaped like
             y_all holding the predicted probability for every row that was
             part of a test fold and 0 everywhere else
    :raises ValueError: if baseline is False and no model is given
    """
    if not baseline and model is None:
        raise ValueError("model must be given unless baseline=True")
    fit_params = {} if fit_params is None else fit_params

    # one list of per-split scores for each metric
    train_scores = {"acc": [], "log_loss": []}
    test_scores = {"acc": [], "log_loss": []}

    # test predictions; float so probabilities survive even if y_all is integer typed
    y_test_probs = np.zeros_like(y_all, dtype=float)

    # time series split
    tss = TimeSeriesSplit(n_splits=n_splits)

    for split, (train_idx, test_idx) in enumerate(tss.split(x_all, y_all)):

        print("---------- split {0} ----------".format(split))
        print("[{0:%H:%M:%S}] train_index:{1}~{2} test_index:{3}~{4}".format(
            datetime.now(), train_idx[0], train_idx[-1], test_idx[0], test_idx[-1]))

        # the indices of a fold are contiguous, so slices (views, no copy) are enough;
        # "+ 1" because the last index of a fold belongs to that fold
        train_rows = slice(train_idx[0], train_idx[-1] + 1)
        test_rows = slice(test_idx[0], test_idx[-1] + 1)

        # create train and test set
        x_train, y_train = x_all[train_rows], y_all[train_rows]
        x_test, y_test = x_all[test_rows], y_all[test_rows]

        if baseline:
            # return 0 for all predicted probabilities
            y_train_prob = np.zeros_like(y_train)
            y_test_prob = np.zeros_like(y_test)

            # return 0 for all binary predictions
            y_train_pred = np.zeros_like(y_train)
            y_test_pred = np.zeros_like(y_test)

        else:
            # train
            model.fit(x_train, y_train, **fit_params)

            # predict
            y_train_prob = np.asarray(model.predict(x_train))
            y_test_prob = np.asarray(model.predict(x_test))

            # convert from probability to binary class
            y_train_pred = np.where(y_train_prob > threshold, 1.0, 0.0)
            y_test_pred = np.where(y_test_prob > threshold, 1.0, 0.0)

        # store test prediction
        y_test_probs[test_rows] = y_test_prob

        # calculate metrics over all outputs at once
        train_log_loss = log_loss(y_train.flatten(), y_train_prob.flatten())
        test_log_loss = log_loss(y_test.flatten(), y_test_prob.flatten())
        train_acc = accuracy_score(y_train.flatten(), y_train_pred.flatten())
        test_acc = accuracy_score(y_test.flatten(), y_test_pred.flatten())

        # store scores
        train_scores["log_loss"].append(train_log_loss)
        test_scores["log_loss"].append(test_log_loss)
        train_scores["acc"].append(train_acc)
        test_scores["acc"].append(test_acc)

        print("[{0:%H:%M:%S}] train_log_loss:{1} test_log_loss:{2}".format(
            datetime.now(), train_log_loss, test_log_loss))
        print("[{0:%H:%M:%S}] train_acc:{1} test_acc:{2}\n".format(
            datetime.now(), train_acc, test_acc))

    # convert to dataframe once all splits are done
    train_scores_df = pd.DataFrame(train_scores)
    test_scores_df = pd.DataFrame(test_scores)

    return train_scores_df, test_scores_df, y_test_probs

In [None]:
class MyLSTM:
    """
    Single-layer LSTM classifier exposing fit/predict, so that it can be
    handed to time_series_cv as the model argument.
    """
    def __init__(self, units=100,
                 dropout_rate=0.2, activation="sigmoid",
                 optimizer="adam", loss="binary_crossentropy"):
        """
        Remember the hyper-parameters; the network itself is built in fit().
        """
        self.units, self.dropout_rate = units, dropout_rate
        self.activation = activation
        self.optimizer, self.loss = optimizer, loss

    def fit(self, x_train, y_train, epochs=200, early_stopping_patience=5,
            batch_size=128, validation_split=0.1, train_shuffle=False):
        """
        Build a fresh LSTM -> Dropout -> Dense network sized from the training
        arrays and train it with early stopping on the validation loss.
        """
        net = self.model = Sequential()

        # input is (timesteps, features) per sample, output has one unit per target column
        timesteps, n_inputs = x_train.shape[1], x_train.shape[2]
        n_outputs = y_train.shape[1]

        net.add(LSTM(units=self.units, input_shape=(timesteps, n_inputs)))
        net.add(Dropout(self.dropout_rate))
        net.add(Dense(units=n_outputs, activation=self.activation))
        net.compile(optimizer=self.optimizer, loss=self.loss, metrics=["acc"])

        stopper = EarlyStopping(monitor="val_loss", patience=early_stopping_patience)
        net.fit(
            x_train,
            y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            shuffle=train_shuffle,
            callbacks=[stopper],
        )

    def predict(self, x_test):
        """
        Return the fitted network's output for x_test.
        """
        return self.model.predict(x_test)

In [None]:
# hyper-parameters handed to MyLSTM(...) when the model is defined
def_params = dict(
    units=100,
    dropout_rate=0.159961594635,
    activation="sigmoid",
    optimizer=Adam(lr=1.77547438185e-05),
    loss="binary_crossentropy",
)

# keyword arguments handed to MyLSTM.fit(...) when the model is trained
fit_params = dict(
    epochs=500,
    early_stopping_patience=5,
    batch_size=128,
    validation_split=0.1,
    train_shuffle=False,
)

# how many splits the time series cross validation uses
n_splits = 5

In [None]:
# set seed
# Seed NumPy, Python's `random` and TensorFlow before the model is built and trained.
np.random.seed(0)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably does not change hashing in the already running kernel — confirm.
os.environ["PYTHONHASHSEED"] = "0"
rn.seed(0)
# Restrict TensorFlow to one thread per op and one op at a time
# (presumably to avoid non-deterministic parallel execution — TODO confirm).
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)
tf.set_random_seed(0)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
# make Keras use the session configured above
K.set_session(sess)

# define model
model = MyLSTM(**def_params)

# time series cross validation
# returns per-split train/test scores and the predicted probabilities of the test folds
train_scores_df, test_scores_df, y_test_probs = time_series_cv(x_all,
                                                               y_all,
                                                               n_splits=n_splits,
                                                               model=model,
                                                               fit_params=fit_params,
                                                               baseline=False)

# Result

In [None]:
# train metrics (one row per cross validation split)
os.makedirs("./results", exist_ok=True)  # to_csv does not create a missing directory
train_scores_df.to_csv("./results/train_scores.csv", index=False)
train_scores_df

In [None]:
# test metrics (one row per cross validation split)
os.makedirs("./results", exist_ok=True)  # to_csv does not create a missing directory
test_scores_df.to_csv("./results/test_scores.csv", index=False)
test_scores_df

In [None]:
# predicted probability of crime occurrence: one row per target datetime, one column per geo id
y_probs_df = pd.DataFrame(y_test_probs, index=y_datetime, columns=geo_ids)
# rows that were never part of a test fold are still all zero in y_test_probs; drop them
y_probs_df = y_probs_df.loc[(y_probs_df != 0).any(axis=1), :]
os.makedirs("./results", exist_ok=True)  # to_csv does not create a missing directory
y_probs_df.to_csv("./results/y_probs.csv", index=True)
y_probs_df.head()