In [27]:
import pandas as pd
import numpy as np
from sodapy import Socrata
import collections
import re
from time import time
import math
import feather
from collections import defaultdict
import geopandas
from datetime import datetime
import os
import glob
from tqdm import tqdm_notebook as tqdm
import optuna
import shutil
import tempfile
import tensorflow as tf

#Using keras
import keras
from keras.models import Sequential  
from keras.layers import Dense, LSTM, Dropout, TimeDistributed
from keras.callbacks import EarlyStopping

# others
from tqdm import tqdm

#from scripts.utils import load_rnn_data
pd.options.display.max_columns = 200
from scipy import stats
from collections import Counter

# validation
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score,mean_squared_error, log_loss
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from shapely.geometry import Point

from copy import deepcopy
# others
import warnings
warnings.filterwarnings("ignore")

In [3]:
def load_rnn_data(path, window, predict_ts, geo_col=["geoid10_tract"], y_cols=["crime"]):
    """
    y_cols: ["crime"] or ["incident_type_0", "incident_type_1", "incident_type_2"]
    geo_col: ["geoid10_tract"] or ["geoid10_block"]
    return y_all and x_all of given path
    """
    # load data
    df = feather.read_dataframe(path)
    df.sort_values(by=["datetime", "geoid10_tract"], inplace=True)
    df.set_index("datetime", inplace=True)

    # input columns
    x_cols = list(df.drop(y_cols + geo_col, axis=1).columns)

    # group by geoid
    geo_grs = df.groupby(by=geo_col)

    # arrayes to store x and y
    # (window size, input size, no of tracts, no of timesteps)
    n_timesteps = int(len(df) / len(geo_grs)) - window - predict_ts + 1
    x_all = np.empty(shape=(window, len(x_cols + y_cols), len(geo_grs), n_timesteps))

    # (output size, no of tracts, no of timesteps)
    y_all = np.empty(shape=(len(y_cols), len(geo_grs), n_timesteps))

    # to store geo_ids and y_all's datetime
    geo_ids = []

    y_datetime = df.index.unique()[window + predict_ts - 1:]

    for i, (geo_id, gr) in enumerate(tqdm(geo_grs)):
        geo_ids.append(geo_id)
        x_values = gr[y_cols + x_cols].values
        y_values = gr[y_cols].values

        for j in range(window, len(gr) - predict_ts + 1):
            # generate x_all
            x_all[:, :, i, j - window] = x_values[j - window:j, :]
            y_all[:, i, j - window] = y_values[j + predict_ts - 1, :]

    return x_all, y_all, geo_ids, y_datetime

In [4]:
# set configuration
path = "./features/features_binary_tract_2H.feather"
window = 12
predict_ts = 1

In [5]:
# load data as x and y of RNN
x_all, y_all, geo_ids, y_datetime  = load_rnn_data(path, window, predict_ts)

100%|██████████| 195/195 [00:21<00:00,  8.07it/s]


In [6]:
print(x_all.shape)
print(y_all.shape)
print(len(geo_ids))
print(len(y_datetime))

(12, 8, 195, 18600)
(1, 195, 18600)
195
18600


In [7]:
def time_series_cv(x_all, y_all, n_splits = 5, model=None, fit_params=None, baseline=False):
    """
    :param baseline: True or False (defualt: False)
    :return: train and test scores and prediction of y on test data
    """

    # prepare dictionary to store scores
    train_scores = {}
    metrics = ["acc", "log_loss"]
    for metric in metrics:
        train_scores[metric] = []
    test_scores = deepcopy(train_scores)

    # prepare dictionary to store predictions
    y_test_preds = np.empty_like(y_all)

    # time series split
    tss = TimeSeriesSplit(n_splits=n_splits)

    for split, (train_idx, test_idx) in enumerate(tss.split(x_all, y_all)):

        print("---------- split {0} ----------".format(split))
        print("train_index:{0}~{1} test_index:{2}~{3}".format(train_idx[0], train_idx[-1], test_idx[0], test_idx[-1]))

        # create train and test set
        x_train = x_all[:train_idx[-1]]
        y_train = y_all[:train_idx[-1]]
        x_test = x_all[test_idx[0]:test_idx[-1]]
        y_test = y_all[test_idx[0]:test_idx[-1]]

        if baseline:
            # return 0 for all predicted probabiliby
            y_train_prob = np.zeros_like(y_train)
            y_test_prob = np.zeros_like(y_test)

            # return 0 for all binary predictions
            y_train_pred = np.zeros_like(y_train)
            y_test_pred = np.zeros_like(y_test)

        else:
            # train
            model.fit(x_train, y_train, **fit_params)

            # predict
            y_train_prob = model.predict(x_train)
            y_test_prob = model.predict(x_test)

            # convert form probability to binary
            y_train_pred = np.fix(y_train_prob)
            y_test_pred = np.fix(y_test_prob)

        # store test prediction
        y_test_preds[test_idx[0]:test_idx[-1]] = y_test_pred

        # calculate metrics
        train_log_loss = log_loss(y_train.flatten(), y_train_prob.flatten())
        test_log_loss = log_loss(y_test.flatten(), y_test_prob.flatten())
        train_acc = accuracy_score(y_train.flatten(), y_train_pred.flatten())
        test_acc = accuracy_score(y_test.flatten(), y_test_pred.flatten())

        # store scores
        train_scores["log_loss"].append(train_log_loss)
        test_scores["log_loss"].append(test_log_loss)
        train_scores["acc"].append(train_acc)
        test_scores["acc"].append(test_acc)

        print("train_log_loss:{} test_log_loss:{}".format(train_log_loss, test_log_loss))
        print("train_acc:{} test_acc:{}\n".format(train_acc, test_acc))

        # convert to dataframe
        train_scores_df = pd.DataFrame(train_scores)
        test_scores_df = pd.DataFrame(test_scores)

    return train_scores_df, test_scores_df, y_test_preds

In [8]:
# transpose input shape to apply TimeSeriesSplit in time_series_cv
# TODO: fix output shape of load_rnn_data
if x_all.shape[0] != y_all.shape[0]:
    x_all = x_all.T
    y_all = y_all.T

In [9]:
x_all_tune = x_all[:500]
y_all_tune = y_all[:500]

In [33]:
# Model
# Param
dropout_rate = 0.2
optimizers = ['softmax', 'adam']
hidden_neurons = 50
epoch_param = 200
validation_split_param = 0.2
batch_size_param = 128    

print(x_all_tune.shape)


model = Sequential()
model.add(TimeDistributed(Dense(8), 
                          input_shape=(x_all_tune.shape[1], x_all_tune.shape[2], x_all_tune.shape[3]))
model.add(LSTM(units=hidden_neurons,
               input_shape=(x_all_tune.shape[1], x_all_tune.shape[2], x_all_tune.shape[3]),
               return_sequences=True))
model.add(Dropout(dropout_rate))
model.add(Dense(units=195, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer=optimizers, metrics=["acc"])

early_stop = EarlyStopping(monitor='loss',patience=10)

(500, 195, 8, 12)


ValueError: Input 0 is incompatible with layer lstm_32: expected ndim=3, found ndim=4

In [28]:
model = Sequential()
model.add(TimeDistributed(Dense(8), 
                          input_shape=(x_all_tune.shape[1], x_all_tune.shape[2], x_all_tune.shape[3])))
          
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_1 (TimeDist (None, 195, 8, 8)         104       
Total params: 104
Trainable params: 104
Non-trainable params: 0
_________________________________________________________________


In [29]:
## Set Up Optuna

def objective(trial):
    
    # Param
    dropout_rate = 0.2
    optimizers = ['softmax', 'adam']
    hidden_neurons = 50
    epoch_param = 200
    validation_split_param = 0.2
    batch_size_param = 128
    
    # Model
    model = Sequential()
    model.add(TimeDistributed(Dense(8), 
                          input_shape=(x_all_tune.shape[1], x_all_tune.shape[2], x_all_tune.shape[3])))
    model.add(LSTM(units=hidden_neurons))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=195, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer=optimizers, metrics=["acc"])
    
    early_stop = EarlyStopping(monitor='loss',patience=10)
    
    fit_params_def = {'batch_size': batch_size_param,
                     'epochs': epoch_param,
                      'verbose':1, 
                      'callbacks': [early_stop], 
                      'validation_split': validation_split_param}
    
    
    # Data
    # evaluate log loss and accuracy based on time series cross validation
    train_scores_df, test_scores_df, y_test_preds = time_series_cv(x_all_tune,
                                                               y_all_tune,
                                                               n_splits=5,
                                                               model=model,
                                                               fit_params=fit_params_def,
                                                               baseline=False)
    train_score_mean = train_scores_df.mean()['acc']
    test_scores_mean = test_scores_df.mean()['acc']

    return 1.0 - test_scores_mean

In [12]:
study = optuna.create_study()
study.optimize(objective, n_trials=20)

print(study.best_params)

[W 2019-05-02 15:01:13,535] Setting trial status as TrialState.FAIL because of the following error: ValueError('Input 0 is incompatible with layer lstm_1: expected ndim=3, found ndim=4',)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-b6bb4a3a8560>", line 16, in objective
    input_shape=(x_all_tune.shape[1], x_all_tune.shape[2], x_all_tune.shape[3]), ))
  File "//anaconda/lib/python2.7/site-packages/keras/engine/sequential.py", line 165, in add
    layer(x)
  File "//anaconda/lib/python2.7/site-packages/keras/layers/recurrent.py", line 532, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/keras/engine/base_layer.py", line 414, in __call__
    self.assert_input_compatibility(inputs)
  File "//anaconda/lib/python2.7/site-packages/keras/engine/base_layer.py", line 311, in assert_input_compatibility

ValueError: Input 0 is incompatible with layer lstm_8: expected ndim=3, found ndim=4
[W 2019-05-02 15:01:13,636] Setting trial status as TrialState.FAIL because of the following error: ValueError('Input 0 is incompatible with layer lstm_9: expected ndim=3, found ndim=4',)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-b6bb4a3a8560>", line 16, in objective
    input_shape=(x_all_tune.shape[1], x_all_tune.shape[2], x_all_tune.shape[3]), ))
  File "//anaconda/lib/python2.7/site-packages/keras/engine/sequential.py", line 165, in add
    layer(x)
  File "//anaconda/lib/python2.7/site-packages/keras/layers/recurrent.py", line 532, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/keras/engine/base_layer.py", line 414, in __call__
    self.assert_input_compatibility(inputs)
  File "//anaconda/lib/python2

ValueError: Input 0 is incompatible with layer lstm_16: expected ndim=3, found ndim=4
[W 2019-05-02 15:01:13,728] Setting trial status as TrialState.FAIL because of the following error: ValueError('Input 0 is incompatible with layer lstm_17: expected ndim=3, found ndim=4',)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-b6bb4a3a8560>", line 16, in objective
    input_shape=(x_all_tune.shape[1], x_all_tune.shape[2], x_all_tune.shape[3]), ))
  File "//anaconda/lib/python2.7/site-packages/keras/engine/sequential.py", line 165, in add
    layer(x)
  File "//anaconda/lib/python2.7/site-packages/keras/layers/recurrent.py", line 532, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/keras/engine/base_layer.py", line 414, in __call__
    self.assert_input_compatibility(inputs)
  File "//anaconda/lib/pytho

ValueError: No trials are completed yet.