In [1]:
import pandas as pd
import numpy as np
import re
from time import time
import math
import feather
from datetime import datetime
import os
import glob
from tqdm import tqdm_notebook as tqdm
import optuna
from optuna.integration import KerasPruningCallback
import tempfile
import tensorflow as tf
import random as rn

#Using keras
import keras
from keras import backend as K
from keras.models import Sequential  
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping

#from scripts.utils import load_rnn_data
pd.options.display.max_columns = 200

# validation
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score,mean_squared_error, log_loss
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from copy import deepcopy

# others
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


## Load Data

In [2]:
def load_rnn_data(path, window, predict_ts, isdim3=True, geo_col=["geoid10_tract"], y_cols=["crime"]):
    """
    y_cols: ["crime"] or ["incident_type_0", "incident_type_1", "incident_type_2"]
    geo_col: ["geoid10_tract"] or ["geoid10_block"]
    return y_all and x_all of given path
    """
    # load data
    df = feather.read_dataframe(path)
    df.sort_values(by=["datetime", "geoid10_tract"], inplace=True)
    df.set_index("datetime", inplace=True)

    # input columns
    x_cols = list(df.drop(y_cols + geo_col, axis=1).columns)

    # group by geoid
    geo_grs = df.groupby(by=geo_col)

    # arrayes to store x and y
    # (no of timesteps, window size, no of tracts,  no of features, )
    n_timesteps = int(len(df) / len(geo_grs)) - window - predict_ts + 1
    x_all = np.empty(shape=(n_timesteps, window, len(geo_grs), len(x_cols + y_cols)))

    # (output size, no of tracts, no of outputs)
    y_all = np.empty(shape=(n_timesteps, len(geo_grs), len(y_cols)))

    # to store geo_ids and y_all's datetime
    geo_ids = []

    y_datetime = df.index.unique()[window + predict_ts - 1:]

    for i, (geo_id, gr) in enumerate(tqdm(geo_grs)):
        geo_ids.append(geo_id)
        x_values = gr[y_cols + x_cols].values
        y_values = gr[y_cols].values

        for j in range(window, len(gr) - predict_ts + 1):
            # generate x_all
            x_all[j - window, :, i, :] = x_values[j - window:j, :]
            y_all[j - window, i, :] = y_values[j + predict_ts - 1, :]

    if isdim3:
        x_all = np.reshape(x_all,
                           newshape=(x_all.shape[0], x_all.shape[1], x_all.shape[2] * x_all.shape[3]))
        y_all = np.reshape(y_all,
                           newshape=(y_all.shape[0], y_all.shape[1] * y_all.shape[2]))

    return x_all, y_all, geo_ids, y_datetime

In [3]:
# set configuration
path = "./features/features_binary_tract_2H.feather"
window = 12
predict_ts = 1  # how many timesteps future does the model predict? 

In [4]:
# load data as x and y of RNN
x_all, y_all, geo_ids, y_datetime = load_rnn_data(path=path,
                                                  window=window,
                                                  predict_ts=predict_ts,
                                                  isdim3=True,
                                                  geo_col=["geoid10_tract"],
                                                  y_cols=["crime"])

HBox(children=(IntProgress(value=0, max=195), HTML(value=u'')))




In [5]:
print(x_all.shape)
print(y_all.shape)
print(len(geo_ids))  # to convert model output later
print(len(y_datetime))  # to convert model output later

(18600, 12, 1560)
(18600, 195)
195
18600


## Modeling
see https://github.com/pfnet/optuna/blob/master/examples/pruning/keras_integration.py

In [6]:
BATCHSIZE = 128
EPOCHS = 20

In [7]:
# set seed
np.random.seed(0)
os.environ["PYTHONHASHSEED"] = "0"
rn.seed(0)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)
tf.set_random_seed(0)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

In [8]:
def create_model(trial):
    # We optimize the number of layers, hidden units and dropout in each layer and
    # the learning rate of RMSProp optimizer.
    
    
    # We define our MLP.
    n_layers = trial.suggest_int('n_layers', 1, 3)
    model = Sequential()
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
    #Hitting the first error above - how do we get the time series code to feed an example of 
    for i in range(n_layers):
        num_hidden = int(trial.suggest_loguniform('n_units_l{}'.format(i), 4, BATCHSIZE))
        model.add(Dense(num_hidden, activation='sigmoid'))
        dropout = trial.suggest_uniform('dropout_l{}'.format(i), 0.2, 0.5)
        model.add(Dropout(rate=dropout))
        model.add(Dense(units=y_train.shape[1], activation='softmax'))

    # We compile our model with a sampled learning rate.
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.RMSprop(lr=lr),
                  metrics=['accuracy'])

    return model

In [9]:
## Need to define this to produce a training and test sample
def splitting(data):
    # create train and test set
    split = int(len(data) * 0.8)    
    x_train = data[:split]
    y_train = data[:split]
    x_test = data[split:]
    y_test = data[split:]
    return x_train, y_train, x_test, y_test

In [10]:
#split = int(len(x_all) * 0.8)    
#x_train = x_all[:split]
#y_train = x_all[:split]
#x_test = x_all[split:]
#y_test = x_all[split:]

In [11]:
def objective(trial):
    # Clear clutter form previous session graphs.
    keras.backend.clear_session()
    
    # time series split
    x_train, y_train, x_test, y_test = splitting(x_all) 
    # Generate our trial model.
    model = create_model(trial)

    # Fit the model on the training data.
    # The KerasPruningCallback checks for pruning condition every epoch.
    model.fit(x_train,
              y_train,
              batch_size=BATCHSIZE,
              callbacks=[KerasPruningCallback(trial, 'val_acc')],
              epochs=EPOCHS,
              validation_split=0.2,
              verbose=1)
    # Evaluate the model accuracy on the test set.
    score = model.evaluate(x_test, y_test, verbose=0)
    return score[1]


In [12]:
study = optuna.create_study(pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=100)
pruned_trials = [t for t in study.trials if t.state == optuna.structs.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.structs.TrialState.COMPLETE]
print('Study statistics: ')
print('  Number of finished trials: ', len(study.trials))
print('  Number of pruned trials: ', len(pruned_trials))
print('  Number of complete trials: ', len(complete_trials))

print('Best trial:')
trial = study.best_trial

print('  Value: ', trial.value)

print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[W 2019-05-04 17:49:13,433] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython-input-8-cfac38deb271>", line 9, in create_model
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:13,450] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython

NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:13,701] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython-input-8-cfac38deb271>", line 9, in create_model
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:13,718] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective


NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:13,984] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython-input-8-cfac38deb271>", line 9, in create_model
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:14,012] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective


NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:14,247] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython-input-8-cfac38deb271>", line 9, in create_model
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:14,265] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective


NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:14,591] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython-input-8-cfac38deb271>", line 9, in create_model
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:14,611] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective


NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:14,966] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython-input-8-cfac38deb271>", line 9, in create_model
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:14,995] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective


NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:15,323] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython-input-8-cfac38deb271>", line 9, in create_model
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:15,362] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective


NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:15,734] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective
    model = create_model(trial)
  File "<ipython-input-8-cfac38deb271>", line 9, in create_model
    model.add(LSTM(units=100, input_shape=(x_train.shape[1], x_train.shape[2])))
NameError: global name 'x_train' is not defined
[W 2019-05-04 17:49:15,760] Setting trial status as TrialState.FAIL because of the following error: NameError("global name 'x_train' is not defined",)
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/optuna/study.py", line 407, in _run_trial
    result = func(trial)
  File "<ipython-input-11-e2981d556b94>", line 8, in objective


Study statistics: 
('  Number of finished trials: ', 100)
('  Number of pruned trials: ', 0)
('  Number of complete trials: ', 0)
Best trial:


ValueError: No trials are completed yet.