In [1]:
!pip install --upgrade pip
!pip install python-decouple
!pip install geoalchemy2
!pip install shapely
!pip install scipy
!pip install hyperas

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (19.1.1)


In [2]:
from sqlalchemy import create_engine, func, text
from sqlalchemy.orm import sessionmaker
from decouple import config
from shapely import wkb, wkt
from shapely.geometry import Point
from geoalchemy2.shape import to_shape 

import pandas as pd
import numpy as np
import random
import json
from datetime import datetime, timedelta
import re
from matplotlib import pyplot as plt

from sklearn.preprocessing import RobustScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import GridSearchCV

from hyperas.distributions import uniform



# ----------- TODO: Issues with importing imbalance-learn library
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import NearMiss
# from imblearn.under_sampling import (RandomUnderSampler,
#                                      ClusterCentroids,
#                                      TomekLinks,
#                                      NeighbourhoodCleaningRule,
#                                      NearMiss)

Using TensorFlow backend.


In [3]:
"""Contains models for DB."""

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, BigInteger, Integer, String, DateTime, ForeignKey, Float
from sqlalchemy.orm import relationship
from geoalchemy2 import Geometry


BASE = declarative_base()


class City(BASE):
    """City model for DB. Has information of cities."""
    __tablename__ = 'city'
    id            = Column(BigInteger, primary_key=True)
    city          = Column(String, unique=False, nullable=False)
    state         = Column(String, unique=False, nullable=True)
    country       = Column(String, unique=False, nullable=False)
    location      = Column(Geometry(geometry_type='POINT'), nullable=False)
    blocks        = relationship("Blocks", back_populates="city")
    zipcodes      = relationship("ZipcodeGeom", back_populates="city")
    incidents     = relationship("Incident", back_populates="city")


class Blocks(BASE):
    """Block model for DB. Has information of city blocks for a related city
        id."""
    __tablename__ = 'block'
    id            = Column(BigInteger, primary_key=True)
    cityid        = Column(BigInteger, ForeignKey('city.id'), nullable=False)
    shape         = Column(Geometry(geometry_type='MULTIPOLYGON'), nullable=False)
    population    = Column(Integer, nullable=False)
    city          = relationship("City", back_populates="blocks")
    incidents     = relationship("Incident", back_populates="block")

class ZipcodeGeom(BASE):
    """Zipcode geometry model for DB. Has information of zipcodes and related
        city id."""
    __tablename__ = 'zipcodegeom'
    id            = Column(BigInteger, primary_key=True)
    cityid        = Column(BigInteger, ForeignKey('city.id'), nullable=False)
    zipcode       = Column(String, nullable=False, unique=True)
    shape         = Column(Geometry(geometry_type='MULTIPOLYGON'), nullable=False)
    city          = relationship("City", back_populates="zipcodes")

class Incident(BASE):
    """Incident model for DB. Has information of a specific crime, including
        where it took place, when it took place, and the type of crime that
        occurred."""
    __tablename__ = 'incident'
    id            = Column(BigInteger, primary_key=True)
    crimetypeid   = Column(BigInteger, ForeignKey('crimetype.id'), nullable=False)
    locdescid     = Column(BigInteger, ForeignKey('locdesctype.id'), nullable=False)
    cityid        = Column(BigInteger, ForeignKey('city.id'), nullable=False)
    blockid       = Column(BigInteger, ForeignKey('block.id'), nullable=False)
    location      = Column(Geometry(geometry_type='POINT'), nullable=False)
    datetime      = Column(DateTime, nullable=False)
    hour          = Column(Integer, nullable=False)
    dow           = Column(Integer, nullable=False)
    month         = Column(Integer, nullable=False)
    year          = Column(Integer, nullable=False)
    city          = relationship("City", back_populates="incidents")
    block         = relationship("Blocks", back_populates="incidents")
    crimetype     = relationship("CrimeType", back_populates="incidents")
    locationdesc  = relationship("LocationDescriptionType", back_populates="incidents")

class CrimeType(BASE):
    """CrimeType model for DB. Has information of the types of crime, including
        a general description and the numerical severity of the crime."""
    __tablename__ = 'crimetype'
    id            = Column(BigInteger, primary_key=True)
    category      = Column(String, unique=True, nullable=False)
    severity      = Column(Integer, nullable=False)
    incidents     = relationship("Incident", back_populates="crimetype")


class LocationDescriptionType(BASE):
    """Location description model for DB. Has information on the type of
        location that the crime took place."""
    __tablename__ = 'locdesctype'
    id            = Column(BigInteger, primary_key=True)
    key1          = Column(String, nullable=False)
    key2          = Column(String, nullable=False)
    key3          = Column(String, nullable=False)
    incidents     = relationship("Incident", back_populates="locationdesc")

In [4]:
class GetData(object):
    def go(self, SESSION, start_year, end_year):
        SQL_QUERY = \
            f'''
                WITH
                    max_severity AS (
                        SELECT MAX(severity) AS severity
                        FROM (
                            SELECT SUM(crimetype.severity)/AVG(block.population) AS severity
                            FROM incident
                            INNER JOIN block ON incident.blockid = block.id INNER JOIN crimetype ON incident.crimetypeid = crimetype.id AND block.population > 0
                            GROUP BY
                                incident.blockid,
                                incident.year,
                                incident.month,
                                incident.dow,
                                incident.hour
                        ) AS categories
                    ),
                    block_incidents AS (
                        SELECT
                            incident.blockid,
                            incident.year,
                            incident.month,
                            incident.dow,
                            incident.hour,
                            SUM(crimetype.severity)/AVG(block.population) AS severity
                        FROM incident
                        INNER JOIN block ON incident.blockid = block.id
                        INNER JOIN crimetype ON incident.crimetypeid = crimetype.id
                            AND block.population > 0
                            AND incident.cityid = 1
                            AND incident.year >= {start_year}
                            AND incident.year <= {end_year}
                        GROUP BY
                            incident.blockid,
                            incident.year,
                            incident.month,
                            incident.dow,
                            incident.hour
                    )
                SELECT
                    block_incidents.blockid,
                    block_incidents.year,
                    block_incidents.month,
                    block_incidents.dow,
                    block_incidents.hour,
                    block_incidents.severity/max_severity.severity AS severity
                FROM block_incidents, max_severity        
            '''
        return SESSION.execute(text(SQL_QUERY)).fetchall()

In [5]:
def process_data(data, start_year, end_year, blockid_dict):

    X = np.zeros((len(blockid_dict), 24, 7*24+1))
    y = np.zeros((len(blockid_dict), 12, 7*24))
    
    # records is the list of rows we get from the query with this order:
    #   blockid, year, month, dow, hour, risk
    #   month is from 1 - 12

    for r in data:
        if r[0] in blockid_dict:
            if r[1] == end_year:
                # index into array  0-based month
                # vvvvvvvvvvvvvvvv    vvvvvv
                y[blockid_dict[r[0]], r[2]-1, 24*r[3]+r[4]] = float(r[5])
                #                             ^^^^^^^^^^^^^   ^^^^
                #                             hours since     risk
                #                             beginning of
                #                             week
            else:
                # month, year = get_month_year(datetime)
                # index into array    year 0.....1   month   
                # vvvvvvvvvvvvvvvv    vvvvvvvvvvvvv  vvvvvv
                X[blockid_dict[r[0]], 12*(r[1]-start_year)+r[2]-1, 24*r[3]+r[4]] = float(r[5])
                #                                                  ^^^^^^^^^^^^^   ^^^^
                #                                                  hours since     risk
                #                                                  beginning of
                #                                                  week
    
    for i in range(24):
        X[:, i, -1] = (start_year*12+i) / (2000 * 12)
        
    #     for i in range(12):
    #         y[:, i, -1] = start_year*12+i
    
    #     for i in range(0, NUM_BLOCKIDS):
    #         X[i, :, -1] = blockid_dict.get(i+1, 0)
    #         y[i, :, -1] = blockid_dict.get(i+1, 0)
    
    return X, y

In [6]:
from contextlib import contextmanager

@contextmanager
def session_scope():
    """Provide a transactional scope around a series of operations."""

    DB_URI  = config('DB_URI')
    ENGINE  = create_engine(DB_URI)
    Session = sessionmaker(bind=ENGINE)
    SESSION = Session()
    
    try:
        yield SESSION
        SESSION.commit()
    except:
        SESSION.rollback()
        raise
    finally:
        SESSION.close()


def ready_data(training_start_year, training_end_year, train_blockid_dict,
               testing_start_year, testing_end_year, test_blockid_dict):
    with session_scope() as session:
        training_data = GetData().go(session,
                                     training_start_year,
                                     training_end_year)
        testing_data = GetData().go(session,
                                     testing_start_year,
                                     testing_end_year)
        
        X_train, y_train = process_data(training_data,
                                        training_start_year, 
                                        training_end_year,
                                        train_blockid_dict)
        X_test, y_test = process_data(testing_data,
                                      testing_start_year, 
                                      testing_end_year,
                                      test_blockid_dict)

    return X_train, X_test, y_train, y_test

In [7]:
%%time

# start month = 3, end_month = 2 (months are 0-indexed)
#   X: 4/2017 -> 3/2019 actual date
#   y: 4/2019 -> 3/2020 actual date
#
X_test_start_month = 0
X_test_end_month   = 0
X_test_start_year  = 2016
X_test_end_year    = 2018

TRAIN_NUM_BLOCKIDS = TEST_NUM_BLOCKIDS = 800

TRAIN_BLOCKIDS = random.sample(list(range(1,802)), k=TRAIN_NUM_BLOCKIDS)   
train_blockid_dict = {}
for ind, blockid in enumerate(TRAIN_BLOCKIDS ):
    train_blockid_dict[blockid] = ind
        
TEST_BLOCKIDS = random.sample(list(range(1,802)), k=TEST_NUM_BLOCKIDS)    
test_blockid_dict = {}
for ind, blockid in enumerate(TEST_BLOCKIDS ):
    test_blockid_dict[blockid] = ind

X_train, X_test, y_train, y_test = ready_data(2015, 2017, train_blockid_dict,
                                              X_test_start_year, X_test_end_year, test_blockid_dict)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

  """)


(800, 24, 169) (800, 12, 168) (800, 24, 169) (800, 12, 168)
CPU times: user 6.89 s, sys: 676 ms, total: 7.56 s
Wall time: 1min 22s


In [8]:
def plot_output(y, y_pred, dataset_type):

    fig = plt.figure(figsize=(10, 8))
    plt.plot(np.arange(len(y.flatten())),
                 y.flatten(), color='blue');
    plt.plot(np.arange(len(y_pred.flatten())),
                 y_pred.flatten(), color='red');
    plt.xlabel('Hour since beginning of data', fontsize=16)
    plt.ylabel('Risk', fontsize=18)
    plt.title(dataset_type + ' dataset', fontsize=18)
    plt.legend(labels=['risk', 'predicted risk'], prop={'size': 20})
    plt.show()

## LSTM Predictions

In [9]:
def plot_training_validation_loss(epochs, history):
    plt.plot(range(1, epochs), history.history['loss'])
    plt.plot(range(1, epochs), history.history['val_loss'])
    plt.title('model train vs validation loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()

In [10]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Reshape, Dropout, LeakyReLU, Bidirectional
from keras.optimizers import Adam, SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam
import keras.backend as K
from keras.wrappers.scikit_learn import KerasRegressor

random.seed(101)
best_model = None
best_mse   = None

def perc_error(y_true, y_pred):
    return 100.0 * K.mean((y_true - y_pred) / y_true)

def create_model(learn_rate=0.01, 
                 momentum=0,
                 opt_name='Adam',
                 init_mode='uniform',
                 activation='leakyrelu'
                ):
    data_dim    = 7*24+1   # All values in each hour of the week
                           # averaged over each day for all weeks
                           # of the month
    timesteps   = 2 * 12   # Summed per month
    batch_size  = 64
    num_outputs = (7*24+1) * 12
    
    if activation == 'leakyrelu':
        activation = LeakyReLU()
    
    # expected input data shape: (batch_size, timesteps, data_dim)
    model = Sequential()

    model.add(Bidirectional(LSTM(2000, input_shape=(24,169), 
                   activation='relu', recurrent_activation='relu', 
                   dropout=0.2), merge_mode='ave'))
    model.add(Dense(12*168, kernel_initializer=init_mode, activation='relu'))
    model.add(Reshape((12,168)))
    
    optimizer = { 
        'SGD':      SGD(lr=learn_rate, momentum=momentum),
        'RMSprop':  RMSprop(lr=learn_rate),
        'Adagrad':  Adagrad(lr=learn_rate),
        'Adadelta': Adadelta(lr=learn_rate),
        'Adam':     Adam(lr=learn_rate),
        'Adamax':   Adamax(lr=learn_rate),
        'Nadam':    Nadam(lr=learn_rate),
    }

    def sqrt_loss(y_pred, y_true):
        return K.sqrt(K.mean(K.square(y_pred - y_true)))
    
    model.compile(loss='mean_squared_error', metrics=[sqrt_loss], optimizer=optimizer[opt_name])

    return model

def my_gridsearch(X_train, y_train, X_test, y_test):
    
    # GridSearchCV's fit method requires X which is 2D, y which is 1D.
    # This is a problem for us since our X and y are 3D.
    # Instead of GridSearchCV, we will create our own loop to
    # search through the grid.
    
    global best_model, best_mse
    
    # 
    #     for epochs in range(5, 6, 5):
    #         for lr in [0.00001, 0.0001, 0.001, 0.01, 0.1]:
    #             for opt_name in ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']:
    #                 for init_mode in ['uniform', 'lecun_uniform', 'normal', 'zero',
    #                                   'glorot_normal', 'glorot_uniform', 'he_normal',
    #                                   'he_uniform']:
    #                     for activation in ['relu']:
    batch_size = 64
    for epochs in range(5, 6, 5):
        for lr in [1e-4]:
            for opt_name in ['Adam']:
                for init_mode in ['uniform', 'lecun_uniform', 'zero']:
                    for activation in ['relu', 'leakyrelu']:

                        print('>'*80)
                        print('epochs:', epochs, '   lr:', lr, \
                              '   opt:', opt_name, \
                              '   init:', init_mode, \
                              '   act:', 'relu')
                        print('>'*80)

                        model = create_model(learn_rate=lr, 
                                             opt_name=opt_name,
                                             init_mode=init_mode,
                                             activation=activation)

                        history = model.fit(X_train, y_train,
                                            batch_size=batch_size, 
                                            epochs=epochs,
                                            validation_data=(X_test, y_test))

                        mse = model.evaluate(X_test, y_test,
                                             batch_size=batch_size)
                        if (best_mse is None) or \
                            (mse < best_mse):
                            best_mse = mse
                            best_model = model
                            print('best_model:', best_model)
                        
                        print('<'*80)
                        print('epochs:', epochs, \
                              '   lr:', lr, \
                              '   opt:', opt_name, \
                              '   init:', init_mode, \
                              '   act:', 'relu', \
                              ' Test MSE:', mse \
                        )
                        print('<'*80)

In [11]:
model = create_model(learn_rate=0.001)
# model.summary()

Instructions for updating:
Colocations handled automatically by placer.


In [12]:
model.fit(X_train, y_train, batch_size=64, epochs=50, validation_data=(X_test, y_test))

KeyboardInterrupt: 

## Hyperas

In [14]:
import numpy as np

from hyperopt import Trials, STATUS_OK, tpe
from keras.datasets import mnist
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.utils import np_utils

from hyperas import optim
from hyperas.distributions import choice, uniform


def data():
    return X_train, y_train, X_test, y_test


def create_model(x_train, y_train, x_test, y_test):
    def rmse(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_true - y_pred)))

    model = Sequential()
    model.add(LSTM({{choice(256)}},  # ,512,1024,2048,4096)}}, 
                   input_shape=(24,169), activation=LeakyReLU(), recurrent_activation=LeakyReLU()), dropout={{uniform(0,1)}})
    model.add(Dense(12*168, activation='relu'))
    model.add(Reshape((12,168)))
    model.compile(loss='mean_squared_error', metrics=[rmse], optimizer={{choice(['adam'])}})
    # , 'rmsprop', sgd'])}})

    result = model.fit(x_train, y_train,
              batch_size={{choice([128])}},
              epochs=10,
              verbose=2,
              validation_split=0.1)

    validation_rmse = np.amin(result.history['val_rmse']) 
    print('Best validation rmse of epoch:', validation_rmse)
    return {'loss': validation_rmse, 'status': STATUS_OK, 'model': model}



best_run, best_model = optim.minimize(model=create_model,
                                        data=data,
                                        algo=tpe.suggest,
                                        max_evals=10,
                                        trials=Trials(),
                                        notebook_name='Chicago_predictions')
X_train, Y_train, X_test, Y_test = data()
print(best_model.evaluate(X_test, Y_test))

>>> Imports:
#coding=utf-8

try:
    from sqlalchemy import create_engine, func, text
except:
    pass

try:
    from sqlalchemy.orm import sessionmaker
except:
    pass

try:
    from decouple import config
except:
    pass

try:
    from shapely import wkb, wkt
except:
    pass

try:
    from shapely.geometry import Point
except:
    pass

try:
    from geoalchemy2.shape import to_shape
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import random
except:
    pass

try:
    import json
except:
    pass

try:
    from datetime import datetime, timedelta
except:
    pass

try:
    import re
except:
    pass

try:
    from matplotlib import pyplot as plt
except:
    pass

try:
    from sklearn.preprocessing import RobustScaler
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from keras.layers import LSTM, Dense
except:
    pass

try:
    from sklearn.model_selection import Gr

TypeError: object of type 'int' has no len()

In [None]:
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim

best_run = optim.minimize(model=create_model,
                          data=data,
                          algo=tpe.suggest,
                          max_evals=10,
                          trials=Trials())

In [None]:
with session_scope() as SESSION:
    pred_blockid_dict = {}
    for i, x in enumerate(range(1,802)):
        pred_blockid_dict[x] = i
    X_pred = np.zeros((801, 24, 7*24+1))
    for r in GetData().go(SESSION, 2017, 2018):
        X_pred[pred_blockid_dict[r[0]], 12*(r[1]-2017)+r[2]-1, 24*r[3]+r[4]] = float(r[5])

    for i in range(24):
        X_pred[:, i, -1] = (2017*12+i) / (2000 * 12)
    y_pred = model.predict(X_pred)

In [None]:
y_pred

In [None]:
%%timeit
my_gridsearch(X_train, y_train, X_test, y_test)
print('Best test MSE:', best_mse)
best_model.summary()

In [None]:
y_train_pred = best_model.predict(X_train).flatten()
y_test_pred = best_model.predict(X_test).flatten()

plot_output(y_train, y_train_pred, 'Training')
plot_output(y_test, y_test_pred, 'Testing')

In [None]:
def print_y_stats(s, y):
    num_spaces = 2
    if isinstance(y, pd.core.series.Series):
        y_flat = y
    elif isinstance(y, np.ndarray):
        y_flat = y.flatten()
    else:
        raise ValueError('Could not process type:', type(y))
        
    print(s)
    print(' ' * num_spaces, 'min: ', min(y_flat))
    print(' ' * num_spaces, 'max: ', max(y_flat))
    print(' ' * num_spaces, 'mean:', np.mean(y_flat))
    print(' ' * num_spaces, 'std: ', np.std(y_flat))
    return min(y_flat), max(y_flat), np.mean(y_flat), np.std(y_flat)

In [None]:
_, _, _, y_train_std = print_y_stats('y_train:', y_train)
print()
print_y_stats('y_test:', y_test)

## Store predictions in DB

In [None]:
from decouple import config

def store_predictions_in_db(y_pred):
    
    DB_URI_WRITE  = config('DB_URI_WRITE')

    # Put predictions into pandas DataFrame with corresponding block id
    predictions = pd.DataFrame([[x] for x in pred_blockid_dict.keys()], columns=["id"])

    predictions.loc[:, "prediction"] = predictions["id"].apply(lambda x: y_pred[pred_blockid_dict[x],:,:].astype(np.float64).tobytes().hex())
    predictions.loc[:, "month"] = 0
    predictions.loc[:, "year"] = 2019
    predictions.to_csv("predictions.csv", index=False)

    # Query SQL
    query_commit_predictions = """
    CREATE TEMPORARY TABLE temp_predictions (
        id SERIAL PRIMARY KEY,
        prediction TEXT,
        month INTEGER,
        year INTEGER
    );

    COPY temp_predictions (id, prediction, month, year) FROM STDIN DELIMITER ',' CSV HEADER;

    UPDATE block
    SET 
        prediction = DECODE(temp_predictions.prediction, 'hex'),
        month = temp_predictions.month,
        year = temp_predictions.year 
    FROM temp_predictions
    WHERE block.id = temp_predictions.id;

    DROP TABLE temp_predictions;
    """

    # Open saved predictions and send to database using above query
    with open("predictions.csv", "r") as f:
        print("SENDING TO DB")
        RAW_CONN = create_engine(DB_URI_WRITE).raw_connection()
        cursor = RAW_CONN.cursor()
        cursor.copy_expert(query_commit_predictions, f)
        RAW_CONN.commit()
        RAW_CONN.close()

    for r in SESSION.execute("SELECT ENCODE(prediction::BYTEA, 'hex'), id FROM block WHERE prediction IS NOT NULL LIMIT 5;").fetchall():
        print(np.frombuffer(bytes.fromhex(r[0]), dtype=np.float64).reshape((12,7,24)))
        print(y_pred[pred_blockid_dict[int(r[1])], :].reshape((12,7,24)))

In [None]:
store_predictions_in_db(y_pred)

## Remove outliers

### Let's try filtering values that are 20 standard deviations above the mean

In [None]:
std = np.std(y_train)
y_train_filtered = y_train[y_train < 20*std] # Remove all values larger than 20 standard deviations

std = np.std(y_test)
y_test_filtered = y_test[y_test < 20*std]   # Remove all values larger than 20 standard deviations

print('Number of values filtered from y_train:', len(y_train[y_train > 20*std]))
print('Number of values filtered from y_test:', len(y_test[y_test > 20*std]))
fig = plt.figure(figsize=(12, 8))
plt.plot(y_train_filtered, color='blue');
plt.plot(y_test_filtered, color='red');
plt.legend(labels=['training set data', 'testing set data'], prop={'size': 20})
plt.show()

### With our threshold of 20 * std, we have removed 13 points from y_train and 12 from y_test. This is out of 1.6 million points, so they were defintely outliers. Let's run the prediction again with the updated y values

In [None]:
y_train_updated = np.where(y_train.flatten() < 20*std, y_train.flatten(), [0.]*len(y_train.flatten()))
y_test_updated  = np.where(y_test.flatten() < 20*std, y_test.flatten(), [0.]*len(y_test.flatten()))

predict(X_train, 
        y_train_updated.reshape((801, 12, 168)), 
        X_test, 
        y_test_updated.reshape((801, 12, 168)))

In [None]:
y_train_updated_pred = model.predict(X_train).flatten()
y_test_updated_pred = model.predict(X_test).flatten()

plot_output(y_train_updated, y_train_updated_pred, 'Training')
plot_output(y_test_updated, y_test_updated_pred, 'Testing')

### Distribution of y-values

In [None]:
fig = plt.figure(figsize=(12,8))
pd.Series(y_train_updated).hist(bins=np.arange(0.0001, 0.002, 0.0002));

In [None]:
fig = plt.figure(figsize=(12,8))
pd.Series(y_test_updated).hist(bins=np.arange(0.0001, 0.002, 0.0002));

### This data seems reasonable, although it is a little lopsided. Still, it shouldn't cause the neural network to give us the large error that it is giving. Let's try giving the network the full set of features.

## Use the full set of features

In [None]:
class GetDataFull(object):
    def go(self, SESSION, start_year, end_year):
        SQL_QUERY = \
            f'''
            SELECT incident.blockid, 
                    incident.year, 
                    incident.month, 
                    incident.dow, 
                    incident.hour,
                    SUM(crimetype.severity)/AVG(block.population) AS severity
            FROM incident
            INNER JOIN block ON incident.blockid = block.id INNER JOIN crimetype ON incident.crimetypeid = crimetype.id AND block.population > 0
                AND block.population > 0
                AND severity > 0
                AND incident.cityid = 1
                AND incident.year >= {start_year}
                AND incident.year <= {end_year}
            GROUP BY
                incident.blockid,
                incident.year,
                incident.month,
                incident.dow,
                incident.hour
            '''
        return SESSION.execute(text(SQL_QUERY)).fetchall()

In [None]:
def days_in_month(year, month):
    p = pd.Period(f'{year}-{month}-1')
    return p.days_in_month

def day_of_week(dt):
    return dt.weekday()

def create_arrays(blockids, start_year, end_year):
    idx = 0
    X_blockid, X_year, X_month, X_dow, X_hour, X_risk = [], [], [], [], [], []
    for blockid in blockids:
        for year in range(start_year, end_year + 1):
            for month in range(1, 12 + 1):      # month range is 1-12
                for day in range(1, days_in_month(year, month) + 1):
                    for hour in range(24):      # hour range is 0-23
                        X_blockid.append(blockid)
                        X_year.append(year)
                        X_month.append(month)
                        X_dow.append(day_of_week(datetime(year, month, day)))
                        X_hour.append(hour)
                        X_risk.append(0.0)
                        idx += 1
    
    X = pd.DataFrame({'blockid':  X_blockid,
                      'year':     X_year,
                      'month':    X_month,
                      'dow':      X_dow,
                      'hour':     X_hour,
                      'risk':     X_risk})

    return X

In [None]:
def process_data_full(data, start_year, end_year):

    def remove_outliers_from_risk(risk):
        std = np.std(risk)
        risk = np.where(risk < 20*std, 
                     risk, 
                     [0.]*len(risk)).reshape(risk.shape)

        return risk
    
    NUM_BLOCKIDS = 801
    
    delta_years = end_year - start_year + 1
    
    blockid_dict = {}

    # Create random array (BLOCKIDS) from 1-801 inclusive
    # of length NUM_BLOCKIDS
    BLOCKIDS = random.choices(list(range(1,802)), k=NUM_BLOCKIDS)
    
    for ind, blockid in enumerate(BLOCKIDS):
        blockid_dict[blockid] = ind

    blockids = list(blockid_dict.values())
    X = create_arrays(blockids, start_year, end_year)

    # records is the list of rows we get from the query with this order:
    #   blockid, year, month, dow, hour, risk
    #   month is from 1 - 12

    X1 = []
    for r in data:
        if r[0] in blockid_dict:            
            X1.append((r[0], r[1], r[2], r[3], r[4], r[5]))

    X1 = pd.DataFrame(data=X1,
                      columns=['blockid', 'year', 'month', 'dow', 'hour','risk2'])
    X = pd.merge(X, X1, 
                 how='left',
                 left_on=['blockid', 'year', 'month', 'dow', 'hour'],
                 right_on=['blockid', 'year', 'month', 'dow', 'hour']
                )
    X['all_risk'] = X.risk.astype(float) + X.risk2.astype(float)
    X = X.drop(columns=['risk', 'risk2']) \
         .rename(mapper={'all_risk': 'risk'}, axis=1)
    
    y = X['risk'].copy()
    X = X.drop(columns=['risk']).copy()
    y = remove_outliers_from_risk(y)
    
    return X, y

In [None]:
def ready_data_full(training_start_year, training_end_year,
                    testing_start_year, testing_end_year):
    with session_scope() as session:
        training_data = GetDataFull().go(session,
                                         training_start_year,
                                         training_end_year)
        testing_data = GetDataFull().go(session,
                                         testing_start_year,
                                         testing_end_year)
        X_train, y_train = process_data_full(training_data,
                                             training_start_year, 
                                             training_end_year)
        X_test, y_test = process_data_full(testing_data,
                                           testing_start_year, 
                                           testing_end_year)

    return X_train, X_test, y_train, y_test

In [None]:
%%time
X_train, X_test, y_train, y_test = ready_data_full(2015, 2016, 2017, 2018)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# def predict_full(X_train, y_train, X_test, y_test):
#     data_dim    = 5 # 7 * 24   # All values in each hour of the week
#     timesteps   = 2 * 12   # Summed per month
#     batch_size  = 64
#     num_outputs = 7 * 24 * 12

#     # expected input data shape: (batch_size, timesteps, data_dim)
#     model = Sequential()
#     model.add(LSTM(1024, return_sequences=True, 
# #                    input_shape=(timesteps, # 24
# #                                 data_dim), # 168
#                    activation='relu',
#                    kernel_initializer='random_uniform',
#                    bias_initializer='zeros'
#                   )
#              )
#     model.add(LSTM(1024, return_sequences=True, input_shape=(timesteps, data_dim)))
#     model.add(LSTM(128, input_shape=(timesteps, data_dim), activation='relu'))
#     model.add(Dense(num_outputs, activation='relu'))
#     model.add(Reshape((12, 7 * 24)))

#     model.compile(loss='mean_squared_error',
#                    optimizer=Adam(lr=0.1))

#     history = model.fit(X_train, y_train,
#                         batch_size=batch_size, epochs=10,
#                         validation_data=(X_test, y_test))

#     mse = model.evaluate(X_test, y_test,
#                          batch_size=batch_size)
#     print('Test MSE:', mse)
#     return history, model

In [None]:
# history, model = predict_full(X_train, y_train, X_test, y_test)