In [1]:
# wrangling
import numpy as np
import pandas as pd
import feather

# geospatial
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points

# others
import re
import os
import glob
from datetime import datetime
from tqdm import tqdm_notebook as tqdm

# load data with features

In [2]:
# Read the feature table, order the rows by timestamp and then by tract id,
# and use the timestamp as the index.
df = (
    feather.read_dataframe("features/features_binary_tract_2H.feather")
    .sort_values(by=["datetime", "geoid10_tract"])
    .set_index("datetime")
)
print(df.shape)
df.head()

(3626217, 9)


Unnamed: 0_level_0,geoid10_tract,crime,prcp,year,month,woy,dow,weekend,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01,6075010100,1,0.0,2015,1,1,3,0,0
2015-01-01,6075010200,1,0.0,2015,1,1,3,0,0
2015-01-01,6075010400,1,0.0,2015,1,1,3,0,0
2015-01-01,6075010500,1,0.0,2015,1,1,3,0,0
2015-01-01,6075010600,1,0.0,2015,1,1,3,0,0


In [3]:
# Show the dtype of every column to check which ones are numeric.
df.dtypes

geoid10_tract     object
crime              int64
prcp             float64
year               int64
month              int64
woy                int64
dow                int64
weekend            int64
hour               int64
dtype: object

In [4]:
# Count the missing values in each column.
df.isnull().sum()

geoid10_tract    0
crime            0
prcp             0
year             0
month            0
woy              0
dow              0
weekend          0
hour             0
dtype: int64

# transformation for RNN input

In [5]:
# Configuration: `window` is the look-back length, i.e. how many consecutive
# preceding rows are sliced out as the input sequence for each sample below.
window = 10

In [6]:
def get_sequences(df, window):
    """
    Slice a frame into overlapping, consecutive row sequences.

    For every row position ``i`` from ``window`` up to the last row, the rows
    ``i - window`` through ``i`` (inclusive) are taken from ``df.values``.
    Each returned sequence therefore holds ``window + 1`` rows: the ``window``
    preceding rows followed by row ``i`` itself.

    Parameters
    ----------
    df : object exposing ``.values`` (a 2-D array) and ``len()``,
        e.g. a pandas DataFrame
    window : int
        number of preceding rows included in front of each row

    Returns
    -------
    list of ndarray
        ``len(df) - window`` slices; an empty list when ``window >= len(df)``.
    """
    data = df.values
    n_rows = len(df)

    # one slice per end position, stepping forward one row at a time
    return [data[end - window:end + 1] for end in range(window, n_rows)]

In [7]:
# Output (target) columns: either the single "crime" column or the three
# "incident_type_*" columns, depending on which one the loaded frame contains.
if "crime" in df.columns:
    y_cols = ["crime"]
elif "incident_type_1" in df.columns:
    y_cols = ["incident_type_0", "incident_type_1", "incident_type_2"]
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        "no target column found: expected 'crime' or 'incident_type_*' in df"
    )

# Geo column: the identifier used to split the frame into one series per area.
if "geoid10_tract" in df.columns:
    geo_col = ["geoid10_tract"]
elif "geoid10_block" in df.columns:
    geo_col = ["geoid10_block"]
else:
    raise ValueError(
        "no geo column found: expected 'geoid10_tract' or 'geoid10_block' in df"
    )

# Input columns: everything that is neither a target nor the geo identifier.
x_cols = list(df.drop(columns=y_cols + geo_col).columns)

In [8]:
# Group the rows by the geo identifier so each group can be processed as its own series.
geo_grs = df.groupby(by=geo_col)

In [9]:
# Arrays to store x and y for every tract.
#
# The time axis is sized from the longest per-tract series. Using len(df)
# here would count the rows of *all* tracts together and make the arrays
# larger by a factor of the number of tracts.
group_sizes = geo_grs.size()
n_tracts = len(geo_grs)
n_steps = max(int(group_sizes.max()) - window, 0) if n_tracts else 0

# x_all: (window size, input size, no of tracts, no of timesteps)
# Filled with NaN so that slots which are never written (tracts shorter than
# the longest one) are recognisable instead of holding uninitialised memory.
x_all = np.full(
    shape=(window, len(x_cols + y_cols), n_tracts, n_steps),
    fill_value=np.nan,
)

# y_all: (output size, no of tracts, no of timesteps)
y_all = np.full(shape=(len(y_cols), n_tracts, n_steps), fill_value=np.nan)

for i, (_, gr) in enumerate(tqdm(geo_grs)):
    # inputs carry the lagged target columns first, then the other features
    x_values = gr[y_cols + x_cols].values
    y_values = gr[y_cols].values

    for j in range(window, len(gr)):
        # sample k uses the `window` rows before row j as input and row j as
        # target; shifting by `window` keeps the index inside the time axis
        k = j - window
        x_all[:, :, i, k] = x_values[j - window:j, :]
        y_all[:, i, k] = y_values[j, :]

HBox(children=(IntProgress(value=0, max=195), HTML(value='')))


