In [1]:
# wangling
import numpy as np
import pandas as pd
import feather

# geospatial
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points

# others
import re
import os
import glob
from datetime import datetime
from tqdm import tqdm_notebook as tqdm

# original
from scripts.utils import exchange_coordinate, epsg_converter, df_2_geodf
from scripts.utils import add_geometry, add_weather

- The code below can get the nearest weahter station data.
- But it is computationally very intensive to calculate the nearest distance between polygon and point (point and point is much faster)
- So, the average of weather stations in SF is used in stead of the nearest value.

In [2]:
# def find_nearest_value(point, pts, geodf2, src_col):
#     """
#     return value of src_col in nearest point in geodf2 
#     """
#     nearest_point = nearest_points(point, pts)[1]
#     value = geodf2.loc[geodf2["geometry"] == nearest_point, src_col].values[0]
    
#     return value

# # add nearest weather station data
# pts = weather.geometry.unary_union
# geodf["prcp"] = geodf[:5].apply(lambda x: find_nearest_value(x.geometry, pts, weather, "PRCP"), axis=1)

# define functions for feature engineering

## datetime

In [3]:
def convert_datetime(geodf):
    """
    return geodf with datetime columns
    """
    geodf["year"] = geodf.datetime.dt.year
    geodf["month"] = geodf.datetime.dt.month
    geodf["woy"] = geodf.datetime.dt.weekofyear 
    geodf["dow"] = geodf.datetime.dt.dayofweek
    geodf["weekend"] = geodf["dow"].apply(lambda x: 1 if x >=5 else 0) 
    geodf["hour"] = geodf.datetime.dt.hour
    
    return geodf

## exponentially weighted mean

- ewm weights more recent trend

In [4]:
def get_ewm(series, alpha, adjust=True, timesteps=1):
    """
    return series with exponential weighted mean
    """
    # shift data to avoid leakage
    ewm = series.shift(timesteps).ewm(alpha=alpha, adjust=adjust).mean()
    
    return ewm

In [5]:
def get_grouped_ewm(geodf, groupby, alpha, adjust=True, timesteps=1):
    """
    return dataframe with exponentaial weighted mean
    """
    # calculate exponential weighted mean by each groupby unit
    # multi-class
    if "incident_type_0" in geodf.columns:
        i_types = ["incident_type_0", "incident_type_1", "incident_type_2"]
        for i_type in i_types:
            roll = geodf.groupby(groupby).apply(lambda x: get_ewm(x[i_type], alpha, adjust, timesteps))
            geodf[i_type + "_ewm_" + str(alpha)] = roll.sort_index(level = [groupby, "datetime"]).values
        
    # binary
    elif "crime" in geodf.columns:
        roll = geodf.groupby(groupby).apply(lambda x: get_ewm(x.crime, alpha, adjust, timesteps))
        geodf["ewm_" + str(alpha)] = roll.sort_index(level = [groupby, "datetime"]).values
    
    return geodf

# add features

In [6]:
# load weather data
weather = pd.read_csv('data/weather.csv')
weather = exchange_coordinate(weather, lon="LONGITUDE", lat="LATITUDE", prefix="station")
crs = {'init':'epsg:4326'}
weather = df_2_geodf(weather, crs, lon="station_lon_fix", lat="station_lat_fix")

# load sf data
sf = gpd.read_file("data/census2010_ sf_tracks.geojson", crs=weather.crs)
sf = sf[['geometry']]

# extract only stations in SF
weather_sf = gpd.sjoin(weather, sf, how="inner", op="intersects")
weather_sf["date"] = pd.to_datetime(weather_sf["DATE"])
weather_sf.set_index("date", inplace=True)

# get average precipitation per day
weather_day = weather_sf.resample("1D").agg({"PRCP": ["mean"],})
weather_day.reset_index(inplace=True)
weather_day.columns = ["date", "prcp"]

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# get all paths of crime counts files
all_paths = glob.glob("data/crime_counts/*.feather")
n_paths = len(all_paths)
all_paths

['data/crime_counts/counts_binary_tract_1H.feather',
 'data/crime_counts/counts_multi-class_tract_2H.feather',
 'data/crime_counts/counts_binary_tract_2H.feather',
 'data/crime_counts/counts_multi-class_tract_1H.feather']

In [12]:
# add features to all crime counts files
for i, path in enumerate(tqdm(all_paths)):
    
    print("------------- {} / {} --------------".format(i+1, n_paths))
    print("[{0:%H:%M%:%S}] {1}".format(datetime.now(), path))

    # load crime counts data
    df = feather.read_dataframe(path)
    df["datetime"] = pd.to_datetime(df["datetime"])
    print("[{0:%H:%M%:%S}] loaded crime counts data {1}".format(datetime.now(), df.shape))

    # convert to geodataframe
    geodf = add_geometry(df, crs)
    print("[{0:%H:%M%:%S}] added geometry {1}".format(datetime.now(), geodf.shape))

    # add precipitation
    geodf = add_weather(geodf, weather_day)
    print("[{0:%H:%M%:%S}] added precipitation {1}".format(datetime.now(), geodf.shape))
    
    # convert datetime
    geodf = convert_datetime(geodf)
    print("[{0:%H:%M%:%S}] converted datetime {1}".format(datetime.now(), geodf.shape))

    # add exponential weighted mean
    if "geoid10_tract" in df.columns:
        groupby = "geoid10_tract"
    elif "geoid10_block" in df.columns:
        groupby = "geoid10_block"
    
#     geodf = get_grouped_ewm(geodf.set_index("datetime"),
#                             groupby=groupby, alpha=0.5,
#                             adjust=True, timesteps=1)
#     geodf.reset_index(inplace=True)
#     print("[{0:%H:%M%:%S}] added exponetial weighted mean {1}".format(datetime.now(), geodf.shape))
    
    # geometry cannot be included in feather format
    geodf.drop(["geometry", "date"], axis=1, inplace=True)
    
    # clean
    geodf.sort_values(by=["datetime", groupby], inplace=True)
    geodf.reset_index(inplace=True, drop=True)
    geodf = geodf[geodf.datetime < datetime(2019, 4, 1)] # precipitation missing values after 2019/04/01
    
    # save
    save_path = re.sub(r".*/counts_", "features/features_", path)
    geodf.to_feather(save_path)
    print("[{0:%H:%M%:%S}] saved at {1}\n".format(datetime.now(), path))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

------------- 1 / 4 --------------
[14:43:54] data/crime_counts/counts_binary_tract_1H.feather
[14:43:55] loaded crime counts data (7334162, 3)
[14:43:57] added geometry (7334162, 4)
[14:44:02] added precipitation (7334162, 6)
[14:44:06] converted datetime (7334162, 12)
[14:44:14] saved at data/crime_counts/counts_binary_tract_1H.feather

------------- 2 / 4 --------------
[14:44:14] data/crime_counts/counts_multi-class_tract_2H.feather
[14:44:14] loaded crime counts data (3667171, 5)
[14:44:15] added geometry (3667171, 6)
[14:44:17] added precipitation (3667171, 8)
[14:44:19] converted datetime (3667171, 14)
[14:44:23] saved at data/crime_counts/counts_multi-class_tract_2H.feather

------------- 3 / 4 --------------
[14:44:23] data/crime_counts/counts_binary_tract_2H.feather
[14:44:23] loaded crime counts data (3667171, 3)
[14:44:24] added geometry (3667171, 4)
[14:44:26] added precipitation (3667171, 6)
[14:44:28] converted datetime (3667171, 12)
[14:44:32] saved at data/crime_counts