In [44]:
# Imports first
import csv
import time
from math import sqrt
import geopandas as gpd
import pandas as pd
import shapely
from shapely.ops import nearest_points
import numpy as np
from scipy import spatial
import pyproj

In [2]:
# Set notebook display options
pd.set_option('display.max_rows', 1000)

In [3]:
# Do housekeeping between runs
objects = ['neighborhoods',
           'schools',
           'schools_df',
           'schools_unary_union',
           'crimes_extract',
           'crimes_header',
           'crimes_df',
           'crimes',
           'police_stations',
           'police_stations_df',
           'libraries',
           'libraries_df'
          ]

for thing in objects:
    try:
        del thing
    except NameError as e:
        print("Couldn't delete {0}".format(thing))
        print(e)
        continue

In [100]:
def find_nearest(row, dest_df=None, geom1_col='geometry', geom2_col='geometry'):
    if dest_df is None:
        raise ValueError('Destination DataFrame (df2) argument not provided')
    # D = spatial.distance_matrix([[pt.x, pt.y] for pt in df1[geom1_col]],
    D = spatial.distance_matrix([[row[geom1_col].x, row[geom1_col].y]],
                                [[pt.x, pt.y] for pt in dest_df[geom2_col]])
    # nn = np.array([[np.min(D[i,]), np.argmin(D[i,])] for i in range(df1[geom1_col].shape[0])])
    nn = np.array([[np.min(D[0,]), np.argmin(D[0,])]])
    #crimesgpd['school distance'] = nn[:,0]
    #crimesgpd['nearest school'] = nn[:,1]
    distance = float(nn[:,0][0])
    nearest_index = int(nn[:,1][0])
    #return pd.Series({'nearest idx': nearest_index, 'distance to nearest': distance})
    return nearest_index, distance

In [7]:
def geod2utm(row):
    '''   Convert geodetic coordinates to UTM   '''
    #if zn == None :
    #    zn = lon2zone (lon)
    zn = '16T'
    datum = 'WGS84'
    lat = row['lat']
    lon = row['lon']
        
    p = pyproj.Proj(proj='utm', zone=zn, ellps=datum)
    
    X, Y = p(lon, lat)
    
    #   Return Y, X, Z
    # return Y, X, elev
    return pd.Series({'UTMx': X, 'UTMy': Y})

In [8]:
def make_utm_points(row):
    UTMx = row['UTMx']
    UTMy = row['UTMy']
    UTMPoint = shapely.geometry.Point(UTMx, UTMy)
    return UTMPoint

# Enhance crimes data with nearest school ID and distance to it

In [9]:
schools_df = pd.read_pickle("schools-transformed.pkl")

In [10]:
neighborhoods = pd.read_pickle("neighborhoods.pkl")

In [11]:
crimes_header = ['ID', 'case number', 'date', 'block', 'iucr', 'primary type', 'desc', 'locdesc',
                 'arrest', 'domestic', 'beat', 'district', 'ward', 'community area', 'fbi code',
                 'x coord', 'y coord', 'year', 'updated on', 'lat', 'lon', 'location'
                ]

In [12]:
crimes1_df = pd.read_csv('crimes-2010-2011-0.csv', names=crimes_header, header=None)
crimes2_df = pd.read_csv('crimes-2010-2011-1.csv', names=crimes_header, header=None)
crimes_df = pd.concat([crimes1_df, crimes2_df], names=crimes_header, ignore_index=True)

In [13]:
crimes_df.dropna(subset=['lat', 'lon'], inplace=True)

In [14]:
crimes_df = (crimes_df.merge(neighborhoods[['community', 'area_numbe']], left_on='community area', right_on='area_numbe', how='left'))
del crimes_df['area_numbe']

In [15]:
crimes_df.rename(columns={'community': 'community name'}, inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 721377 entries, 0 to 721376
Data columns (total 23 columns):
ID                721377 non-null int64
case number       721376 non-null object
date              721377 non-null object
block             721377 non-null object
iucr              721377 non-null object
primary type      721377 non-null object
desc              721377 non-null object
locdesc           721148 non-null object
arrest            721377 non-null bool
domestic          721377 non-null bool
beat              721377 non-null int64
district          721377 non-null int64
ward              721345 non-null float64
community area    721019 non-null float64
fbi code          721377 non-null object
x coord           721377 non-null float64
y coord           721377 non-null float64
year              721377 non-null int64
updated on        721377 non-null object
lat               721377 non-null float64
lon               721377 non-null float64
location          721377 non-n

In [16]:
crimes_df['UTMx'] = np.zeros(len(crimes_df))
crimes_df['UTMy'] = np.zeros(len(crimes_df))
crimes_df.loc[:, ('UTMx', 'UTMy')] = crimes_df.apply(geod2utm, axis=1)

In [17]:
crimes_df['UTMPoint'] = crimes_df.apply(make_utm_points, axis=1)

In [19]:
crimes = gpd.GeoDataFrame(crimes_df, geometry='UTMPoint')

In [20]:
crimes.crs = {'init' :'epsg:2966'}

In [21]:
# Create a small extract of data to test performance of algorithms
crimes_extract = crimes.iloc[0:100]

In [22]:
schools_df.set_index("UNIT_ID")
schools_df['UNIT_ID'] = schools_df.index

Unnamed: 0_level_0,geodesic geometry,SCHOOL_NM,SCHOOL_ID,SCH_ADDR,GRADE_CAT,SCH_TYPE,lon,lat,UNIT_ID,UTMx,UTMy,geometry
UNIT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4610,POINT (-87.5748539059 41.7483660139),MANN,610052,8050 S CHAPPEL AV,ES,Elementary School,-87.574854,41.748366,4610,452204.771983,4621998.0,POINT (452204.7719826915 4621997.860158128)
5180,POINT (-87.57284397069999 41.768650177),OKEEFFE,610103,6940 S MERRILL AV,ES,Elementary School,-87.572844,41.76865,5180,452386.881166,4624249.0,POINT (452386.881166386 4624248.790945017)
5300,POINT (-87.5829920307 41.768594283),PARKSIDE,610116,6938 S EAST END AV,ES,Elementary School,-87.582992,41.768594,5300,451543.358669,4624248.0,POINT (451543.3586693273 4624248.25277694)
5440,POINT (-87.60800008530001 41.740365734),PIRIE,610130,650 E 85TH ST,ES,Elementary School,-87.608,41.740366,5440,449442.603124,4621129.0,POINT (449442.6031235668 4621128.574748854)
5610,POINT (-87.60015764629999 41.764223475),REVERE,610146,1010 E 72ND ST,ES,Elementary School,-87.600158,41.764223,5610,450113.209645,4623773.0,POINT (450113.2096448886 4623772.796898597)


In [23]:
schools = gpd.GeoDataFrame(schools_df)

In [24]:
schools_unary_union = schools.unary_union

In [25]:
crimes_extract = crimes.iloc[0:100,]

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 26 columns):
ID                100 non-null int64
case number       100 non-null object
date              100 non-null object
block             100 non-null object
iucr              100 non-null object
primary type      100 non-null object
desc              100 non-null object
locdesc           100 non-null object
arrest            100 non-null bool
domestic          100 non-null bool
beat              100 non-null int64
district          100 non-null int64
ward              100 non-null float64
community area    100 non-null float64
fbi code          100 non-null object
x coord           100 non-null float64
y coord           100 non-null float64
year              100 non-null int64
updated on        100 non-null object
lat               100 non-null float64
lon               100 non-null float64
location          100 non-null object
community name    100 non-null object
UTMx            

In [26]:
start_time = time.time()
crimes_extract = pd.concat([crimes_extract, pd.DataFrame(crimes_extract.apply(distance_to_nearest,
                                     geom_union=schools_unary_union,
                                     df1=crimes_extract,
                                     df2=schools,
                                     geom1_col='UTMPoint',
                                     geom2_col='geometry',
                                     src_column='UNIT_ID',
                                     axis=1
                                    ).tolist(),
             columns = ['nearest_school_id', 'nearest_school_distance'],
             index=crimes_extract.index)], axis=1)
end_time = time.time()
print("That took {0} seconds".format(end_time - start_time))

That took 1.2240447998046875 seconds


In [28]:
crimes_extract.to_pickle('crimes_extract.pkl')

In [35]:
crimes_extract = crimes_extract.set_geometry('UTMPoint')
schools = schools.set_geometry('geometry')

In [104]:
start_time = time.time()
crimes = pd.concat([crimes, pd.DataFrame(crimes.apply(find_nearest,
                                                      dest_df=schools,
                                                      geom1_col='UTMPoint',
                                                      geom2_col='geometry',
                                                      axis=1
                                                     ).tolist(),
                                        columns=['nearest_school_id', 'nearest_school_distance'],
                                        index=crimes.index)], axis=1)
end_time = time.time()
print("That took {0} seconds".format(end_time - start_time))

That took 7781.184709072113 seconds


In [127]:
crimes.to_pickle('crimes-school-enhanced.pkl')

# Enhance crimes with identity and distance to nearest police station

In [132]:
police_stations_df = pd.read_csv('data/Police_Stations_-_Map.csv')

Unnamed: 0,DISTRICT,ADDRESS,CITY,STATE,ZIP,WEBSITE,LAT,LON
0,1,1718 S State St,Chicago,IL,60616,http://home.chicagopolice.org/community/distri...,41.858373,-87.627356
1,2,5101 S Wentworth Ave,Chicago,IL,60609,http://home.chicagopolice.org/community/distri...,41.801811,-87.63056
2,3,7040 S Cottage Grove Ave,Chicago,IL,60637,http://home.chicagopolice.org/community/distri...,41.766431,-87.605748
3,4,2255 E 103rd St,Chicago,IL,60617,http://home.chicagopolice.org/community/distri...,41.707933,-87.568349
4,5,727 E 111th St,Chicago,IL,60628,http://home.chicagopolice.org/community/distri...,41.692723,-87.604506


In [134]:
police_stations_df['geometry'] = police_stations_df.apply(lambda z: shapely.geometry.Point(z.LAT, z.LON), axis=1)

In [142]:
police_stations_df = pd.read_pickle('data/police-stations-transformed.pkl')
police_stations = gpd.GeoDataFrame(police_stations_df)

In [144]:
start_time = time.time()
crimes = pd.concat([crimes, pd.DataFrame(crimes.apply(find_nearest,
                                                      dest_df=police_stations,
                                                      geom1_col='UTMPoint',
                                                      geom2_col='UTMPoint',
                                                      axis=1
                                                     ).tolist(),
                                        columns=['nearest_station_id', 'nearest_station_distance'],
                                        index=crimes.index)], axis=1)
end_time = time.time()
print("That took {0} seconds".format(end_time - start_time))

That took 349.4852330684662 seconds


# Enhance crimes with identity and distance to nearest library

In [150]:
libraries_df = None
libraries = None
libraries_df = pd.read_pickle('data/libraries.pkl')

In [151]:
libraries = gpd.GeoDataFrame(libraries_df)

In [157]:
start_time = time.time()
crimes = pd.concat([crimes, pd.DataFrame(crimes.apply(find_nearest,
                                                      dest_df=libraries,
                                                      geom1_col='UTMPoint',
                                                      geom2_col='UTMPoint',
                                                      axis=1
                                                     ).tolist(),
                                        columns=['nearest_library_id', 'nearest_library_distance'],
                                        index=crimes.index)], axis=1)
end_time = time.time()
print("That took {0} seconds".format(end_time - start_time))

That took 1031.4299490451813 seconds


In [159]:
crimes.to_pickle('data/crimes-enhanced.pkl')