In [1]:
# Imports first
import csv
from math import sqrt
import geopandas as gpd
import pandas as pd
import shapely
from shapely.ops import nearest_points
import numpy as np
from scipy import ndimage
from scipy.spatial import cKDTree  
import pyproj

import matplotlib
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib.colors import ListedColormap
from matplotlib.ticker import MaxNLocator

In [2]:
# Set notebook display options
pd.set_option('display.max_rows', 500)

In [3]:
# Do housekeeping between runs
objects = ['neighborhoods',
           'schools',
           'crimes_extract',
           'crimes_header',
           'crimes']

for thing in objects:
    try:
        del thing
    except NameError as e:
        print("Couldn't delete {0}".format(thing))
        print(e)
        continue

In [118]:
def distance_to_nearest(row, geom_union, df1, df2, geom1_col='geometry', geom2_col='geometry', src_column=None):
#def distance_to_nearest(row, dtn_args):
     """Find the nearest point and return the corresponding value from specified column."""
     # Find the geometry that is closest
     nearest = df2[geom2_col] == nearest_points(row[geom1_col], geom_union)[1]
     print(row)
     print(nearest)
     # Get the corresponding value from df2 (matching is based on the geometry)
     nearest_id = df2[nearest][src_column].get_values()[0]
     start_point: shapely.geometry.Point = row['geometry']
     end_point_series: pd.Series = df2[nearest]['geometry']
     end_point: shapely.geometry.Point = end_point_series.iloc[0]
     dist_to_point = start_point.distance(end_point)
     return nearest_id, dist_to_point

In [4]:
# Define utility functions
def nearest(row, geom_union, df1, df2, geom1_col='geometry', geom2_col='geometry', src_column=None):
    """Find the nearest point and return the corresponding value from specified column."""
    # Find the geometry that is closest
    nearest = df2[geom2_col] == nearest_points(row[geom1_col], geom_union)[1]
    #nearest = df2[geom2_col] == nearest_points(df1, df2)[1]
    # Get the corresponding value from df2 (matching is based on the geometry)
    value = df2[nearest][src_column].get_values()[0]
    return value

In [5]:
def geod2utm(row):
    '''   Convert geodetic coordinates to UTM   '''
    #if zn == None :
    #    zn = lon2zone (lon)
    zn = '16T'
    datum = 'WGS84'
    lat = row['lat']
    lon = row['lon']
        
    p = pyproj.Proj(proj='utm', zone=zn, ellps=datum)
    
    X, Y = p(lon, lat)
    
    #   Return Y, X, Z
    # return Y, X, elev
    return pd.Series({'UTMx': X, 'UTMy': Y})

In [6]:
def make_utm_points(row):
    UTMx = row['UTMx']
    UTMy = row['UTMy']
    UTMPoint = shapely.geometry.Point(UTMx, UTMy)
    return UTMPoint

In [7]:
# Fix column type in neighborhoods dataset
neighborhoods = pd.read_pickle("neighborhoods.pkl")

In [8]:
neighborhoods['area_numbe'] = neighborhoods['area_numbe'].astype('float64')

In [9]:
neighborhoods.to_pickle('neighborhoods-transformed.pkl')

In [10]:
# Enhance school data with UTM coordinates and associated shapely Points
schools = pd.read_pickle("schools.pkl")

In [11]:
schools.rename(columns={'X': 'lon', 'Y': 'lat'}, inplace=True)
schools.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 688 entries, 4610 to 4210
Data columns (total 12 columns):
geometry     688 non-null object
SCHOOL_NM    688 non-null object
SCHOOL_ID    688 non-null int64
SCH_ADDR     688 non-null object
GRADE_CAT    688 non-null object
SCH_TYPE     688 non-null object
lon          688 non-null float64
lat          688 non-null float64
UNIT_ID      688 non-null int64
UTMx         688 non-null float64
UTMy         688 non-null float64
UTMPoint     688 non-null object
dtypes: float64(4), int64(2), object(6)
memory usage: 69.9+ KB


In [12]:
schools['UTMx'] = np.zeros(len(schools))
schools['UTMy'] = np.zeros(len(schools))
schools.loc[:, ('UTMx', 'UTMy')] = schools.apply(geod2utm, axis=1)
schools['UTMPoint'] = schools.apply(make_utm_points, axis=1)
schools.head()

Unnamed: 0_level_0,geometry,SCHOOL_NM,SCHOOL_ID,SCH_ADDR,GRADE_CAT,SCH_TYPE,lon,lat,UNIT_ID,UTMx,UTMy,UTMPoint
UNIT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4610,POINT (-87.5748539059 41.7483660139),MANN,610052,8050 S CHAPPEL AV,ES,Elementary School,-87.574854,41.748366,4610,452204.771983,4621998.0,POINT (452204.7719826915 4621997.860158128)
5180,POINT (-87.57284397069999 41.768650177),OKEEFFE,610103,6940 S MERRILL AV,ES,Elementary School,-87.572844,41.76865,5180,452386.881166,4624249.0,POINT (452386.881166386 4624248.790945017)
5300,POINT (-87.5829920307 41.768594283),PARKSIDE,610116,6938 S EAST END AV,ES,Elementary School,-87.582992,41.768594,5300,451543.358669,4624248.0,POINT (451543.3586693273 4624248.25277694)
5440,POINT (-87.60800008530001 41.740365734),PIRIE,610130,650 E 85TH ST,ES,Elementary School,-87.608,41.740366,5440,449442.603124,4621129.0,POINT (449442.6031235668 4621128.574748854)
5610,POINT (-87.60015764629999 41.764223475),REVERE,610146,1010 E 72ND ST,ES,Elementary School,-87.600158,41.764223,5610,450113.209645,4623773.0,POINT (450113.2096448886 4623772.796898597)


In [13]:
schools.to_pickle('schools-transformed.pkl')

In [14]:
# Enhance crimes data with community name from the neighborhoods dataset and UTM coordinates/Points
crimes_header = ['ID', 'case number', 'date', 'block', 'iucr', 'primary type', 'desc', 'locdesc',
                 'arrest', 'domestic', 'beat', 'district', 'ward', 'community area', 'fbi code',
                 'x coord', 'y coord', 'year', 'updated on', 'lat', 'lon', 'location'
                ]
crimes = pd.read_csv("parallel/xaa", names=crimes_header, header=None)
crimes.dropna(subset=['lat', 'lon'], inplace=True)

In [15]:
crimes = (crimes.merge(neighborhoods[['community', 'area_numbe']], left_on='community area', right_on='area_numbe'))
del crimes['area_numbe']

In [16]:
crimes['UTMx'] = np.zeros(len(crimes))
crimes['UTMy'] = np.zeros(len(crimes))
crimes.loc[:, ('UTMx', 'UTMy')] = crimes.apply(geod2utm, axis=1)
crimes.head()

Unnamed: 0,ID,case number,date,block,iucr,primary type,desc,locdesc,arrest,domestic,...,x coord,y coord,year,updated on,lat,lon,location,community,UTMx,UTMy
0,8265244,HT498458,09/15/2011 08:00:00 AM,040XX N MENARD AVE,620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE,True,False,...,1136960.0,1926315.0,2011,02/12/2018 03:46:59 PM,41.953965,-87.771918,"(41.95396528, -87.771918163)",PORTAGE PARK,436025.308686,4644953.0
1,7357772,HS159408,07/01/2001 10:00:00 AM,056XX W CORNELIA AVE,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,...,1138173.0,1922746.0,2001,02/18/2010 01:12:55 AM,41.94415,-87.767546,"(41.9441497, -87.767545572)",PORTAGE PARK,436377.940882,4643860.0
2,7609828,HS413731,07/13/2010 12:00:00 PM,041XX N LONG AVE,820,THEFT,$500 AND UNDER,PARK PROPERTY,False,False,...,1139616.0,1926816.0,2010,02/04/2016 06:33:39 AM,41.955292,-87.762142,"(41.95529191, -87.762141971)",PORTAGE PARK,436836.850742,4645093.0
3,7610439,HS413034,07/16/2010 03:00:00 AM,056XX W SCHOOL ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,...,1138176.0,1921375.0,2010,02/04/2016 06:33:39 AM,41.940387,-87.767568,"(41.940387482, -87.767567805)",PORTAGE PARK,436372.357517,4643442.0
4,7610466,HS414753,07/17/2010 12:25:00 AM,032XX N CICERO AVE,1506,PROSTITUTION,SOLICIT ON PUBLIC WAY,ALLEY,True,False,...,1143808.0,1920986.0,2010,02/04/2016 06:33:39 AM,41.939216,-87.746878,"(41.939216164, -87.746877814)",PORTAGE PARK,438086.334871,4643297.0


In [17]:
crimes['UTMPoint'] = crimes.apply(make_utm_points, axis=1)

In [18]:
crimes.head()

Unnamed: 0,ID,case number,date,block,iucr,primary type,desc,locdesc,arrest,domestic,...,y coord,year,updated on,lat,lon,location,community,UTMx,UTMy,UTMPoint
0,8265244,HT498458,09/15/2011 08:00:00 AM,040XX N MENARD AVE,620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE,True,False,...,1926315.0,2011,02/12/2018 03:46:59 PM,41.953965,-87.771918,"(41.95396528, -87.771918163)",PORTAGE PARK,436025.308686,4644953.0,POINT (436025.3086861699 4644953.17679981)
1,7357772,HS159408,07/01/2001 10:00:00 AM,056XX W CORNELIA AVE,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,...,1922746.0,2001,02/18/2010 01:12:55 AM,41.94415,-87.767546,"(41.9441497, -87.767545572)",PORTAGE PARK,436377.940882,4643860.0,POINT (436377.940881997 4643860.108357577)
2,7609828,HS413731,07/13/2010 12:00:00 PM,041XX N LONG AVE,820,THEFT,$500 AND UNDER,PARK PROPERTY,False,False,...,1926816.0,2010,02/04/2016 06:33:39 AM,41.955292,-87.762142,"(41.95529191, -87.762141971)",PORTAGE PARK,436836.850742,4645093.0,POINT (436836.8507415869 4645093.21940705)
3,7610439,HS413034,07/16/2010 03:00:00 AM,056XX W SCHOOL ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,...,1921375.0,2010,02/04/2016 06:33:39 AM,41.940387,-87.767568,"(41.940387482, -87.767567805)",PORTAGE PARK,436372.357517,4643442.0,POINT (436372.3575166143 4643442.41025281)
4,7610466,HS414753,07/17/2010 12:25:00 AM,032XX N CICERO AVE,1506,PROSTITUTION,SOLICIT ON PUBLIC WAY,ALLEY,True,False,...,1920986.0,2010,02/04/2016 06:33:39 AM,41.939216,-87.746878,"(41.939216164, -87.746877814)",PORTAGE PARK,438086.334871,4643297.0,POINT (438086.3348712226 4643297.20981963)


In [19]:
# Create a small extract of data to test performance of algorithms
crimes_extract = crimes.iloc[0:100]

In [39]:
schools.set_index("UNIT_ID")
schools['UNIT_ID'] = schools.index
schools.head()

Unnamed: 0_level_0,geometry,SCHOOL_NM,SCHOOL_ID,SCH_ADDR,GRADE_CAT,SCH_TYPE,lon,lat,UNIT_ID,UTMx,UTMy,UTMPoint
UNIT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4610,POINT (-87.5748539059 41.7483660139),MANN,610052,8050 S CHAPPEL AV,ES,Elementary School,-87.574854,41.748366,4610,452204.771983,4621998.0,POINT (452204.7719826915 4621997.860158128)
5180,POINT (-87.57284397069999 41.768650177),OKEEFFE,610103,6940 S MERRILL AV,ES,Elementary School,-87.572844,41.76865,5180,452386.881166,4624249.0,POINT (452386.881166386 4624248.790945017)
5300,POINT (-87.5829920307 41.768594283),PARKSIDE,610116,6938 S EAST END AV,ES,Elementary School,-87.582992,41.768594,5300,451543.358669,4624248.0,POINT (451543.3586693273 4624248.25277694)
5440,POINT (-87.60800008530001 41.740365734),PIRIE,610130,650 E 85TH ST,ES,Elementary School,-87.608,41.740366,5440,449442.603124,4621129.0,POINT (449442.6031235668 4621128.574748854)
5610,POINT (-87.60015764629999 41.764223475),REVERE,610146,1010 E 72ND ST,ES,Elementary School,-87.600158,41.764223,5610,450113.209645,4623773.0,POINT (450113.2096448886 4623772.796898597)


In [112]:
#schools2 = schools
#len(schools2)
#df = df[~df['A'].apply(tuple).duplicated()]
#schools3 = schools2[~schools2[]]
#schools2[schools2.groupby(['UNIT_ID', 'SCHOOL_NM', 'SCHOOL_ID', 'UTMx', 'UTMy']).count() > 1]
#schools2.drop_duplicates(subset=['UNIT_ID', 'SCHOOL_ID']).count()
#schools2[schools2.duplicated(subset=['UTMx', 'UTMy'])].sort_values('SCHOOL_ID')
#len(schools2)

Unnamed: 0_level_0,geometry,SCHOOL_NM,SCHOOL_ID,SCH_ADDR,GRADE_CAT,SCH_TYPE,lon,lat,UNIT_ID,UTMx,UTMy,UTMPoint
UNIT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6655,POINT (-87.6353490959 41.7707452934),AMANDLA,400012,6800 S STEWART AV,HS,Charter,-87.635349,41.770745,6655,447193.332704,4624518.0,POINT (447193.3327041139 4624517.890699782)
7020,POINT (-87.76321129540001 41.8851821631),AUSTIN BUS & ENTRP HS,400018,231 N PINE AV,HS,Contract,-87.763211,41.885182,7020,436678.950463,4637310.0,POINT (436678.9504628467 4637309.836276419)
4370,POINT (-87.7080245074 41.8584605337),CATALYST,400020,1616 S SPAULDING AV,ES,Charter,-87.708025,41.858461,4370,441233.165597,4634304.0,POINT (441233.1655973321 4634303.755934255)
8047,POINT (-87.618372173 41.8363986498),CHICAGO ARTS HS,400022,3200 S. CALUMET AV,HS,Contract,-87.618372,41.836399,8047,448656.80773,4631797.0,POINT (448656.8077300153 4631796.821226656)
7810,POINT (-87.72133536530001 41.8583288116),KIPP ASCEND CHARTER,400044,1616 S AVERS AV,MS,Charter,-87.721335,41.858329,7810,440128.220031,4634298.0,POINT (440128.2200308657 4634298.327671785)
5870,POINT (-87.7305482714 41.85685112239999),LEGACY CHARTER,400049,4217 W 18TH ST,ES,Charter,-87.730548,41.856851,5870,439362.136215,4634141.0,POINT (439362.1362152169 4634140.729628502)
1933,POINT (-87.66453430780001 41.8953088467),NOBLE - GOLDER,400053,1454 W. SUPERIOR ST,HS,Charter,-87.664534,41.895309,1933,444874.59622,4638366.0,POINT (444874.5962195973 4638366.07121895)
1962,POINT (-87.65225497829999 41.7459401482),PERSPECTIVES - CALUMET TECHNOLOGY HS,400062,8131 S MAY ST,HS,Charter,-87.652255,41.74594,1962,445767.322694,4621774.0,POINT (445767.3226936422 4621774.417733593)
1963,POINT (-87.65225497829999 41.7459401482),PERSPECTIVES - MIDDLE SCHOOL,400067,8131 S MAY ST,MS,Charter,-87.652255,41.74594,1963,445767.322694,4621774.0,POINT (445767.3226936422 4621774.417733593)
4140,POINT (-87.62448455090001 41.8045795662),SHABAZZ CHTR - DUSABLE,400073,4934 S WABASH AV,HS,Charter,-87.624485,41.80458,4140,448123.621404,4628268.0,POINT (448123.6214036231 4628267.743629881)


In [113]:
schools_unary_union = schools.unary_union

In [119]:
crimes_extract['nearest_id'] = crimes_extract.apply(distance_to_nearest,
                                                    geom_union=schools_unary_union,
                                                    df1=crimes_extract,
                                                    df2=schools,
                                                    src_column='UNIT_ID',
                                                    geom1_col='UTMPoint',
                                                    geom2_col='UTMPoint',
                                                    axis=1)
crimes_extract.head()

ID                                                   8265244
case number                                         HT498458
date                                  09/15/2011 08:00:00 AM
block                                     040XX N MENARD AVE
iucr                                                    0620
primary type                                        BURGLARY
desc                                          UNLAWFUL ENTRY
locdesc                                            RESIDENCE
arrest                                                  True
domestic                                               False
beat                                                    1624
district                                                  16
ward                                                      38
community area                                            15
fbi code                                                  05
x coord                                          1.13696e+06
y coord                 

IndexError: ('index 0 is out of bounds for axis 0 with size 0', 'occurred at index 0')

In [132]:
my_nearest_point = schools['UTMPoint'] == nearest_points(crimes.iloc[0]['UTMPoint'], schools_unary_union)[1]
print(my_nearest_point)
schools['UTMPoint' == my_nearest_point]

UNIT_ID
4610    False
5180    False
5300    False
5440    False
5610    False
5670    False
5880    False
6100    False
6350    False
6900    False
1135    False
2123    False
2123    False
1105    False
6320    False
6450    False
6570    False
7190    False
7610    False
5640    False
6490    False
4230    False
4310    False
4430    False
4430    False
4710    False
5430    False
5970    False
7720    False
4330    False
6710    False
2940    False
4650    False
5370    False
5480    False
6750    False
6760    False
2510    False
2590    False
2910    False
2970    False
3260    False
3370    False
3380    False
3520    False
3630    False
3690    False
3760    False
3900    False
3980    False
4250    False
4320    False
4720    False
5390    False
6310    False
6440    False
6930    False
7490    False
7880    False
8021    False
2580    False
2830    False
3040    False
3110    False
3530    False
3720    False
4350    False
4570    False
6880    False
6880    False
7050    Fals

  result = method(y)


TypeError: invalid type comparison

In [20]:
crimes.to_pickle('crimes-transformed.pkl')