In [1]:
# Setting up modules
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point
import matplotlib.pylab as plt
import geopy

In [2]:
# Will change this once I find the csv file for 2013-2017 data
crime = pd.read_csv('data/NYPD_Complaint_Data_2018to19.csv')
crime.head()

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,SUSP_SEX,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
0,857927015,10.0,MANHATTAN,01/29/2019,16:37:00,01/29/2019,16:45:00,COMPLETED,,,...,M,,UNKNOWN,UNKNOWN,M,984140.0,211709.0,40.747777,-74.000398,"(40.747777093, -74.000398443)"
1,479254687,101.0,QUEENS,03/29/2019,17:00:00,03/29/2019,17:10:00,COMPLETED,,,...,M,,25-44,BLACK,F,1054076.0,157437.0,40.598538,-73.74856,"(40.598537593, -73.748559596)"
2,320007604,41.0,BRONX,02/06/2019,02:00:00,,,COMPLETED,,,...,M,,UNKNOWN,UNKNOWN,D,1011589.0,237996.0,40.819886,-73.901227,"(40.819885621, -73.901226998)"
3,746022144,68.0,BROOKLYN,01/08/2019,22:49:00,01/08/2019,22:52:00,COMPLETED,,,...,M,,UNKNOWN,UNKNOWN,E,984439.0,166855.0,40.624663,-73.999321,"(40.624663129, -73.999320591)"
4,145366108,25.0,MANHATTAN,02/11/2019,12:07:00,,,COMPLETED,,,...,,,UNKNOWN,UNKNOWN,E,1002776.0,231994.0,40.803435,-73.933084,"(40.80343468, -73.93308448)"


In [3]:
crime.shape

(220998, 35)

In [4]:
nta = gpd.read_file('data/Neighborhood Tabulation Areas.geojson')
nta.head()

Unnamed: 0,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code,geometry
0,BK88,54005018.7472,47,Borough Park,39247.2280737,Brooklyn,3,(POLYGON ((-73.97604935657381 40.6312759056467...
1,QN51,52488276.477,81,Murray Hill,33266.904811,Queens,4,(POLYGON ((-73.80379022888246 40.7756101117924...
2,QN27,19726845.26,81,East Elmhurst,19816.7115378,Queens,4,(POLYGON ((-73.86109724401859 40.7636644770877...
3,QN07,22887772.7683,81,Hollis,20976.3358374,Queens,4,(POLYGON ((-73.75725671509139 40.7181386016625...
4,MN06,10647077.6122,61,Manhattanville,17040.6865482,Manhattan,1,(POLYGON ((-73.94607828608069 40.8212632160616...


In [5]:
# make sure lat long variables are numeric
crime[['Latitude', 'Longitude']].dtypes 

Latitude     float64
Longitude    float64
dtype: object

In [None]:
# data cleaning, we wanna remove entries where lat long are null before we do spatial join

In [29]:
# check out how many rows with lat, long columns each as null
crime[['Latitude', 'Longitude']].isnull().apply(pd.value_counts)

Unnamed: 0,Latitude,Longitude
False,220978,220978
True,20,20


In [31]:
# the offending 20 rows
crime[crime.Latitude.isnull()]

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon,geometry
14338,507845354,41.0,BRONX,01/27/2019,12:42:00,01/27/2019,12:42:00,COMPLETED,,,...,,UNKNOWN,UNKNOWN,E,,,,,,POINT (nan nan)
21458,325973169,41.0,BRONX,03/27/2018,02:30:00,03/29/2018,12:20:00,COMPLETED,,,...,,18-24,WHITE HISPANIC,M,,,,,,POINT (nan nan)
22297,725167396,41.0,BRONX,01/23/2018,12:28:00,01/24/2018,12:00:00,COMPLETED,,,...,,25-44,BLACK,M,,,,,,POINT (nan nan)
69074,365998895,40.0,BRONX,12/24/2018,18:30:00,12/24/2018,18:40:00,COMPLETED,,,...,,UNKNOWN,UNKNOWN,F,,,,,,POINT (nan nan)
70480,672048068,84.0,BROOKLYN,12/20/2018,20:05:00,12/20/2018,20:05:00,COMPLETED,,,...,,UNKNOWN,UNKNOWN,E,,,,,,POINT (nan nan)
78996,388780984,41.0,BRONX,02/09/2019,16:00:00,02/09/2019,17:00:00,COMPLETED,,,...,,45-64,WHITE HISPANIC,M,,,,,,POINT (nan nan)
101246,506222419,41.0,BRONX,04/10/2018,07:30:00,04/14/2018,12:00:00,COMPLETED,,,...,,25-44,BLACK,M,,,,,,POINT (nan nan)
103584,436213844,110.0,QUEENS,01/18/2019,01:00:00,01/18/2019,06:00:00,COMPLETED,,,...,,45-64,BLACK,F,,,,,,POINT (nan nan)
103790,807134687,84.0,BROOKLYN,03/17/2019,09:51:00,03/17/2019,09:51:00,COMPLETED,,,...,,UNKNOWN,UNKNOWN,E,,,,,,POINT (nan nan)
114611,268323397,,,06/17/2019,08:37:00,,,COMPLETED,,,...,,45-64,WHITE HISPANIC,M,,,,,,POINT (nan nan)


In [33]:
# drop the offending rows
crime = crime.dropna(subset=['Latitude', 'Longitude'])
crime.shape # 20 fewer rows

(220978, 36)

In [34]:
# Create crime geodataframe
crime['geometry'] = crime.apply(lambda crime: Point((crime.Longitude, crime.Latitude)), axis=1)
crime_gdf = gpd.GeoDataFrame(crime, geometry = 'geometry')

In [35]:
crime_gdf.head(2)

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon,geometry
0,857927015,10.0,MANHATTAN,01/29/2019,16:37:00,01/29/2019,16:45:00,COMPLETED,,,...,,UNKNOWN,UNKNOWN,M,984140.0,211709.0,40.747777,-74.000398,"(40.747777093, -74.000398443)",POINT (-74.00039844299994 40.74777709300002)
1,479254687,101.0,QUEENS,03/29/2019,17:00:00,03/29/2019,17:10:00,COMPLETED,,,...,,25-44,BLACK,F,1054076.0,157437.0,40.598538,-73.74856,"(40.598537593, -73.748559596)",POINT (-73.74855959599995 40.59853759300007)


In [36]:
crime_gdf.crs = nta.crs # ensure they are on the same CRS projection
crime_nta = gpd.sjoin(crime_gdf, nta, op="intersects")

In [37]:
crime_nta.head(2)

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,Lat_Lon,geometry,index_right,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code
0,857927015,10.0,MANHATTAN,01/29/2019,16:37:00,01/29/2019,16:45:00,COMPLETED,,,...,"(40.747777093, -74.000398443)",POINT (-74.00039844299994 40.74777709300002),135,MN13,37068382.0503,61,Hudson Yards-Chelsea-Flatiron-Union Square,45693.0741371,Manhattan,1
17,560939215,13.0,MANHATTAN,01/11/2019,02:30:00,,,ATTEMPTED,,,...,"(40.739773188, -73.992986319)",POINT (-73.99298631899995 40.73977318800007),135,MN13,37068382.0503,61,Hudson Yards-Chelsea-Flatiron-Union Square,45693.0741371,Manhattan,1


In [42]:
# sanity check for spatial join
(crime_nta.BORO_NM == crime_nta.boro_name.str.upper()).value_counts()

# 705 rows with different boroughs specified?

True     220257
False       705
dtype: int64

In [51]:
error_df = crime_nta[['CMPLNT_FR_DT', 'BORO_NM', 'boro_name']][~(crime_nta.BORO_NM == crime_nta.boro_name.str.upper())]
error_df.head()
# some instances where crime dataset doesn't specify borough
# other instances where crime dataset

Unnamed: 0,CMPLNT_FR_DT,BORO_NM,boro_name
46145,01/31/2019,BROOKLYN,Queens
134463,05/19/2019,,Queens
116582,06/30/2019,,Bronx
7232,03/16/2019,,Manhattan
41651,01/19/2019,,Manhattan


In [52]:
# num of rows where crime dataset's borough is diff from nta dataset's borough
num_diff_boro = len(error_df[~error_df.BORO_NM.isnull()])
print(f"num of rows with different/wrong boroughs={num_diff_boro}")
print(f"that's {num_diff_boro/len(crime_nta)}% of dataset")

num of rows with different/wrong boroughs=576
that's 0.002606783066771662% of dataset


In [None]:
# do we want to discard those with diff boroughs?

In [55]:
# save gdf as csv 
crime_nta.to_csv('data/Crime_NTA.csv')
# crime_nta.to_file("data/crime_NTA.geojson", driver='GeoJSON') # if we wanna save as geojson