In [46]:
from shapely.geometry import asShape, Point
import pandas as pd
import numpy as np
import geopandas as gpd
df_geo_list = gpd.read_file('data/geo_list.json')
geo_list = df_geo_list.to_dict('records')
len(geo_list)

158

In [65]:
import pandas as pd
# load crime data from saved file
df_crime_raw = pd.read_csv('data/df_crime_raw.csv')
df_crime = gpd.GeoDataFrame(
    df_crime_raw, geometry=gpd.points_from_xy(df_crime_raw.lng, df_crime_raw.lat))

In [66]:
df_crime = df_crime[['cartodb_id', 'dispatch_date', 'text_general_code', 'geometry']]
df_crime.rename(columns= {'cartodb_id': 'id', 'dispatch_date': 'date', 'text_general_code': 'type'}, inplace= True)
print(df_crime.shape)
df_crime.head(1)

(361670, 4)


Unnamed: 0,id,date,type,geometry
0,38,2019-11-12,Thefts,POINT (-75.06663 40.04926)


In [86]:
print(df_crime.shape)
# df_crime_test = df_crime.loc[0:100000]
df_crime_test = df_crime
print(df_crime_test.shape)

(361670, 4)
(361670, 4)


In [68]:
# find neighborhood for crime data from longitude, latitude
def get_neighborhood_from_geo(point):
    # lat, lng = map(np.deg2rad, [latitude, longitude])
    for row in geo_list:
        # shape = asShape(row.geometry)
        # point = Point(lng, lat) # longitude, latitude
        if asShape(row['geometry']).contains(point):
            return row['neighborhood']

## 1. Pandas dataframe with iterrows, itertuples, iteritems

In [69]:
%%time
neigh_list=[]
for row in df_crime_test.itertuples():
     neigh_list.append(get_neighborhood_from_geo(row.geometry))
df_crime_test['neighborhood'] = neigh_list
print(df_crime_test.shape)
df_crime_test.head(2)

(1001, 5)
CPU times: user 22.6 s, sys: 506 ms, total: 23.1 s
Wall time: 23.5 s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


Unnamed: 0,id,date,type,geometry,neighborhood
0,38,2019-11-12,Thefts,POINT (-75.06663 40.04926),Oxford Circle
1,46,2019-01-19,Thefts,POINT (-75.16145 39.96233),Spring Garden


## 2. The `apply` method with `lambdas`

In [70]:
%%time
df_crime_test.drop('neighborhood', axis=1, inplace=True)
# loop by lambdas
df_crime_test['neighborhood'] = df_crime_test.apply(lambda x: get_neighborhood_from_geo(x['geometry']), axis=1)
print(df_crime_test.shape)
df_crime_test.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
(1001, 5)
CPU times: user 21.9 s, sys: 425 ms, total: 22.3 s
Wall time: 22.6 s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


Unnamed: 0,id,date,type,geometry,neighborhood
0,38,2019-11-12,Thefts,POINT (-75.06663 40.04926),Oxford Circle
1,46,2019-01-19,Thefts,POINT (-75.16145 39.96233),Spring Garden


## 3. Pandas Vectorization 

In [71]:
%%time
df_crime_test.drop('neighborhood', axis=1, inplace=True)
df_crime_test['neighborhood'] = df_crime_test['geometry'].apply(lambda x: get_neighborhood_from_geo(x))
df_crime_test.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
CPU times: user 23.5 s, sys: 579 ms, total: 24.1 s
Wall time: 24.5 s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


Unnamed: 0,id,date,type,geometry,neighborhood
0,38,2019-11-12,Thefts,POINT (-75.06663 40.04926),Oxford Circle
1,46,2019-01-19,Thefts,POINT (-75.16145 39.96233),Spring Garden


In [87]:
%%time
df_join=gpd.sjoin(df_crime_test, df_geo_list, how='left',op="within")

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

CPU times: user 12.6 s, sys: 393 ms, total: 13 s
Wall time: 13.2 s


In [88]:
print(df_join.shape)
df_join.head(2)

(361670, 6)


Unnamed: 0,id,date,type,geometry,index_right,neighborhood
0,38,2019-11-12,Thefts,POINT (-75.06663 40.04926),98.0,Oxford Circle
1,46,2019-01-19,Thefts,POINT (-75.16145 39.96233),123.0,Spring Garden


In [89]:
len(df_join[df_join["neighborhood"].isnull()])

4145