In [23]:
import pandas as pd

In [None]:
# TODO: make a combined dataset for NYRP and NYC greenthumb

In [24]:
gardens = pd.read_csv('../data/NYC_Greenthumb_Community_Gardens.csv')
gardens.head(2)

# good to know that the dataset already has NTA name and borough

Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,Latitude,Longitude,Postcode,Census Tract,BIN,BBL,NTA
0,,M,M03,2.0,11 BC Serenity Garden,626 East 11th Street,0.054,DPR,East Village,Avenues B & C,40.727124,-73.978677,10009.0,28.0,1000000.0,1003930020.0,Lower East Side ...
1,,B,B08,36.0,1100 Bergen Street Community Garden,1101 & 1105 Bergen Street,0.207,PRI,Crown Heights,Nostrand & New York Avenues,,,,,,,


Data cleaning 1: remove whitespace in NTA column

In [25]:
# before cleaning
gardens.NTA.unique()[:5] # check first 5 elements

array(['Lower East Side                                                            ',
       nan,
       'East Harlem South                                                          ',
       'East Harlem North                                                          ',
       'East Village                                                               '],
      dtype=object)

In [26]:
# remove trailing whitespace in NTA names
gardens['NTA'] = gardens.NTA.str.strip() 
gardens.NTA.unique()[:5] 

array(['Lower East Side', nan, 'East Harlem South', 'East Harlem North',
       'East Village'], dtype=object)

Data cleaning 2: replace boro column with actual names

In [27]:
# before cleaning
gardens.Boro.unique()

array(['M', 'B', 'X', 'Q', 'R'], dtype=object)

In [28]:
# replace boro initial with actual borough name
boroughs = {
    'M': 'Manhattan', 
    'B': 'Brooklyn',
    'Q': 'Queens',
    'X': 'Bronx',
    'R': 'Staten Island'
}

gardens['Boro'].replace(boroughs, inplace=True)
gardens.head(2)

Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,Latitude,Longitude,Postcode,Census Tract,BIN,BBL,NTA
0,,Manhattan,M03,2.0,11 BC Serenity Garden,626 East 11th Street,0.054,DPR,East Village,Avenues B & C,40.727124,-73.978677,10009.0,28.0,1000000.0,1003930020.0,Lower East Side
1,,Brooklyn,B08,36.0,1100 Bergen Street Community Garden,1101 & 1105 Bergen Street,0.207,PRI,Crown Heights,Nostrand & New York Avenues,,,,,,,


Data cleaning 3: check if NTA column in this dataset is an accurate subset of NTA dataset

In [29]:
import geopandas as gpd
nta = gpd.read_file('../data/Neighborhood Tabulation Areas.geojson')
nta.head(2)

Unnamed: 0,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code,geometry
0,BK88,54005018.7472,47,Borough Park,39247.2280737,Brooklyn,3,(POLYGON ((-73.97604935657381 40.6312759056467...
1,QN51,52488276.477,81,Murray Hill,33266.904811,Queens,4,(POLYGON ((-73.80379022888246 40.7756101117924...


In [30]:
nta.ntaname.unique()[:5] # check first 5 elements, no whitespace

array(['Borough Park', 'Murray Hill', 'East Elmhurst', 'Hollis',
       'Manhattanville'], dtype=object)

In [31]:
# not all NTAs in the community gardens dataset is inside the NTA dataset
garden_set = set(gardens.NTA.unique())
nta_set = set(nta.ntaname.unique())
garden_set.issubset(nta_set)

False

In [32]:
# find out which NTAs are in gardens but not in NTA...turns out to be relatively trivial
garden_set.difference(nta_set)

{'', nan}

In [33]:
# check how many records have nta == '' or nan
invalid_num = len(gardens[gardens.NTA == '']) + len(gardens[gardens.NTA.isnull()])
print(f"{invalid_num} data records with invalid NTAs")
print(f"that's {round(invalid_num/len(gardens), 2)*100}% of the dataset")

107 data records with invalid NTAs
that's 20.0% of the dataset


Data cleaning 4: fill in missing/null NTA info

In [98]:
gardens[gardens.NTA.isnull()].shape
# 105 gardens without nta info

(105, 17)

Data cleaning 5: combining nyrp dataset with greenthumb dataset

In [39]:
nyrp = pd.read_csv('../data/NYRP_NTA.csv', index_col=0)
nyrp.head(2)

Unnamed: 0,address,coords,lat,long,borough,geometry,index_right,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code
0,735 East 211th St,"40.877499,-73.863489",40.877499,-73.863489,Bronx,POINT (-73.863489 40.877499),75,BX44,36273600.0,5,Williamsbridge-Olinville,27351.077379,Bronx,2
1,1818 Bathgate Ave,"40.845051,-73.897747",40.845051,-73.897747,Bronx,POINT (-73.897747 40.845051),128,BX01,16451620.0,5,Claremont-Bathgate,29972.77772,Bronx,2


In [40]:
# rename columns for concat
nyrp.rename(columns={
    'address': 'Address',
    'lat': 'Latitude',
    'long': 'Longitude',
    'borough': 'Borough',
    'ntaname': 'NTA',
    'ntacode': 'NTAcode'
}, inplace=True) 

In [43]:
nyrp.shape

(56, 14)

In [45]:
gardens[gardens.NTA.notnull()].shape

(431, 17)

In [77]:
cols = ['Address', 'Latitude', 'Longitude', 'NTA']
gardens_combined = pd.concat([nyrp[cols], gardens[gardens.NTA.notnull()][cols]], ignore_index=True)
gardens_combined.head()

Unnamed: 0,Address,Latitude,Longitude,NTA
0,735 East 211th St,40.877499,-73.863489,Williamsbridge-Olinville
1,1818 Bathgate Ave,40.845051,-73.897747,Claremont-Bathgate
2,1017 Teller Ave,40.82815,-73.914356,East Concourse-Concourse Village
3,1328 Clay Ave,40.834934,-73.908892,East Concourse-Concourse Village
4,2044 Prospect Ave,40.845818,-73.887851,East Tremont


In [47]:
len(gardens_combined) == len(gardens[gardens.NTA.notnull()]) + len(nyrp)

True

Data cleaning: remove entries with same address

In [91]:
# find out how many duplicates there are
gardens_combined.Address.str.lower().value_counts().value_counts()
# 7 duplicates 

1    473
2      7
Name: Address, dtype: int64

In [90]:
# rows that have duplicates
gardens_combined[gardens_combined.duplicated(subset='Address', keep=False)]

Unnamed: 0,Address,Latitude,Longitude,NTA
144,3003 Seagirt Boulevard.,40.595964,-73.763133,Hammels-Arverne-Edgemere
161,429-433 East 117th Street,40.796357,-73.934119,East Harlem North
164,52 W 129th Street,40.809753,-73.94231,Central Harlem North-Polo Grounds
168,624-638 East 138th Street,40.806094,-73.915774,Mott Haven-Port Morris
169,624-638 East 138th Street,40.806094,-73.915774,Mott Haven-Port Morris
190,2592-2597 Bainbridge Avenue,40.863711,-73.892599,Bedford Park-Fordham North
191,2592-2597 Bainbridge Avenue,40.863711,-73.892599,Bedford Park-Fordham North
223,52 W 129th Street,40.809753,-73.94231,Central Harlem North-Polo Grounds
232,415-421 East 117th Street,40.79645,-73.934347,East Harlem North
321,953 Gates Avenue,40.688836,-73.92789,Stuyvesant Heights


In [95]:
gardens_combined.drop_duplicates(subset='Address', keep='first', inplace=True)
gardens_combined.shape

(480, 4)

In [96]:
# save dataset
gardens_combined.to_csv('../data/NYC_Community_Gardens_combined.csv')

In [None]:
# TODO: geoencode addresses to NTAs
# TODO: concat open data dataset to nyrp dataset

In [27]:
# i wanna remove rows that don't have address information
gardens[gardens.Address.isnull()]

Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,Latitude,Longitude,Postcode,Census Tract,BIN,BBL,NTA
150,,B,B05,42.0,Euclid Pine Block Association,,0.071,TPL,East New York,Aldride & Bay Vew,,,,,,,


In [30]:
gardens.dropna(subset=['Address'], inplace=True)

In [32]:
len(gardens[gardens.NTA.isnull()]) # i need to geocode at least 104 records

104

In [46]:
rows_to_geocode = gardens.NTA.isnull()
rows_to_geocode.head()

0    False
1     True
2    False
3    False
4    False
Name: NTA, dtype: bool

In [54]:
gardens_geocode = gardens[rows_to_geocode]
gardens_geocode.head()

Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,Latitude,Longitude,Postcode,Census Tract,BIN,BBL,NTA
1,,B,B08,36.0,1100 Bergen Street Community Garden,1101 & 1105 Bergen Street,0.207,PRI,Crown Heights,Nostrand & New York Avenues,,,,,,,
5,,X,X12,15.0,211th Street Block Association.,Carlisle Place,0.182,NYRP,,At E. 211th Street,,,,,,,
7,B535,B,B05,42.0,400 Montauk Avenue Block Association. (Ismael ...,New Lots Avenue,0.091,DPR,East New York,Active,,,,,,,
10,,B,B07,0.0,64th Street Community Garden,64th Street,,DOT,Sunset Park,West of 4th Avenue,,,,,,,
17,B507,B,B03,41.0,A Better Community Garden,762-764 Herkimer Place/13-21 Hunterfly,0.044,DPR,Bed-Stuy,Active,,,,,,,


In [95]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

locator = Nominatim(user_agent="myGeocoder",format_string="%s, NY")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1) # limit the rate at which i'm making api requests, so i don't get a service timeout error


In [96]:
gardens_geocode['geo_obj'] = gardens_geocode.Address.apply(geocode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [77]:
# extract lat, long
gardens_geocode['point'] = gardens_geocode['geo_obj'].apply(lambda loc: tuple(loc.point[:2]) if loc else None)
gardens_geocode.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,Latitude,Longitude,Postcode,Census Tract,BIN,BBL,NTA,geo_obj,point,location
1,,B,B08,36.0,1100 Bergen Street Community Garden,1101 & 1105 Bergen Street,0.207,PRI,Crown Heights,Nostrand & New York Avenues,,,,,,,,"(1101, Bergen Street, Newark, Essex County, Ne...","(40.706084, -74.210105)","(1100 Bergen Street Community Garden, Bergen S..."
5,,X,X12,15.0,211th Street Block Association.,Carlisle Place,0.182,NYRP,,At E. 211th Street,,,,,,,,"(Carlisle Place, Northbrook Park, Spartanburg ...","(35.05312485, -81.9415698018005)",
7,B535,B,B05,42.0,400 Montauk Avenue Block Association. (Ismael ...,New Lots Avenue,0.091,DPR,East New York,Active,,,,,,,,"(New Lots Avenue, Van Sinderen Avenue, Brownsv...","(40.6589611, -73.8992787)",
10,,B,B07,0.0,64th Street Community Garden,64th Street,,DOT,Sunset Park,West of 4th Avenue,,,,,,,,"(64th Street, uMngeni Ward 8, uMgeni Local Mun...","(-29.5704293, 30.1872218)",
17,B507,B,B03,41.0,A Better Community Garden,762-764 Herkimer Place/13-21 Hunterfly,0.044,DPR,Bed-Stuy,Active,,,,,,,,,,


In [81]:
unsuccessful_gardens = gardens_geocode[gardens_geocode.point.isnull()]
unsuccessful_gardens.shape
# 43 gardens whose addresses couldn't be geocoded

(43, 20)

In [82]:
gardens_geocode.dropna(subset=['point'], inplace=True)
gardens_geocode.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


(61, 20)

In [92]:
# spatial join w nta
# first, convert to dataframe
from shapely.geometry import Point
import geopandas as gpd

gardens_geocode['geometry'] = gardens_geocode.point.apply(Point)
gardens_geocode_gdf = gpd.GeoDataFrame(gardens_geocode, geometry='geometry')
gardens_geocode_gdf.head()


# set projection for gardens to a compatible one
# gardens_geocode_gdf.crs = nta.crs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,...,Longitude,Postcode,Census Tract,BIN,BBL,NTA,geo_obj,point,location,geometry
1,,B,B08,36.0,1100 Bergen Street Community Garden,1101 & 1105 Bergen Street,0.207,PRI,Crown Heights,Nostrand & New York Avenues,...,,,,,,,"(1101, Bergen Street, Newark, Essex County, Ne...","(40.706084, -74.210105)","(1100 Bergen Street Community Garden, Bergen S...",POINT (40.706084 -74.210105)
5,,X,X12,15.0,211th Street Block Association.,Carlisle Place,0.182,NYRP,,At E. 211th Street,...,,,,,,,"(Carlisle Place, Northbrook Park, Spartanburg ...","(35.05312485, -81.9415698018005)",,POINT (35.05312485 -81.9415698018005)
7,B535,B,B05,42.0,400 Montauk Avenue Block Association. (Ismael ...,New Lots Avenue,0.091,DPR,East New York,Active,...,,,,,,,"(New Lots Avenue, Van Sinderen Avenue, Brownsv...","(40.6589611, -73.8992787)",,POINT (40.6589611 -73.8992787)
10,,B,B07,0.0,64th Street Community Garden,64th Street,,DOT,Sunset Park,West of 4th Avenue,...,,,,,,,"(64th Street, uMngeni Ward 8, uMgeni Local Mun...","(-29.5704293, 30.1872218)",,POINT (-29.5704293 30.1872218)
19,,B,B04,37.0,Aberdeen Street II Garden Projects,Aberdeen Street,0.253,NYRP,,,...,,,,,,,"(Aberdeen Street, Lee Gardens, Overland Park, ...","(38.952041, -94.621602)",,POINT (38.952041 -94.621602)
