In [1]:
import pandas as pd

In [2]:
# TODO: make a combined dataset for NYRP and NYC greenthumb

In [3]:
gardens = pd.read_csv('../data/NYC_Greenthumb_Community_Gardens.csv')
gardens.head(2)

# good to know that the dataset already has NTA name and borough

Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,Latitude,Longitude,Postcode,Census Tract,BIN,BBL,NTA
0,,M,M03,2.0,11 BC Serenity Garden,626 East 11th Street,0.054,DPR,East Village,Avenues B & C,40.727124,-73.978677,10009.0,28.0,1000000.0,1003930020.0,Lower East Side ...
1,,B,B08,36.0,1100 Bergen Street Community Garden,1101 & 1105 Bergen Street,0.207,PRI,Crown Heights,Nostrand & New York Avenues,,,,,,,


Data cleaning 1: remove whitespace in NTA column

In [4]:
# before cleaning
gardens.NTA.unique()[:5] # check first 5 elements

array(['Lower East Side                                                            ',
       nan,
       'East Harlem South                                                          ',
       'East Harlem North                                                          ',
       'East Village                                                               '],
      dtype=object)

In [5]:
# remove trailing whitespace in NTA names
gardens['NTA'] = gardens.NTA.str.strip() 
gardens.NTA.unique()[:5] 

array(['Lower East Side', nan, 'East Harlem South', 'East Harlem North',
       'East Village'], dtype=object)

Data cleaning 2: replace boro column with actual names

In [6]:
# before cleaning
gardens.Boro.unique()

array(['M', 'B', 'X', 'Q', 'R'], dtype=object)

In [7]:
# replace boro initial with actual borough name
boroughs = {
    'M': 'Manhattan', 
    'B': 'Brooklyn',
    'Q': 'Queens',
    'X': 'Bronx',
    'R': 'Staten Island'
}

gardens['Boro'].replace(boroughs, inplace=True)
gardens.head(2)

Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,Latitude,Longitude,Postcode,Census Tract,BIN,BBL,NTA
0,,Manhattan,M03,2.0,11 BC Serenity Garden,626 East 11th Street,0.054,DPR,East Village,Avenues B & C,40.727124,-73.978677,10009.0,28.0,1000000.0,1003930020.0,Lower East Side
1,,Brooklyn,B08,36.0,1100 Bergen Street Community Garden,1101 & 1105 Bergen Street,0.207,PRI,Crown Heights,Nostrand & New York Avenues,,,,,,,


Data cleaning 3: check if NTA column in this dataset is an accurate subset of NTA dataset

In [8]:
import geopandas as gpd
nta = gpd.read_file('../data/Neighborhood Tabulation Areas.geojson')
nta.head(2)

Unnamed: 0,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code,geometry
0,BK88,54005018.7472,47,Borough Park,39247.2280737,Brooklyn,3,(POLYGON ((-73.97604935657381 40.6312759056467...
1,QN51,52488276.477,81,Murray Hill,33266.904811,Queens,4,(POLYGON ((-73.80379022888246 40.7756101117924...


In [9]:
nta.ntaname.unique()[:5] # check first 5 elements, no whitespace

array(['Borough Park', 'Murray Hill', 'East Elmhurst', 'Hollis',
       'Manhattanville'], dtype=object)

In [10]:
# not all NTAs in the community gardens dataset is inside the NTA dataset
garden_set = set(gardens.NTA.unique())
nta_set = set(nta.ntaname.unique())
garden_set.issubset(nta_set)

False

In [11]:
# find out which NTAs are in gardens but not in NTA...turns out to be relatively trivial
garden_set.difference(nta_set)

{'', nan}

In [12]:
# check how many records have nta == '' or nan
invalid_num = len(gardens[gardens.NTA == '']) + len(gardens[gardens.NTA.isnull()])
print(f"{invalid_num} data records with invalid NTAs")
print(f"that's {round(invalid_num/len(gardens), 2)*100}% of the dataset")

107 data records with invalid NTAs
that's 20.0% of the dataset


Data cleaning 4: fill in missing/null NTA info

In [13]:
# gardens to geocode
data = gardens[(gardens.Address.notnull()) & (gardens.NTA.isnull())]
data.shape

(104, 17)

In [14]:
data.head()

Unnamed: 0,PropID,Boro,Community Board,Council District,Garden Name,Address,Size,Jurisdiction,NeighborhoodName,Cross Streets,Latitude,Longitude,Postcode,Census Tract,BIN,BBL,NTA
1,,Brooklyn,B08,36.0,1100 Bergen Street Community Garden,1101 & 1105 Bergen Street,0.207,PRI,Crown Heights,Nostrand & New York Avenues,,,,,,,
5,,Bronx,X12,15.0,211th Street Block Association.,Carlisle Place,0.182,NYRP,,At E. 211th Street,,,,,,,
7,B535,Brooklyn,B05,42.0,400 Montauk Avenue Block Association. (Ismael ...,New Lots Avenue,0.091,DPR,East New York,Active,,,,,,,
10,,Brooklyn,B07,0.0,64th Street Community Garden,64th Street,,DOT,Sunset Park,West of 4th Avenue,,,,,,,
17,B507,Brooklyn,B03,41.0,A Better Community Garden,762-764 Herkimer Place/13-21 Hunterfly,0.044,DPR,Bed-Stuy,Active,,,,,,,


In [15]:
pd.DataFrame.apply?

In [16]:
data['Full_Address'] = data.Address + ", " + data.Boro + ", NY"
data = data[['Boro', 'Address', 'Full_Address', 'NTA']]
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Boro,Address,Full_Address,NTA
1,Brooklyn,1101 & 1105 Bergen Street,"1101 & 1105 Bergen Street, Brooklyn, NY",
5,Bronx,Carlisle Place,"Carlisle Place, Bronx, NY",
7,Brooklyn,New Lots Avenue,"New Lots Avenue, Brooklyn, NY",
10,Brooklyn,64th Street,"64th Street, Brooklyn, NY",
17,Brooklyn,762-764 Herkimer Place/13-21 Hunterfly,"762-764 Herkimer Place/13-21 Hunterfly, Brookl...",


In [17]:
# geocode using Full_Address

In [18]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1) # limit the rate at which i'm making api requests, so i don't get a service timeout error

In [20]:
data['Point'] = data.Full_Address.apply(geocode).apply(lambda loc: tuple([loc.point[1], loc.point[0]]) if loc else None)
data.head()

RateLimiter caught an error, retrying (0/2 tries). Called with (*('West 153rd Street, Manhattan, NY',), **{}).
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.f

Unnamed: 0,Boro,Address,Full_Address,NTA,Point
1,Brooklyn,1101 & 1105 Bergen Street,"1101 & 1105 Bergen Street, Brooklyn, NY",,"(-73.949439375, 40.67615225)"
5,Bronx,Carlisle Place,"Carlisle Place, Bronx, NY",,"(-73.863012, 40.878598)"
7,Brooklyn,New Lots Avenue,"New Lots Avenue, Brooklyn, NY",,"(-73.8992787, 40.6589611)"
10,Brooklyn,64th Street,"64th Street, Brooklyn, NY",,"(-74.0231939, 40.6400567)"
17,Brooklyn,762-764 Herkimer Place/13-21 Hunterfly,"762-764 Herkimer Place/13-21 Hunterfly, Brookl...",,


In [21]:
len(data[data.Point.isnull()]) # still 55 addresses that cant be geocoded

55

In [22]:
# check that those successfully geocoded are within new york
success = data[data.Point.notnull()]
success.head()

Unnamed: 0,Boro,Address,Full_Address,NTA,Point
1,Brooklyn,1101 & 1105 Bergen Street,"1101 & 1105 Bergen Street, Brooklyn, NY",,"(-73.949439375, 40.67615225)"
5,Bronx,Carlisle Place,"Carlisle Place, Bronx, NY",,"(-73.863012, 40.878598)"
7,Brooklyn,New Lots Avenue,"New Lots Avenue, Brooklyn, NY",,"(-73.8992787, 40.6589611)"
10,Brooklyn,64th Street,"64th Street, Brooklyn, NY",,"(-74.0231939, 40.6400567)"
19,Brooklyn,Aberdeen Street,"Aberdeen Street, Brooklyn, NY",,"(-73.9055714, 40.6824518)"


In [23]:
from shapely.geometry import Point
import geopandas as gpd

def make_gdf(df, point_col_name='Point'): # eg data.Point
    df['geometry'] = df[point_col_name].apply(Point)
    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    return gdf

In [24]:
# spatial join w nta
# first, convert to dataframe

gdf = make_gdf(data[data.Point.notnull()])
gdf.head()

# set projection for gardens to a compatible one
# gardens_geocode_gdf.crs = nta.crs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,Boro,Address,Full_Address,NTA,Point,geometry
1,Brooklyn,1101 & 1105 Bergen Street,"1101 & 1105 Bergen Street, Brooklyn, NY",,"(-73.949439375, 40.67615225)",POINT (-73.949439375 40.67615225)
5,Bronx,Carlisle Place,"Carlisle Place, Bronx, NY",,"(-73.863012, 40.878598)",POINT (-73.863012 40.878598)
7,Brooklyn,New Lots Avenue,"New Lots Avenue, Brooklyn, NY",,"(-73.8992787, 40.6589611)",POINT (-73.8992787 40.6589611)
10,Brooklyn,64th Street,"64th Street, Brooklyn, NY",,"(-74.0231939, 40.6400567)",POINT (-74.0231939 40.6400567)
19,Brooklyn,Aberdeen Street,"Aberdeen Street, Brooklyn, NY",,"(-73.9055714, 40.6824518)",POINT (-73.9055714 40.6824518)


In [25]:
def join_nta(nta, gdf_to_join):
    gdf_to_join.crs = nta.crs
    return gpd.sjoin(gdf_to_join, nta, op='intersects')

In [26]:
joined = join_nta(nta, gdf)
joined.head()

Unnamed: 0,Boro,Address,Full_Address,NTA,Point,geometry,index_right,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code
1,Brooklyn,1101 & 1105 Bergen Street,"1101 & 1105 Bergen Street, Brooklyn, NY",,"(-73.949439375, 40.67615225)",POINT (-73.949439375 40.67615225),30,BK61,51619074.188,47,Crown Heights North,35635.5428561,Brooklyn,3
5,Bronx,Carlisle Place,"Carlisle Place, Bronx, NY",,"(-73.863012, 40.878598)",POINT (-73.863012 40.878598),75,BX44,36273595.3423,5,Williamsbridge-Olinville,27351.0773787,Bronx,2
7,Brooklyn,New Lots Avenue,"New Lots Avenue, Brooklyn, NY",,"(-73.8992787, 40.6589611)",POINT (-73.8992787 40.6589611),80,BK81,32719366.0874,47,Brownsville,27298.8512441,Brooklyn,3
208,Brooklyn,New Lots Avenue,"New Lots Avenue, Brooklyn, NY",,"(-73.8992787, 40.6589611)",POINT (-73.8992787 40.6589611),80,BK81,32719366.0874,47,Brownsville,27298.8512441,Brooklyn,3
431,Brooklyn,Powell Street,"Powell Street, Brooklyn, NY",,"(-73.90319, 40.666592)",POINT (-73.90319 40.666592),80,BK81,32719366.0874,47,Brownsville,27298.8512441,Brooklyn,3


In [30]:
joined.ntaname.isnull().value_counts() # yay all were spatially joined

False    49
Name: ntaname, dtype: int64

In [31]:
joined['Latitude'] = joined.geometry.x
joined['Longitude'] = joined.geometry.y
joined.head() 

Unnamed: 0,Boro,Address,Full_Address,NTA,Point,geometry,index_right,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code,Latitude,Longitude
1,Brooklyn,1101 & 1105 Bergen Street,"1101 & 1105 Bergen Street, Brooklyn, NY",,"(-73.949439375, 40.67615225)",POINT (-73.949439375 40.67615225),30,BK61,51619074.188,47,Crown Heights North,35635.5428561,Brooklyn,3,-73.949439,40.676152
5,Bronx,Carlisle Place,"Carlisle Place, Bronx, NY",,"(-73.863012, 40.878598)",POINT (-73.863012 40.878598),75,BX44,36273595.3423,5,Williamsbridge-Olinville,27351.0773787,Bronx,2,-73.863012,40.878598
7,Brooklyn,New Lots Avenue,"New Lots Avenue, Brooklyn, NY",,"(-73.8992787, 40.6589611)",POINT (-73.8992787 40.6589611),80,BK81,32719366.0874,47,Brownsville,27298.8512441,Brooklyn,3,-73.899279,40.658961
208,Brooklyn,New Lots Avenue,"New Lots Avenue, Brooklyn, NY",,"(-73.8992787, 40.6589611)",POINT (-73.8992787 40.6589611),80,BK81,32719366.0874,47,Brownsville,27298.8512441,Brooklyn,3,-73.899279,40.658961
431,Brooklyn,Powell Street,"Powell Street, Brooklyn, NY",,"(-73.90319, 40.666592)",POINT (-73.90319 40.666592),80,BK81,32719366.0874,47,Brownsville,27298.8512441,Brooklyn,3,-73.90319,40.666592


In [33]:
joined = joined[['Address', 'Latitude', 'Longitude', 'NTA']]
joined.head() # yay!

Unnamed: 0,Address,Latitude,Longitude,NTA
1,1101 & 1105 Bergen Street,-73.949439,40.676152,
5,Carlisle Place,-73.863012,40.878598,
7,New Lots Avenue,-73.899279,40.658961,
208,New Lots Avenue,-73.899279,40.658961,
431,Powell Street,-73.90319,40.666592,


Investigate the cases where geocoding failed

In [53]:
# check that those successfully geocoded are within new york
failures = data[data.Point.isnull()]
failures.head()

Unnamed: 0,Boro,Address,Full_Address,NTA,Point
17,Brooklyn,762-764 Herkimer Place/13-21 Hunterfly,"762-764 Herkimer Place/13-21 Hunterfly, Brookl...",,
21,Bronx,1980 Lafayette Avenue/Stickball Avenue,"1980 Lafayette Avenue/Stickball Avenue, Bronx, NY",,
30,Queens,35th St. Bet. 35th & 36th Aves,"35th St. Bet. 35th & 36th Aves, Queens, NY",,
41,Brooklyn,93-95 Malcolm X Avenue,"93-95 Malcolm X Avenue, Brooklyn, NY",,
47,Bronx,in Bissel Street,"in Bissel Street, Bronx, NY",,


In [54]:
failures.shape

(55, 5)

In [55]:
failures.to_csv('failures.csv')

In [62]:
failures = pd.read_csv('failures_cleaned.csv', index_col=0)
failures.head()

Unnamed: 0,Boro,Address,Full_Address,NTA,Point
17,Brooklyn,762-764 Herkimer Place/13-21 Hunterfly,"762-764 Herkimer Place/13-21 Hunterfly, Brookl...",,"40.678030, -73.927007"
21,Bronx,1980 Lafayette Avenue/Stickball Avenue,"1980 Lafayette Avenue/Stickball Avenue, Bronx, NY",,"40.821774, -73.854449"
30,Queens,35th St. Bet. 35th & 36th Aves,"35th St. Bet. 35th & 36th Aves, Queens, NY",,"40.756476, -73.923622"
41,Brooklyn,93-95 Malcolm X Avenue,"93-95 Malcolm X Avenue, Brooklyn, NY",,"40.690746, -73.929372"
101,Manhattan,320 96th Street NY NY 10128,320 96th Street NY NY 10128,,"40.783460, -73.945095"


In [66]:
def split_pt(s):
    temp = s.split(', ')
    return tuple([float(temp[1]), float(temp[0])]) # long, lat

failures['Point2'] = failures.Point.apply(split_pt)
failures.head()

Unnamed: 0,Boro,Address,Full_Address,NTA,Point,Point2
17,Brooklyn,762-764 Herkimer Place/13-21 Hunterfly,"762-764 Herkimer Place/13-21 Hunterfly, Brookl...",,"40.678030, -73.927007","(-73.927007, 40.67803)"
21,Bronx,1980 Lafayette Avenue/Stickball Avenue,"1980 Lafayette Avenue/Stickball Avenue, Bronx, NY",,"40.821774, -73.854449","(-73.854449, 40.821774)"
30,Queens,35th St. Bet. 35th & 36th Aves,"35th St. Bet. 35th & 36th Aves, Queens, NY",,"40.756476, -73.923622","(-73.923622, 40.756476)"
41,Brooklyn,93-95 Malcolm X Avenue,"93-95 Malcolm X Avenue, Brooklyn, NY",,"40.690746, -73.929372","(-73.929372, 40.690746)"
101,Manhattan,320 96th Street NY NY 10128,320 96th Street NY NY 10128,,"40.783460, -73.945095","(-73.945095, 40.78346)"


In [67]:
gdf = make_gdf(failures, 'Point2')
gdf.head()

Unnamed: 0,Boro,Address,Full_Address,NTA,Point,Point2,geometry
17,Brooklyn,762-764 Herkimer Place/13-21 Hunterfly,"762-764 Herkimer Place/13-21 Hunterfly, Brookl...",,"40.678030, -73.927007","(-73.927007, 40.67803)",POINT (-73.927007 40.67803)
21,Bronx,1980 Lafayette Avenue/Stickball Avenue,"1980 Lafayette Avenue/Stickball Avenue, Bronx, NY",,"40.821774, -73.854449","(-73.854449, 40.821774)",POINT (-73.854449 40.821774)
30,Queens,35th St. Bet. 35th & 36th Aves,"35th St. Bet. 35th & 36th Aves, Queens, NY",,"40.756476, -73.923622","(-73.923622, 40.756476)",POINT (-73.92362199999999 40.756476)
41,Brooklyn,93-95 Malcolm X Avenue,"93-95 Malcolm X Avenue, Brooklyn, NY",,"40.690746, -73.929372","(-73.929372, 40.690746)",POINT (-73.929372 40.690746)
101,Manhattan,320 96th Street NY NY 10128,320 96th Street NY NY 10128,,"40.783460, -73.945095","(-73.945095, 40.78346)",POINT (-73.94509499999999 40.78346)


In [73]:
joined2 = join_nta(nta, gdf)
joined2.head()

Unnamed: 0,Boro,Address,Full_Address,NTA,Point,Point2,geometry,index_right,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code
17,Brooklyn,762-764 Herkimer Place/13-21 Hunterfly,"762-764 Herkimer Place/13-21 Hunterfly, Brookl...",,"40.678030, -73.927007","(-73.927007, 40.67803)",POINT (-73.927007 40.67803),30,BK61,51619074.188,47,Crown Heights North,35635.5428561,Brooklyn,3
133,Brooklyn,"Eastern Pkwy.,W/O Franklin","Eastern Pkwy.,W/O Franklin, Brooklyn, NY",,"40.670787, -73.954455","(-73.954455, 40.670787)",POINT (-73.954455 40.670787),30,BK61,51619074.188,47,Crown Heights North,35635.5428561,Brooklyn,3
426,Brooklyn,Hunterfly Place and Atlantic Avenue,"Hunterfly Place and Atlantic Avenue, Brooklyn, NY",,"40.677592, -73.927609","(-73.927609, 40.677592)",POINT (-73.927609 40.677592),30,BK61,51619074.188,47,Crown Heights North,35635.5428561,Brooklyn,3
21,Bronx,1980 Lafayette Avenue/Stickball Avenue,"1980 Lafayette Avenue/Stickball Avenue, Bronx, NY",,"40.821774, -73.854449","(-73.854449, 40.821774)",POINT (-73.854449 40.821774),111,BX09,51983796.8159,5,Soundview-Castle Hill-Clason Point-Harding Park,67340.9802906,Bronx,2
30,Queens,35th St. Bet. 35th & 36th Aves,"35th St. Bet. 35th & 36th Aves, Queens, NY",,"40.756476, -73.923622","(-73.923622, 40.756476)",POINT (-73.92362199999999 40.756476),161,QN70,39335513.5655,81,Astoria,32534.7380486,Queens,4


In [74]:
joined2['Latitude'] = joined2.geometry.x
joined2['Longitude'] = joined2.geometry.y
joined2 = joined2[['Address', 'Latitude', 'Longitude', 'ntaname']]
joined2.head()

Unnamed: 0,Address,Latitude,Longitude,ntaname
17,762-764 Herkimer Place/13-21 Hunterfly,-73.927007,40.67803,Crown Heights North
133,"Eastern Pkwy.,W/O Franklin",-73.954455,40.670787,Crown Heights North
426,Hunterfly Place and Atlantic Avenue,-73.927609,40.677592,Crown Heights North
21,1980 Lafayette Avenue/Stickball Avenue,-73.854449,40.821774,Soundview-Castle Hill-Clason Point-Harding Park
30,35th St. Bet. 35th & 36th Aves,-73.923622,40.756476,Astoria


Data cleaning 5: combining all datasets

In [75]:
# dataset 1
nyrp = pd.read_csv('../data/NYRP_NTA.csv', index_col=0)
nyrp.head(2)

Unnamed: 0,address,coords,lat,long,borough,geometry,index_right,ntacode,shape_area,county_fips,ntaname,shape_leng,boro_name,boro_code
0,735 East 211th St,"40.877499,-73.863489",40.877499,-73.863489,Bronx,POINT (-73.863489 40.877499),75,BX44,36273600.0,5,Williamsbridge-Olinville,27351.077379,Bronx,2
1,1818 Bathgate Ave,"40.845051,-73.897747",40.845051,-73.897747,Bronx,POINT (-73.897747 40.845051),128,BX01,16451620.0,5,Claremont-Bathgate,29972.77772,Bronx,2


In [76]:
# rename columns for concat
nyrp.rename(columns={
    'address': 'Address',
    'lat': 'Latitude',
    'long': 'Longitude',
    'borough': 'Borough',
    'ntaname': 'NTA',
    'ntacode': 'NTAcode'
}, inplace=True) 

In [77]:
nyrp.shape

(56, 14)

In [78]:
cols = ['Address', 'Latitude', 'Longitude', 'NTA']
greenthumb = gardens[gardens.NTA.notnull()][cols] # dataset 2
greenthumb.shape

(431, 4)

In [79]:
greenthumb_geocoded = joined # dataset 3, gardens in greenthumb that didn't have NTA info initially, which i later geocoded
greenthumb_geocoded.shape

(49, 4)

In [80]:
greenthumb_geocoded2 = joined2
greenthumb_geocoded2.shape

(47, 4)

In [81]:
gardens_combined = pd.concat([nyrp[cols], greenthumb, greenthumb_geocoded, greenthumb_geocoded2], ignore_index=True)
gardens_combined.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,Address,Latitude,Longitude,NTA,ntaname
0,735 East 211th St,40.877499,-73.863489,Williamsbridge-Olinville,
1,1818 Bathgate Ave,40.845051,-73.897747,Claremont-Bathgate,
2,1017 Teller Ave,40.82815,-73.914356,East Concourse-Concourse Village,
3,1328 Clay Ave,40.834934,-73.908892,East Concourse-Concourse Village,
4,2044 Prospect Ave,40.845818,-73.887851,East Tremont,


In [82]:
gardens_combined.shape

(583, 5)

Data cleaning: remove entries with same address

In [83]:
# find out how many duplicates there are
gardens_combined.Address.str.lower().value_counts().value_counts()
# 8 duplicates 

1    567
2      8
Name: Address, dtype: int64

In [84]:
# rows that have duplicates
gardens_combined[gardens_combined.duplicated(subset='Address', keep=False)]

Unnamed: 0,Address,Latitude,Longitude,NTA,ntaname
144,3003 Seagirt Boulevard.,40.595964,-73.763133,Hammels-Arverne-Edgemere,
161,429-433 East 117th Street,40.796357,-73.934119,East Harlem North,
164,52 W 129th Street,40.809753,-73.94231,Central Harlem North-Polo Grounds,
168,624-638 East 138th Street,40.806094,-73.915774,Mott Haven-Port Morris,
169,624-638 East 138th Street,40.806094,-73.915774,Mott Haven-Port Morris,
190,2592-2597 Bainbridge Avenue,40.863711,-73.892599,Bedford Park-Fordham North,
191,2592-2597 Bainbridge Avenue,40.863711,-73.892599,Bedford Park-Fordham North,
223,52 W 129th Street,40.809753,-73.94231,Central Harlem North-Polo Grounds,
232,415-421 East 117th Street,40.79645,-73.934347,East Harlem North,
321,953 Gates Avenue,40.688836,-73.92789,Stuyvesant Heights,


In [85]:
gardens_combined.drop_duplicates(subset='Address', keep='first', inplace=True)
gardens_combined.shape

(575, 5)

In [86]:
# save dataset
gardens_combined.to_csv('../data/NYC_Community_Gardens_combined.csv')