# Data section

##### We extracted the ZIP Code Definitions of New York City Neighborhoods which is available from https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm. Created a csv file and uploaded to own server. To explore and target recommended locations across different venues according to the presence of amenities and essential facilities, we will access data through FourSquare API interface and arrange them as a dataframe for visualization. By merging data on New York City Zip Codes by neighborhood and data on venues and essential facilities surrounding such properties from FourSquare API interface.

# Methodology


##### Collect Data
##### Explore and Understand Data
##### Data Preparation and Preprocessing
##### Modeling

### Collect Data

In [46]:
# Scrape the website and store in data frame
import pandas as pd
link = "https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm"
df = pd.read_html(link,header=0)[0]


In [47]:
import requests
from bs4 import BeautifulSoup


url = "https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm"
wiki = requests.get(url)
soup = BeautifulSoup(wiki.content, "html.parser")
table = soup.find_all("table")[0]
table_rows = table.find_all("tr")

In [48]:
#reading into list of columns

c1=[]
c2=[]
c3=[]


for tr in table_rows:
    #header = tr.find_all("th")
    row = tr.find_all("td")
    if len(row) == 3:
        c1.append(row[0].find(text = True))
        c2.append(row[1].find(text = True))
        c3.append(row[2].find(text = True))

In [49]:
#reading into Data Frame

df = pd.DataFrame(c1, columns = ["Postcode"])
df["Borough"] = c2
df["Neighborhood"] = c3

In [50]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,Bronx,Central Bronx,"10453, 10457, 10460"
1,Brooklyn,Central Brooklyn,"11212, 11213, 11216, 11233, 11238"
2,Manhattan,Central Harlem,"10026, 10027, 10030, 10037, 10039"
3,Queens,Northeast Queens,"11361, 11362, 11363, 11364"
4,Staten Island,Port Richmond,"10302, 10303, 10310"


In [51]:
df.shape

(5, 3)

In [52]:
df.to_csv("NYZip2.csv")

In [53]:
df= pd.read_csv("NYZip2.csv", index_col=0)

df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,Bronx,Central Bronx,"10453, 10457, 10460"
1,Brooklyn,Central Brooklyn,"11212, 11213, 11216, 11233, 11238"
2,Manhattan,Central Harlem,"10026, 10027, 10030, 10037, 10039"
3,Queens,Northeast Queens,"11361, 11362, 11363, 11364"
4,Staten Island,Port Richmond,"10302, 10303, 10310"


In [54]:
#import libraries
import numpy as np
import pandas as pd

final= pd.read_csv("https://alsantiago.com/NYZip2.csv", index_col=0)

In [55]:
# import JSON library 
import json 

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# library to handle requests

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [56]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata: done
Solving environment: done

# All requested packages already installed.

Libraries imported.


In [57]:
final = final[['Borough','Neighborhood','Latitude','Longitude']]
final.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Bronx,Central Bronx,40.852779,-73.912332
2,Bronx,Bronx Park and Fordham,40.862543,-73.888143
3,Bronx,High Bridge and Morrisania,40.820479,-73.925084
4,Bronx,Hunts Point and Mott Haven,40.805489,-73.916585
5,Bronx,Kingsbridge and Riverdale,40.880678,-73.90654


In [58]:
address = 'New York, NY'
geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York are 40.7308619, -73.9871558.


In [59]:
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(final['Latitude'], final['Longitude'], final['Borough'], final['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [60]:
newyork_data = final[final['Borough'].str.contains('Queens', regex=True)]
# Dataframe where the Borough name contains "Brooklyn" word
downtown_newyork = newyork_data[newyork_data['Borough'] == 'Queens'].reset_index(drop=True)


In [61]:
address = 'New York, NY'
geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 40.7308619, -73.9871558.


In [62]:
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(downtown_newyork['Latitude'], downtown_newyork['Longitude'], downtown_newyork['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

In [63]:
CLIENT_ID = '4VVLBSKBMVZQBCAKXVDPHCLEILZ55IQLJV5VVDK4DODH3O5N' # your Foursquare ID
CLIENT_SECRET = '3NV0AOGW4NGW0SPJ3DUE3D4YYOIEDODMQ4SUTYOCQ2UMKLG4' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius

In [64]:
downtown_newyork.loc[0, 'Neighborhood']

'Northeast Queens'

In [65]:
# neighborhood latitude value
neighborhood_latitude = downtown_newyork.loc[0, 'Latitude']
# neighborhood longitude value
neighborhood_longitude = downtown_newyork.loc[0, 'Longitude']
# neighborhood name
neighborhood_name = downtown_newyork.loc[0, 'Neighborhood']

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))


Latitude and longitude values of Northeast Queens are 40.764191, -73.772775.


In [66]:
# limit of number of venues returned by Foursquare API
LIMIT = 100
# define radius
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
# display URL
url

'https://api.foursquare.com/v2/venues/explore?&client_id=4VVLBSKBMVZQBCAKXVDPHCLEILZ55IQLJV5VVDK4DODH3O5N&client_secret=3NV0AOGW4NGW0SPJ3DUE3D4YYOIEDODMQ4SUTYOCQ2UMKLG4&v=20180605&ll=40.764191,-73.772775&radius=500&limit=100'

In [67]:
# Send the GET request and examine the resutls
results = requests.get(url).json()

In [68]:
# Now extract the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [69]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The French Workshop,Bakery,40.765404,-73.771861
1,Martha's Country Bakery,Bakery,40.763422,-73.770971
2,Press 195,Bar,40.763905,-73.770946
3,Avli Little Greek Tavern,Greek Restaurant,40.765729,-73.771972
4,Nippon Cha,Noodle House,40.764408,-73.771461


In [70]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

76 venues were returned by Foursquare.


In [71]:
# Let's create a function to repeat the same process to all the neighborhoods in downtown_newyork
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [72]:
downtown_newyork_venues = getNearbyVenues(names=downtown_newyork['Neighborhood'],
                                   latitudes=downtown_newyork['Latitude'],
                                   longitudes=downtown_newyork['Longitude']
                                  )

Northeast Queens
North Queens
Central Queens
Jamaica
Northwest Queens
West Central Queens
Rockaways
Southeast Queens
Southwest Queens
West Queens


In [73]:
print(downtown_newyork_venues.shape)
downtown_newyork_venues.head()

(313, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Northeast Queens,40.764191,-73.772775,The French Workshop,40.765404,-73.771861,Bakery
1,Northeast Queens,40.764191,-73.772775,Martha's Country Bakery,40.763422,-73.770971,Bakery
2,Northeast Queens,40.764191,-73.772775,Press 195,40.763905,-73.770946,Bar
3,Northeast Queens,40.764191,-73.772775,Avli Little Greek Tavern,40.765729,-73.771972,Greek Restaurant
4,Northeast Queens,40.764191,-73.772775,Nippon Cha,40.764408,-73.771461,Noodle House


In [74]:
downtown_newyork_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Queens,22,22,22,22,22,22
Jamaica,9,9,9,9,9,9
North Queens,19,19,19,19,19,19
Northeast Queens,76,76,76,76,76,76
Northwest Queens,63,63,63,63,63,63
Rockaways,2,2,2,2,2,2
Southeast Queens,25,25,25,25,25,25
Southwest Queens,24,24,24,24,24,24
West Central Queens,38,38,38,38,38,38
West Queens,35,35,35,35,35,35


In [75]:
print('There are {} uniques categories.'.format(len(downtown_newyork_venues['Venue Category'].unique())))

There are 119 uniques categories.


In [76]:
# Analyze Each Neighborhood
# one hot encoding
downtown_onehot = pd.get_dummies(downtown_newyork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_onehot['Neighborhood'] = downtown_newyork_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]

downtown_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Art Museum,Asian Restaurant,Athletics & Sports,Auto Workshop,Automotive Shop,Bagel Shop,Bakery,Bank,...,Toy / Game Store,Trail,Train,Train Station,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Northeast Queens,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Northeast Queens,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Northeast Queens,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Northeast Queens,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Northeast Queens,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped

Unnamed: 0,Neighborhood,American Restaurant,Art Museum,Asian Restaurant,Athletics & Sports,Auto Workshop,Automotive Shop,Bagel Shop,Bakery,Bank,...,Toy / Game Store,Trail,Train,Train Station,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Central Queens,0.0,0.0,0.0,0.045455,0.045455,0.045455,0.045455,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Jamaica,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0
2,North Queens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,...,0.052632,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Northeast Queens,0.039474,0.0,0.013158,0.0,0.0,0.0,0.013158,0.026316,0.013158,...,0.0,0.013158,0.0,0.013158,0.0,0.0,0.0,0.013158,0.0,0.013158
4,Northwest Queens,0.031746,0.015873,0.0,0.0,0.0,0.0,0.0,0.0,0.015873,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.015873,0.0,0.015873,0.0
5,Rockaways,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Southeast Queens,0.04,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Southwest Queens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0
8,West Central Queens,0.026316,0.0,0.0,0.0,0.0,0.0,0.026316,0.026316,0.026316,...,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0
9,West Queens,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in downtown_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = downtown_grouped[downtown_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Queens----
                venue  freq
0  Chinese Restaurant  0.09
1         Pizza Place  0.09
2            Bus Line  0.05
3      Ice Cream Shop  0.05
4          Food Truck  0.05


----Jamaica----
                  venue  freq
0    Chinese Restaurant  0.11
1    Seafood Restaurant  0.11
2              Pharmacy  0.11
3     Fish & Chips Shop  0.11
4  Caribbean Restaurant  0.11


----North Queens----
               venue  freq
0        Supermarket  0.16
1  Korean Restaurant  0.16
2   Toy / Game Store  0.05
3       Liquor Store  0.05
4   Sushi Restaurant  0.05


----Northeast Queens----
                 venue  freq
0                  Bar  0.08
1          Pizza Place  0.07
2  American Restaurant  0.04
3     Sushi Restaurant  0.04
4                  Pub  0.04


----Northwest Queens----
                venue  freq
0                Café  0.11
1               Hotel  0.08
2         Coffee Shop  0.08
3  Mexican Restaurant  0.06
4         Pizza Place  0.05


----Rockaways----
          

In [79]:
# put that into a pandas dataframe
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [80]:
# create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Central Queens,Pizza Place,Chinese Restaurant,Martial Arts Dojo,Bubble Tea Shop,Deli / Bodega
1,Jamaica,Discount Store,Chinese Restaurant,Basketball Court,Seafood Restaurant,Caribbean Restaurant
2,North Queens,Korean Restaurant,Supermarket,Sushi Restaurant,Pool Hall,Deli / Bodega
3,Northeast Queens,Bar,Pizza Place,Indian Restaurant,Sushi Restaurant,Café
4,Northwest Queens,Café,Coffee Shop,Hotel,Mexican Restaurant,Bar
