# The Battle of the Neighbourhoods - W2


### Importing libraries

In [70]:
import csv
import json
import os

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import requests

import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker

import seaborn as sns

from bs4 import BeautifulSoup

from geopy.geocoders import Nominatim
import folium

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

### Loading neighbourhood data

In [71]:
# Download the NYC neighbourhood GeoJSON export from the NYC Open Data portal.
# A timeout stops the cell from hanging forever, and raise_for_status() turns
# an HTTP error page into a clear exception instead of a JSON decode failure.
opendata = requests.get(
    'https://data.cityofnewyork.us/api/geospatial/99bc-9p23?method=export&format=GeoJSON',
    timeout=30,
)
opendata.raise_for_status()
x = opendata.json()
# Each entry of 'features' describes one neighbourhood.
neighdata = x['features']

In [72]:
# Peek at the first GeoJSON feature to see which fields each record carries.
neighdata[0]

{'type': 'Feature',
 'properties': {'stacked': '1',
  'name': 'Wakefield',
  'annoline1': 'Wakefield',
  'annoline3': '',
  'objectid': '1',
  'annoangle': '0.0',
  'annoline2': '',
  'borough': 'Bronx'},
 'geometry': {'type': 'Point',
  'coordinates': [-73.8472005205491, 40.89470517661004]}}

In [73]:
# Empty frame with the four columns that are filled from the GeoJSON features.
column_names = ['Borough', 'Neighbourhood', 'Latitude', 'Longitude'] 
neighdf = pd.DataFrame(columns=column_names)

In [74]:
# Collect one record per GeoJSON feature and build the frame in a single step.
# Growing a DataFrame row by row with DataFrame.append is quadratic and the
# method no longer exists in pandas 2.x. Building from a list also makes this
# cell idempotent: re-running it no longer duplicates every neighbourhood.
neigh_records = []
for data in neighdata:
    properties = data['properties']
    coordinates = data['geometry']['coordinates']
    neigh_records.append({
        'Borough': properties['borough'],
        'Neighbourhood': properties['name'],
        # The coordinate pair is stored as [longitude, latitude].
        'Latitude': coordinates[1],
        'Longitude': coordinates[0],
    })

neighdf = pd.DataFrame(
    neigh_records,
    columns=['Borough', 'Neighbourhood', 'Latitude', 'Longitude'],
)

In [75]:
# Preview the first rows of the neighbourhood table.
neighdf.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


### Superimposing these values on a map of New York

In [76]:
# Geocode the city; the resulting coordinates are used to centre the folium maps.
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

In [77]:
# Base map centred on the geocoded city coordinates.
NY_map = folium.Map(location=[latitude, longitude], zoom_start=10.3)

# Draw one blue circle per neighbourhood; its popup reads "<neighbourhood>, <borough>".
marker_fields = neighdf[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighbourhood in marker_fields.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighbourhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(NY_map)
    

In [78]:
# Evaluate the map object as the cell's last expression so the notebook displays it.
NY_map

### Web-scraping and cleaning demographic data

Number of Indian Americans in each New York City borough

In [79]:
# Download the Wikipedia article; note that `url` holds the page HTML text, not a URL.
url = requests.get('https://en.wikipedia.org/wiki/Indians_in_the_New_York_City_metropolitan_region').text
soup = BeautifulSoup(url,'lxml')
# First table on the page carrying the 'wikitable sortable' class.
table = soup.find('table',{'class':'wikitable sortable'})

# Column titles are the text of every <th> cell in that table.
headers = [header.text for header in table.find_all('th')]
table_rows = table.find_all('tr')        
rows = []
# One list of <td> texts per <tr>; rows without <td> cells yield an empty
# list and are filtered out when writing (the `if row` below).
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)
# Round-trip through a CSV file so pandas can parse the table.
with open('DEMO.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

demo_df=pd.read_csv('DEMO.csv')

# Cleaning the dataset

# Strip the line breaks that were scraped along with the cell text.
demo_df=demo_df.replace(to_replace='\r\n', value='', regex=True)
# NOTE(review): the '\r\n' suffixes in these keys presumably come from writing the
# scraped '\n' in text mode on Windows; on other platforms the headers may end
# in '\n' only and this rename would silently match nothing — confirm.
demo_df=demo_df.rename(columns={'Borough\r\n':'Borough', 'Rank\r\n':'Rank', 'City\r\n':'City','Indian Americans\r\n':'Indian Americans','Density of Indian Americans per square mile\r\n':'(Indian Americans)/sq_mi', "Percentage of Indian Americans in municipality's population\r\n":'% of Indian Americans'})
demo_df=demo_df.drop(columns=['City'])
# Drops the row labelled 5 — presumably a non-borough summary row; verify against the page.
demo_df=demo_df.drop(5, axis=0)
# Overwrites the borough names with a fixed list; assumes the remaining five
# rows are in exactly this order — TODO confirm if the source table changes.
demo_df['Borough']=['Queens', 'Brooklyn', 'Manhattan', 'The Bronx', 'Staten Island']
demo_df=demo_df.set_index(['Rank'])

demo_df.head(10)

Unnamed: 0_level_0,Borough,Indian Americans,(Indian Americans)/sq_mi,% of Indian Americans
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Queens,144896,1326.5,6.2
2,Brooklyn,25270,357.9,1.0
3,Manhattan,24359,1060.9,1.5
4,The Bronx,16748,398.6,1.2
5,Staten Island,6646,113.6,1.4


### Web-scraping and cleaning population data

In [80]:
# Fetch the NYC demographics article and take its first sortable wikitable
url1 = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup1 = BeautifulSoup(url1, 'lxml')
table = soup1.find('table', {'class': 'wikitable sortable'})

# Header texts come from the <th> cells, body texts from the <td> cells of each <tr>
headers = [th.text for th in table.find_all('th')]
table_rows = table.find_all('tr')
rows = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]

# Save the header plus every non-empty row to a csv file
non_empty_rows = [cells for cells in rows if cells]
with open('POP.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(non_empty_rows)

In [81]:
# Load the scraped table, discard the unneeded columns and rename the relevant ones
pop_df = pd.read_csv('POP.csv')
pop_df = pop_df.drop(columns=pop_df.columns[[3, 4, 9, 10, 11, 12, 13, 14]])
pop_df = pop_df.rename(columns={
    "New York City's five boroughsvte\r\n": 'Borough',
    'Jurisdiction\r\n': 'County',
    'County': 'Persons/sq_mi',
    'Estimate (2017)[12]': 'Persons/sq_km',
    'Borough': 'Area sq_km',
    'Population\r\n': 'Population',
    'Density\r\n': 'Area sq_mi',
})

# Strip the \r\n line breaks from each of the kept columns
for column in ['Borough', 'County', 'Population', 'Area sq_mi', 'Area sq_km', 'Persons/sq_km', 'Persons/sq_mi']:
    pop_df[column] = pop_df[column].replace(to_replace='\r\n', value='', regex=True)

# The two summary rows are shifted one cell to the right; then row 7 is removed
for summary_name in ('City of New York', 'State of New York'):
    summary_mask = pop_df['Borough'] == summary_name
    pop_df.loc[summary_mask] = pop_df.loc[summary_mask].shift(periods=1, axis=1)
pop_df = pop_df.drop(7, axis=0)

# Show blanks instead of NaN for the cells emptied by the shift
pop_df = pop_df.fillna('')
pop_df.head(10)


Unnamed: 0,Borough,County,Population,Area sq_mi,Area sq_km,Persons/sq_mi,Persons/sq_km
0,The Bronx,Bronx,1471160,42.1,109.04,34653.0,13231
1,Brooklyn,Kings,2648771,70.82,183.42,37137.0,14649
2,Manhattan,New York,1664727,22.83,59.13,72033.0,27826
3,Queens,Queens,2358582,108.53,281.09,21460.0,8354
4,Staten Island,Richmond,479458,58.37,151.18,8112.0,3132
5,,City of New York,8622698,806.863,783.83,28188.0,10947
6,,State of New York,19849399,1547.116,122284.0,416.4,159


### Web-scraping and cleaning Cuisine data

In [82]:
# Scrape the neighbourhood/cuisine lists from the "Enclaves reflecting national
# cuisines" section of the Wikipedia article.
url2 = requests.get('https://en.wikipedia.org/wiki/Cuisine_of_New_York_City').text
soup2 = BeautifulSoup(url2, 'html.parser')
results = {}
articles = soup2.find_all('div', class_="div-col columns column-width")
for article in articles:
    # Keep only the column blocks whose preceding <h2> is the enclaves section.
    if article.find_previous_sibling('h2').find('span').get('id') == 'Enclaves_reflecting_national_cuisines':
        category = article.find_previous_sibling('h3')
        title_key = category.find('span', {'class': 'mw-headline'}).get_text()
        # One entry per line of the block; the first and last pieces of the
        # split are dropped (presumably the empty strings around the list).
        results[title_key] = article.text.split('\n')[1:-1]

# Cleaning the dataset

# One row per scraped line, split once on the en dash into neighbourhood / cuisine.
cuis_df = pd.DataFrame.from_dict(results, orient='index')
cuis_df = cuis_df.stack()
cuis_df = cuis_df.to_frame().reset_index(drop=True)
cuis_df = cuis_df[0].str.split('–', n=1, expand=True)

# Manual corrections for rows that the split does not handle.
cuis_df.at[19, 0] = 'The Rockaways'
cuis_df.at[19, 1] = 'Irish, Jewish'
cuis_df.at[1, 1] = 'Italian, Albanian'
cuis_df.at[4, 1] = 'Filipino'
cuis_df.at[0, 1] = 'Mexican, Puerto Rican, Dominican, Korean'

# Three rows are added by hand; the borough list below labels them Staten Island.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df1 = pd.DataFrame({0: ['Port Richmond', 'Rossville; South Beach; Great Kills', 'Tompkinsville'], 1: ['Mexican, Indian, Italian', 'Italian, Russian, Arab and Polish', 'Italian, Sri Lankan, Pakistani, Indian']})
cuis_df = pd.concat([cuis_df, df1], ignore_index=True)

# Borough of every row, in row order. This assumes the scraped rows keep the
# order they had when the list was written — verify if the article changes.
boro=['The Bronx','The Bronx','The Bronx','The Bronx','The Bronx','The Bronx','The Bronx','The Bronx','The Bronx','Queens','Queens','Queens','Queens','Queens','Queens','Queens','Queens','Queens','Queens','Queens','Queens','Queens','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Brooklyn','Manhattan','Manhattan','Manhattan','Manhattan','Manhattan','Manhattan','Manhattan','Manhattan','Manhattan','Manhattan','Manhattan','Manhattan','Staten Island','Staten Island','Staten Island']
cuis_df['Borough'] = boro
cuis_df.columns = ['Neighbourhood', 'Cuisine', 'Borough']

# Move 'Borough' to the front.
cols = cuis_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
cuis_df = cuis_df[cols]
cuis_df.head(10)

Unnamed: 0,Borough,Neighbourhood,Cuisine
0,The Bronx,Bedford Park,"Mexican, Puerto Rican, Dominican, Korean"
1,The Bronx,Belmont,"Italian, Albanian"
2,The Bronx,City Island,"Italian, Seafood"
3,The Bronx,Morris Park,"Italian, Albanian"
4,The Bronx,Norwood,Filipino
5,The Bronx,Riverdale,Jewish
6,The Bronx,South Bronx,"Puerto Rican, Dominican"
7,The Bronx,Wakefield,"Jamaican, West Indian"
8,The Bronx,Woodlawn,Irish
9,Queens,Astoria,"Greek, Italian, Eastern European, Brazilian, ..."


### Web-scraping and cleaning Farmers Market data

In [83]:
# Read the farmers market CSV export directly from the NYC Open Data URL.
urlfm='https://data.cityofnewyork.us/api/views/8vwk-6iz2/rows.csv?accessType=DOWNLOAD'
fm_df=pd.read_csv(urlfm)
# Report (rows, columns), then preview the first records.
print(fm_df.shape)
fm_df.head()

(139, 13)


Unnamed: 0,Borough,Market Name,Street Address,Latitude,Longitude,Days of Operation,Hours of Operations,Season Dates,Accepts EBT,Open Year-Round,Stellar Cooking Demonstrations,Food Activities for Kids,Location Point
0,Brooklyn,Urban Oasis Farmers Market,681 Clarkson Ave,40.656255,-73.936608,Wednesday,2 - 5:30 p.m.,06/26/2019-11/06/2019,No,No,No,No,"(40.656255, -73.936608)"
1,Staten Island,Staten Island Mall Greenmarket,Marsh Ave & Ring Rd,40.583804,-74.161245,Saturday,8 a.m. - 3 p.m.,Year-Round,Yes,Yes,No,No,"(40.583804, -74.161245)"
2,Manhattan,Mount Sinai Hospital Greenmarket,E 99th St bet Madison & Park Aves,40.789169,-73.952743,Wednesday,8 a.m. - 5 p.m.,06/12/19-11/27/19,Yes,No,No,No,"(40.789169, -73.952743)"
3,Bronx,170 Farm Stand,E 170th St & Townsend Ave,40.839882,-73.916783,Wednesday,2:30 - 6:30 p.m.,07/10/2019-11/27/2019,Yes,No,No,Yes,"(40.839882, -73.916783)"
4,Manhattan,Grass Roots Farmers Market,"W 145th St bet Edgecombe & Bradhurst Aves, at ...",40.823647,-73.943844,Tuesday &\nSaturday,9 a.m. - 4 p.m.,07/11/2019-11/21/2019,Yes,No,No,No,"(40.823647, -73.943844)"


In [84]:
# Superimposing farmers market locations to map of New York
address = 'New York City, NY'
geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_fm = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, FacilityName, borough in zip(fm_df['Latitude'], fm_df['Longitude'], fm_df['Market Name'], fm_df['Borough']):
    # Format the row's own values. Previously the literal strings
    # 'Market Name' and 'Borough' were formatted, so every popup read the same.
    label = '{}, {}'.format(FacilityName, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html belongs to Popup, not CircleMarker, so it is no longer passed here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.7).add_to(map_fm)

map_fm


### Using Foursquare API to get Venue related data in each borough

In [85]:
# Creating a dataframe containing only the Queens neighbourhoods
# (index reset so the rows are numbered from 0)

queens_df=neighdf[neighdf['Borough']=='Queens'].reset_index(drop=True)
queens_df

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Queens,Astoria,40.768509,-73.915654
1,Queens,Woodside,40.746349,-73.901842
2,Queens,Jackson Heights,40.751981,-73.882821
3,Queens,Elmhurst,40.744049,-73.881656
4,Queens,Howard Beach,40.654225,-73.838138
5,Queens,South Corona,40.742382,-73.856825
6,Queens,Forest Hills,40.725264,-73.844475
7,Queens,Kew Gardens,40.705179,-73.829819
8,Queens,Richmond Hill,40.697947,-73.831833
9,Queens,Downtown Flushing,40.761164,-73.829368


In [86]:
# Foursquare API credentials are read from the environment so the secret is
# never stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment variables')
# Value sent as the API's `v` (version) parameter.
VERSION = '20181218'

def getNearbyVenues(names, latitudes, longitudes, LIMIT=200, radius=1000):
    """Query the Foursquare explore endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to search around.
    LIMIT : int
        Value sent as the request's ``limit`` parameter.
    radius : int
        Value sent as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Progress output: one line per location queried.
        print(name)

        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface API failures as an HTTP error rather than a KeyError below.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        for v in results:
            venue = v['venue']
            # A venue without categories gets a missing category instead of
            # aborting the whole download with an IndexError.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows;
    # assigning them afterwards raised a length-mismatch error on an empty frame.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

# Fetch the venues around every Queens neighbourhood (the function prints each name as it goes).
NYC_venues = getNearbyVenues(names=queens_df['Neighbourhood'],
                                  latitudes=queens_df['Latitude'],
                                  longitudes=queens_df['Longitude'],
                                  LIMIT=200)

# Save the result to disk. The row index is written as the first CSV column
# because index=False is not passed.
NYC_venues.to_csv('NYC_venues.csv', sep=',', encoding='UTF8')
NYC_venues.head()

Astoria
Woodside
Jackson Heights
Elmhurst
Howard Beach
South Corona
Forest Hills
Kew Gardens
Richmond Hill
Downtown Flushing
Long Island City
Sunnyside
East Elmhurst
Maspeth
Ridgewood
Glendale
Rego Park
Woodhaven
Ozone Park
South Ozone Park
College Point
Whitestone
Bayside
Auburndale
Little Neck
Douglaston
Glen Oaks
Bellerose
Kew Gardens Hills
Fresh Meadows
Briarwood
Jamaica Center
Oakland Gardens
Queens Village
Hollis
South Jamaica
St. Albans
Rochdale
Springfield Gardens
Cambria Heights
Rosedale
Far Rockaway
Broad Channel
Breezy Point
Steinway
Beechhurst
Bay Terrace
Edgemere
Arverne
Seaside
Neponsit
Murray Hill
Floral Park
Holliswood
Jamaica Estates
Queensboro Hill
Hillcrest
Ravenswood
Lindenwood
Laurelton
Lefrak City
Belle Harbor
Rockaway Park
Somerville
Brookville
Bellaire
North Corona
Forest Hills Gardens
Jamaica Hills
Utopia
Pomonok
Astoria Heights
Hunters Point
Sunnyside Gardens
Blissville
Roxbury
Middle Village
Malba


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Astoria,40.768509,-73.915654,Favela Grill,40.767348,-73.917897,Brazilian Restaurant
1,Astoria,40.768509,-73.915654,Titan Foods Inc.,40.769198,-73.919253,Gourmet Shop
2,Astoria,40.768509,-73.915654,CrossFit Queens,40.769404,-73.918977,Gym
3,Astoria,40.768509,-73.915654,Sitan Muay Thai,40.766108,-73.913224,Martial Arts Dojo
4,Astoria,40.768509,-73.915654,Al-sham Sweets and Pastries,40.768077,-73.911561,Middle Eastern Restaurant


In [87]:
# Reload the saved venues, skipping the file's header row and supplying
# British-spelling column names in its place.
# NOTE(review): the file has one more field per row than `colnames` (the index
# written by to_csv); pandas appears to use that first field as the index,
# which matches the displayed output — confirm.
colnames = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
NYC_venues = pd.read_csv('NYC_venues.csv', skiprows=1, names=colnames)
NYC_venues.head(5)

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Astoria,40.768509,-73.915654,Favela Grill,40.767348,-73.917897,Brazilian Restaurant
1,Astoria,40.768509,-73.915654,Titan Foods Inc.,40.769198,-73.919253,Gourmet Shop
2,Astoria,40.768509,-73.915654,CrossFit Queens,40.769404,-73.918977,Gym
3,Astoria,40.768509,-73.915654,Sitan Muay Thai,40.766108,-73.913224,Martial Arts Dojo
4,Astoria,40.768509,-73.915654,Al-sham Sweets and Pastries,40.768077,-73.911561,Middle Eastern Restaurant


In [88]:
# Count the non-null entries of every column per neighbourhood (i.e. venues returned for each).
NYC_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arverne,36,36,36,36,36,36
Astoria,100,100,100,100,100,100
Astoria Heights,88,88,88,88,88,88
Auburndale,100,100,100,100,100,100
Bay Terrace,64,64,64,64,64,64
Bayside,100,100,100,100,100,100
Beechhurst,52,52,52,52,52,52
Bellaire,62,62,62,62,62,62
Belle Harbor,27,27,27,27,27,27
Bellerose,55,55,55,55,55,55


### One Hot Encoding


In [89]:
# Indicator column per venue category, named after the category itself (no prefix).
category_dummies = pd.get_dummies(NYC_venues[['Venue Category']], prefix="", prefix_sep="")

# Final column order: the neighbourhood key first, then every category column.
column_names = ['Neighbourhood'] + list(category_dummies.columns)
NYC_onehot = category_dummies.assign(Neighbourhood=NYC_venues['Neighbourhood'])[column_names]

NYC_onehot.head(10)

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Art Gallery,...,Volleyball Court,Warehouse Store,Waste Facility,Weight Loss Center,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Astoria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
# Names of all one-hot columns that contain the search phrase.
search = 'Indian Restaurant'
ind_restaurants = [column for column in NYC_onehot.columns if search in column]

ind_restaurants

['Indian Restaurant']

In [91]:
# Keep the neighbourhood key plus the Indian-restaurant indicator column(s).
col_name = ['Neighbourhood'] + ind_restaurants
NYC_rest = NYC_onehot[col_name]

# Sum the indicators per neighbourhood to get restaurant counts.
NYC_rest_grp = NYC_rest.groupby('Neighbourhood').sum().reset_index()
# Total across the restaurant columns only. Summing the whole frame also
# touches the string 'Neighbourhood' column, which only worked while pandas
# silently dropped non-numeric columns and fails in pandas 2.x.
NYC_rest_grp['Total'] = NYC_rest_grp[ind_restaurants].sum(axis=1)

NYC_rest_grp.head(10)

Unnamed: 0,Neighbourhood,Indian Restaurant,Total
0,Arverne,0,0
1,Astoria,1,1
2,Astoria Heights,1,1
3,Auburndale,0,0
4,Bay Terrace,0,0
5,Bayside,3,3
6,Beechhurst,0,0
7,Bellaire,1,1
8,Belle Harbor,0,0
9,Bellerose,0,0


### Clustering neighbourhoods and examining these clusters


The silhouette coefficient is used to choose the number of clusters (k) for KMeans. A higher coefficient means that points are, on average, closer to the other points in their own cluster and further from points in neighbouring clusters.

In [93]:
# Feature matrix for clustering: everything except the neighbourhood name.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
NYC_cluster = NYC_rest_grp.drop(columns='Neighbourhood')

# Fit KMeans for k = 2..9 and report the silhouette coefficient of each fit.
# random_state is fixed so the printed scores are reproducible between runs.
for n_cluster in range(2, 10):
    kmeans = KMeans(n_clusters=n_cluster, random_state=0).fit(NYC_cluster)
    label = kmeans.labels_
    sil_coeff = silhouette_score(NYC_cluster, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

For n_clusters=2, The Silhouette Coefficient is 0.8416450996729327
For n_clusters=3, The Silhouette Coefficient is 0.827920816163456
For n_clusters=4, The Silhouette Coefficient is 0.9199402865337593
For n_clusters=5, The Silhouette Coefficient is 0.912491122974994
For n_clusters=6, The Silhouette Coefficient is 0.931135531135531
For n_clusters=7, The Silhouette Coefficient is 0.9392551892551894
For n_clusters=8, The Silhouette Coefficient is 0.9423076923076923
For n_clusters=9, The Silhouette Coefficient is 0.9358974358974359


In [94]:
# Using 6 clusters and performing KMeans clustering on the dataset
# NOTE(review): the sweep output above shows higher silhouette scores for
# k=7 and k=8 than for k=6 — confirm that 6 is the intended choice.

kclusters = 6
# random_state=0 fixes the random seed used by the fit.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NYC_cluster)

# Cluster index assigned to each row of NYC_cluster.
kmeans.labels_

array([1, 3, 3, 1, 1, 0, 1, 3, 1, 1, 3, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 3,
       1, 2, 1, 1, 3, 5, 1, 1, 1, 3, 1, 1, 5, 5, 0, 5, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 3, 4, 3, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 3, 3, 3, 1, 1, 3, 4])

In [95]:
# One row per cluster centre, with the same feature columns used for fitting.
result_df = pd.DataFrame(kmeans.cluster_centers_, columns=NYC_cluster.columns)
# Generate the row labels from the number of centres, so changing the cluster
# count does not break a hard-coded list of six names.
result_df.index = ['cluster{}'.format(i) for i in range(len(result_df))]
# Row-wise sum of the centre coordinates, used to rank the clusters.
result_df['Total Sum'] = result_df.sum(axis=1)
result_df

Unnamed: 0,Indian Restaurant,Total,Total Sum
cluster0,2.8,2.8,5.6
cluster1,5.551115e-16,5.551115e-16,1.110223e-15
cluster2,9.0,9.0,18.0
cluster3,1.0,1.0,2.0
cluster4,6.5,6.5,13.0
cluster5,4.25,4.25,8.5


Cluster 1 has the smallest total value, so its neighbourhoods are the least saturated with Indian restaurants.

In [96]:
# Neighbourhood name and restaurant total, plus the cluster index KMeans assigned to each row.
NYC_results = NYC_rest_grp[['Neighbourhood', 'Total']].copy()
NYC_results['Cluster_Labels'] = kmeans.labels_
NYC_results.head(10)

Unnamed: 0,Neighbourhood,Total,Cluster_Labels
0,Arverne,0,1
1,Astoria,1,3
2,Astoria Heights,1,3
3,Auburndale,0,1
4,Bay Terrace,0,1
5,Bayside,3,0
6,Beechhurst,0,1
7,Bellaire,1,3
8,Belle Harbor,0,1
9,Bellerose,0,1


In [97]:
# Attach each neighbourhood's total and cluster label to its borough/coordinate row

cluster_lookup = NYC_results.set_index('Neighbourhood')
NYC_final = queens_df.join(cluster_lookup, on='Neighbourhood')
print(NYC_final.shape)
NYC_final.head(10)

(78, 6)


Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Total,Cluster_Labels
0,Queens,Astoria,40.768509,-73.915654,1,3
1,Queens,Woodside,40.746349,-73.901842,6,4
2,Queens,Jackson Heights,40.751981,-73.882821,5,5
3,Queens,Elmhurst,40.744049,-73.881656,1,3
4,Queens,Howard Beach,40.654225,-73.838138,0,1
5,Queens,South Corona,40.742382,-73.856825,0,1
6,Queens,Forest Hills,40.725264,-73.844475,0,1
7,Queens,Kew Gardens,40.705179,-73.829819,2,0
8,Queens,Richmond Hill,40.697947,-73.831833,7,4
9,Queens,Downtown Flushing,40.761164,-73.829368,0,1


In [98]:
# Superimposing clusters on the map

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One distinct colour per cluster, sampled evenly from the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# The join that builds NYC_final leaves NaN (and a float column) for any
# neighbourhood without a cluster; skip those rows and cast the label to int
# so it can index the colour list.
clustered = NYC_final.dropna(subset=['Cluster_Labels'])
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighbourhood'], clustered['Cluster_Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The `- 1` offset is kept so colours stay as before; label 0 wraps
    # around to the last colour via negative indexing.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [99]:
# Unsaturated clusters
# KMeans numbers its clusters arbitrarily, so look up the cluster whose centre
# has the smallest total instead of hard-coding a label.
unsat_label = int(np.argmin(kmeans.cluster_centers_.sum(axis=1)))
NYC_unsat = NYC_final[NYC_final['Cluster_Labels'] == unsat_label].reset_index(drop=True)
NYC_unsat

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Total,Cluster_Labels
0,Queens,Howard Beach,40.654225,-73.838138,0,1
1,Queens,South Corona,40.742382,-73.856825,0,1
2,Queens,Forest Hills,40.725264,-73.844475,0,1
3,Queens,Downtown Flushing,40.761164,-73.829368,0,1
4,Queens,East Elmhurst,40.764073,-73.867041,0,1
5,Queens,Maspeth,40.725427,-73.896217,0,1
6,Queens,Glendale,40.702762,-73.870742,0,1
7,Queens,South Ozone Park,40.66855,-73.809865,0,1
8,Queens,College Point,40.784903,-73.843045,0,1
9,Queens,Whitestone,40.781291,-73.814202,0,1


In [100]:
# Superimposing unsaturated clusters and farmers market data onto map of NYC

address = 'New York City, NY'
geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_clusters_final = folium.Map(location=[latitude, longitude], zoom_start=11)

# Farmers markets: green circles whose popup reads "<market name>, <borough>".
for lat, lng, FacilityName, borough in zip(fm_df['Latitude'], fm_df['Longitude'], fm_df['Market Name'], fm_df['Borough']):
    # Format the row's own values. Previously the literal strings
    # 'Market Name' and 'Borough' were formatted, so every popup read the same.
    label = '{}, {}'.format(FacilityName, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html belongs to Popup, not CircleMarker, so it is no longer passed here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.7).add_to(map_clusters_final)

# One distinct colour per cluster, sampled evenly from the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Unsaturated neighbourhoods, coloured by cluster. The label is cast to int
# because the join that builds NYC_final can leave the column as float.
for lat, lon, poi, cluster in zip(NYC_unsat['Latitude'], NYC_unsat['Longitude'], NYC_unsat['Neighbourhood'], NYC_unsat['Cluster_Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The `- 1` offset is kept so colours stay as before; label 0 wraps
    # around to the last colour via negative indexing.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters_final)

map_clusters_final

### XYZ decides Downtown Flushing is where Chef QWERTY should open his first restaurant