## Capstone Project - The Battle of Neighborhoods

#### Import Libraries

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import urllib.request
import json
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

#### Download and Explore Dataset

In [3]:
# Fetch the New York dataset with wget (quiet mode) and save it as newyork_data.json.
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
# NOTE(review): this message is printed unconditionally; it does not confirm that wget succeeded.
print('Data downloaded!')

# Parse the downloaded JSON file into a Python object used by the following cells.
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

Data downloaded!


In [5]:
# Each entry of newyork_data['features'] describes one neighborhood.
neighborhoods_data = newyork_data['features']

# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Collect one record per neighborhood and build the dataframe in a single call.
# Growing a DataFrame row by row with append() is quadratic, and DataFrame.append
# no longer exists in pandas 2.0.
neighborhood_records = [
    {
        'Borough': data['properties']['borough'],
        'Neighborhood': data['properties']['name'],
        # the coordinate pair is read as [longitude, latitude]
        'Latitude': data['geometry']['coordinates'][1],
        'Longitude': data['geometry']['coordinates'][0],
    }
    for data in neighborhoods_data
]

# instantiate the dataframe (columns= fixes the column order, also for an empty input)
neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

In [6]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [7]:
# Look up the coordinates of New York City; they are used to centre the maps below.
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [8]:
# Keep only the rows whose borough is Manhattan and renumber them from 0.
is_manhattan = neighborhoods['Borough'].eq('Manhattan')
manhattan_data = neighborhoods.loc[is_manhattan].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [9]:
# create map of New York using latitude and longitude values
# (folium is already imported in the notebook's import cell)
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per Manhattan neighborhood
marker_fields = zip(
    manhattan_data['Latitude'],
    manhattan_data['Longitude'],
    manhattan_data['Borough'],
    manhattan_data['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_fields:
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html belongs to the popup; it is not a circle-marker styling option,
    # so it is no longer forwarded to CircleMarker.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_newyork)

map_newyork

#### Get venues from Foursquare

In [10]:
import urllib
def getNearbyVenues(names, latitudes, longitudes, radius=5000, categoryIds=''):
    """Search Foursquare for venues around each location and tabulate the results.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences giving a label and the coordinates of each search centre.
    radius : number, optional
        Value sent as the ``radius`` query parameter of the search request.
    categoryIds : str, optional
        When not empty, sent as the ``categoryId`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue that has at least one category, with the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.

    Raises
    ------
    RuntimeError
        If a response does not contain ``response.venues``.

    Notes
    -----
    Reads the notebook-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    search_url = 'https://api.foursquare.com/v2/venues/search'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting the
        # URL by hand (the old code ran str.format over an already formatted URL).
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        if categoryIds != '':
            params['categoryId'] = categoryIds

        # make the GET request; the timeout keeps a stalled connection from hanging the notebook
        payload = requests.get(search_url, params=params, timeout=30).json()

        try:
            results = payload['response']['venues']
        except (KeyError, TypeError):
            # Report the response body only; the request URL carries the client secret.
            raise RuntimeError(
                'Foursquare search failed for {!r}: {}'.format(name, payload)
            ) from None

        # keep only the relevant information for each nearby venue
        for v in results:
            try:
                category = v['categories'][0]['name']
            except (KeyError, IndexError):
                # venues without a category are skipped, as before
                continue

            venue_rows.append((
                name,
                lat,
                lng,
                v['name'],
                v['location']['lat'],
                v['location']['lng'],
                category,
            ))

    # Passing columns= keeps the schema stable even when venue_rows is empty.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [11]:
import os

# Search parameters used by the Foursquare requests.
# NOTE(review): confirm that the API honours a limit this high.
LIMIT = 500
radius = 5000

# Credentials are read from the environment so that secrets are never stored in
# the notebook. Keys that were committed in earlier versions of this notebook
# should be treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook.'
    )

# Value sent as the API version parameter `v`.
VERSION = '20190701'

In [13]:
# Foursquare category ID used for the search. The executed output of this cell
# lists Vietnamese restaurants for this ID (the ID previously written in the
# comment here did not match the one actually passed to the API).
VIETNAMESE_RESTAURANT_CATEGORY_ID = '4bf58dd8d48988d14a941735'

# NOTE: this rebinds `neighborhoods` to the Manhattan rows only.
neighborhoods = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
newyork_venues_Vietnamese_restaurant = getNearbyVenues(
    names=neighborhoods['Neighborhood'],
    latitudes=neighborhoods['Latitude'],
    longitudes=neighborhoods['Longitude'],
    radius=1000,
    categoryIds=VIETNAMESE_RESTAURANT_CATEGORY_ID,
)
newyork_venues_Vietnamese_restaurant.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Moon Star Grill,40.886238,-73.909961,Asian Restaurant
1,Chinatown,40.715618,-73.994279,Bêp Gà,40.717226,-73.993826,Vietnamese Restaurant
2,Chinatown,40.715618,-73.994279,Pho Vietnam,40.71753,-73.994128,Vietnamese Restaurant
3,Chinatown,40.715618,-73.994279,Saigon V-Bread Cafe LLC,40.714635,-73.99309,Vietnamese Restaurant
4,Chinatown,40.715618,-73.994279,Van Da,40.723477,-73.983016,Vietnamese Restaurant


In [14]:
newyork_venues_Vietnamese_restaurant.shape

(542, 7)

In [15]:
def addToMap(df, color, existingMap):
    """Add one circle marker per row of ``df`` to ``existingMap``.

    ``df`` must provide the columns 'Venue Latitude', 'Venue Longitude',
    'Neighborhood', 'Venue' and 'Venue Category'. Each marker is drawn in
    ``color`` and carries a popup reading ``<venue> (<category>) - <neighborhood>``.
    Nothing is returned; the map is modified in place.
    """
    marker_fields = (
        df['Venue Latitude'],
        df['Venue Longitude'],
        df['Neighborhood'],
        df['Venue'],
        df['Venue Category'],
    )
    for venue_lat, venue_lng, hood, venue_name, category in zip(*marker_fields):
        popup_text = '{} ({}) - {}'.format(venue_name, category, hood)
        marker = folium.CircleMarker(
            [venue_lat, venue_lng],
            radius=5,
            popup=folium.Popup(popup_text, parse_html=True),
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
        )
        marker.add_to(existingMap)

In [16]:
# New map centred on the geocoded coordinates, with the venues found above drawn in red.
map_center = [latitude, longitude]
map_Vietnamese_restaurant = folium.Map(location=map_center, zoom_start=10)
addToMap(
    df=newyork_venues_Vietnamese_restaurant,
    color='red',
    existingMap=map_Vietnamese_restaurant,
)

map_Vietnamese_restaurant

In [17]:
def addColumn(startDf, columnTitle, dataDf):
    """Add a per-neighborhood venue count to ``startDf`` in place.

    Parameters
    ----------
    startDf : pandas.DataFrame
        Frame with a 'Neighborhood' column; it receives the new column.
    columnTitle : str
        Name of the column to create (or overwrite) in ``startDf``.
    dataDf : pandas.DataFrame
        Frame with 'Neighborhood' and 'Venue' columns whose rows are counted.

    Neighborhoods that do not occur in ``dataDf`` get a count of 0.

    Raises
    ------
    KeyError
        If a required column is missing. Previously a bare ``except`` turned
        such errors into silent zeros.
    """
    venue_counts = dataDf.groupby('Neighborhood')['Venue'].count()
    # map() aligns the counts with every row at once; unmatched neighborhoods
    # come back as NaN and are turned into 0.
    startDf[columnTitle] = (
        startDf['Neighborhood'].map(venue_counts).fillna(0).astype(int)
    )

In [18]:
# Count the rows of the venue table per neighborhood (every remaining column shows that count).
# NOTE: the name manhattan_grouped is bound again further down to the per-neighborhood
# means of the one-hot table, so this value is only meant for display here.
manhattan_grouped = newyork_venues_Vietnamese_restaurant.groupby('Neighborhood').count()
manhattan_grouped
#print('There are {} uniques categories.'.format(len(newyork_venues_Vietnamese_restaurant['Venue Category'].unique())))

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Battery Park City,7,7,7,7,7,7
Carnegie Hill,4,4,4,4,4,4
Chelsea,6,6,6,6,6,6
Chinatown,43,43,43,43,43,43
Civic Center,40,40,40,40,40,40
Clinton,7,7,7,7,7,7
East Village,32,32,32,32,32,32
Financial District,9,9,9,9,9,9
Flatiron,15,15,15,15,15,15
Gramercy,23,23,23,23,23,23


#### Analyze Each Neighborhood

In [19]:
# Turn the venue category of every venue into indicator columns, one per category.
manhattan_onehot = pd.get_dummies(
    newyork_venues_Vietnamese_restaurant[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# Attach the neighborhood of each venue as an additional column.
manhattan_onehot['Neighborhood'] = newyork_venues_Vietnamese_restaurant['Neighborhood']

# Rotate the column order so that the last column becomes the first one.
all_columns = manhattan_onehot.columns.tolist()
fixed_columns = all_columns[-1:] + all_columns[:-1]
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_onehot.head()

Unnamed: 0,Neighborhood,Asian Restaurant,Chinese Restaurant,Food Truck,Middle Eastern Restaurant,Noodle House,Sandwich Place,Thai Restaurant,Vietnamese Restaurant
0,Marble Hill,1,0,0,0,0,0,0,0
1,Chinatown,0,0,0,0,0,0,0,1
2,Chinatown,0,0,0,0,0,0,0,1
3,Chinatown,0,0,0,0,0,0,0,1
4,Chinatown,0,0,0,0,0,0,0,1


In [29]:
# Average the indicator columns per neighborhood; 'Neighborhood' stays a regular column.
manhattan_grouped = manhattan_onehot.groupby('Neighborhood', as_index=False).mean()
manhattan_grouped

Unnamed: 0,Neighborhood,Asian Restaurant,Chinese Restaurant,Food Truck,Middle Eastern Restaurant,Noodle House,Sandwich Place,Thai Restaurant,Vietnamese Restaurant
0,Battery Park City,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.714286
1,Carnegie Hill,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Chelsea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Chinatown,0.0,0.069767,0.0,0.0,0.023256,0.046512,0.0,0.860465
4,Civic Center,0.0,0.075,0.0,0.0,0.025,0.05,0.0,0.85
5,Clinton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,East Village,0.0625,0.03125,0.0,0.0,0.0,0.0,0.03125,0.875
7,Financial District,0.0,0.0,0.111111,0.0,0.0,0.0,0.111111,0.777778
8,Flatiron,0.066667,0.066667,0.0,0.0,0.0,0.0,0.066667,0.8
9,Gramercy,0.043478,0.086957,0.0,0.0,0.0,0.0,0.086957,0.782609


In [30]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped; the remaining entries are ranked
    from largest to smallest and the index labels of the leading ones are
    returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [82]:
num_top_venues = 5


def _ordinal(n):
    """Return n with its English ordinal suffix, e.g. 1st, 2nd, 3rd, 4th, 11th, 21st."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# create columns according to number of top venues
# (explicit suffix rule instead of catching the IndexError of a three-item list)
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# rank the categories of every neighborhood, then build the table in one step
# rather than filling an empty dataframe row by row
top_venue_rows = [
    return_most_common_venues(manhattan_grouped.iloc[pos, :], num_top_venues)
    for pos in range(manhattan_grouped.shape[0])
]

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(
    top_venue_rows, columns=columns[1:], index=manhattan_grouped.index
)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', manhattan_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Battery Park City,Vietnamese Restaurant,Thai Restaurant,Food Truck,Sandwich Place,Noodle House
1,Carnegie Hill,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
2,Chelsea,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
3,Chinatown,Vietnamese Restaurant,Chinese Restaurant,Sandwich Place,Noodle House,Thai Restaurant
4,Civic Center,Vietnamese Restaurant,Chinese Restaurant,Sandwich Place,Noodle House,Thai Restaurant


#### Cluster Neighborhoods

In [83]:
# set number of clusters
kclusters = 5

# Features for clustering: every column except the neighborhood name.
# drop() takes the axis by keyword; the positional form drop('Neighborhood', 1)
# is rejected by pandas 2.0.
manhattan_grouped_clustering = manhattan_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is spelled out so the number of restarts does not depend on the
# default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 2, 2, 0, 0, 2, 0, 3, 4, 4], dtype=int32)

In [84]:
# add clustering labels
# (assign instead of insert when the column already exists, so the cell can be re-run)
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach cluster label and top venues to every Manhattan neighborhood; neighborhoods
# without a match get NaN from the join and are dropped.
manhattan_merged = (
    manhattan_data
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    .dropna()
)

# The join turns the labels into floats; store the integer version back.
# (Previously the astype(int) result was computed and discarded.)
manhattan_merged['Cluster Labels'] = manhattan_merged['Cluster Labels'].astype(int)
manhattan_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Manhattan,Marble Hill,40.876551,-73.91066,1.0,Asian Restaurant,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House
1,Manhattan,Chinatown,40.715618,-73.994279,0.0,Vietnamese Restaurant,Chinese Restaurant,Sandwich Place,Noodle House,Thai Restaurant
8,Manhattan,Upper East Side,40.775639,-73.960508,2.0,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
9,Manhattan,Yorkville,40.77593,-73.947118,2.0,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
10,Manhattan,Lenox Hill,40.768113,-73.95886,2.0,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant


In [None]:
# create map
map_clusters = folium.Map(location=[40.715618, -73.994279], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
cluster_points = zip(
    manhattan_merged['Latitude'],
    manhattan_merged['Longitude'],
    manhattan_merged['Neighborhood'],
    manhattan_merged['Cluster Labels'],
)
for lat, lon, poi, cluster in cluster_points:
    # Labels may arrive as floats (e.g. 1.0); a list index must be an int.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters - 1, so they index the palette directly.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [75]:
# Rows with cluster label 0, showing the column at position 1 plus all columns from position 5 onward.
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 0, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Chinatown,Vietnamese Restaurant,Chinese Restaurant,Sandwich Place,Noodle House,Thai Restaurant
15,Midtown,Vietnamese Restaurant,Thai Restaurant,Middle Eastern Restaurant,Sandwich Place,Noodle House
18,Greenwich Village,Vietnamese Restaurant,Sandwich Place,Chinese Restaurant,Thai Restaurant,Asian Restaurant
19,East Village,Vietnamese Restaurant,Asian Restaurant,Thai Restaurant,Chinese Restaurant,Sandwich Place
20,Lower East Side,Vietnamese Restaurant,Thai Restaurant,Noodle House,Chinese Restaurant,Asian Restaurant
21,Tribeca,Vietnamese Restaurant,Sandwich Place,Chinese Restaurant,Thai Restaurant,Noodle House
22,Little Italy,Vietnamese Restaurant,Chinese Restaurant,Sandwich Place,Noodle House,Thai Restaurant
23,Soho,Vietnamese Restaurant,Chinese Restaurant,Sandwich Place,Noodle House,Thai Restaurant
24,West Village,Vietnamese Restaurant,Asian Restaurant,Thai Restaurant,Sandwich Place,Noodle House
31,Noho,Vietnamese Restaurant,Chinese Restaurant,Sandwich Place,Thai Restaurant,Noodle House


In [76]:
# Rows with cluster label 1, showing the column at position 1 plus all columns from position 5 onward.
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 1, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Marble Hill,Asian Restaurant,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House


In [77]:
# Rows with cluster label 2, showing the column at position 1 plus all columns from position 5 onward.
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 2, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
8,Upper East Side,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
9,Yorkville,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
10,Lenox Hill,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
11,Roosevelt Island,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
12,Upper West Side,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
13,Lincoln Square,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
14,Clinton,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
17,Chelsea,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
25,Manhattan Valley,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant
26,Morningside Heights,Vietnamese Restaurant,Thai Restaurant,Sandwich Place,Noodle House,Middle Eastern Restaurant


In [78]:
# Rows with cluster label 3, showing the column at position 1 plus all columns from position 5 onward.
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 3, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
28,Battery Park City,Vietnamese Restaurant,Thai Restaurant,Food Truck,Sandwich Place,Noodle House
29,Financial District,Vietnamese Restaurant,Thai Restaurant,Food Truck,Sandwich Place,Noodle House


In [79]:
# Rows with cluster label 4, showing the column at position 1 plus all columns from position 5 onward.
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 4, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
16,Murray Hill,Vietnamese Restaurant,Thai Restaurant,Chinese Restaurant,Sandwich Place,Noodle House
27,Gramercy,Vietnamese Restaurant,Thai Restaurant,Chinese Restaurant,Asian Restaurant,Sandwich Place
34,Sutton Place,Vietnamese Restaurant,Thai Restaurant,Middle Eastern Restaurant,Sandwich Place,Noodle House
36,Tudor City,Vietnamese Restaurant,Thai Restaurant,Chinese Restaurant,Sandwich Place,Noodle House
37,Stuyvesant Town,Vietnamese Restaurant,Thai Restaurant,Asian Restaurant,Sandwich Place,Noodle House
38,Flatiron,Vietnamese Restaurant,Thai Restaurant,Chinese Restaurant,Asian Restaurant,Sandwich Place
