# Capstone Final Project - The battle of the neighborhoods

## Finding the best location for a Italian restaurant supplier in New York City

### Step 1. Set up

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


#### Step 1.1 Import Data for New York City

In [2]:
#get a file from a posted location
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [4]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [7]:
#take a look at the data and then comment the line out to clean up the notebook
#newyork_data

#### Step 1.2 Data Exploration

In [8]:
#relevant data is in the features key
neighborhoods_data = newyork_data['features']

#### Step 1.3 Transform into a Pandas Dataframe

In [10]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

#### Step 1.4 Loop through the data one row at time

In [12]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [13]:
#examine the data
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [14]:
# Check the results are correct with 5 boroughs and 306 neighborhoods
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


#### Step 1.5 Map the neighborhoods 

In [15]:
#Use geopy library to get latitude and longitude values for New York City.
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


#### Step 1.6 Use Folium to create a map of New York with neighborhood markers

In [17]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='orange',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

### Step 2 Add Neighborhood Data

#### Step 2.1 Limit data to Manhattan

In [55]:
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


#### Step 2.2 Use Foursquare to get neighborhood data

In [19]:
CLIENT_ID = 'HDLIBLSOU5KHWQHHUKKDXMHRV3RP45WA1Z2N5XA35UZR5SIN' # your Foursquare ID
CLIENT_SECRET = 'OHRZ5ZMTGJV02X5TQ1WCEXWQONH1SALSNTUCAYEGCQQP5L0R' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HDLIBLSOU5KHWQHHUKKDXMHRV3RP45WA1Z2N5XA35UZR5SIN
CLIENT_SECRET:OHRZ5ZMTGJV02X5TQ1WCEXWQONH1SALSNTUCAYEGCQQP5L0R


#### The following code was not used

In [32]:
search_query = 'Italian'
radius = 500 # define radius
LIMIT = 300 # limit of number of venues returned by Foursquare API
print(search_query + ' .... OK!')

Italian .... OK!


In [38]:
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, 500,300)
url

'https://api.foursquare.com/v2/venues/search?client_id=HDLIBLSOU5KHWQHHUKKDXMHRV3RP45WA1Z2N5XA35UZR5SIN&client_secret=OHRZ5ZMTGJV02X5TQ1WCEXWQONH1SALSNTUCAYEGCQQP5L0R&ll=40.7127281,-74.0060152&v=20180605&query=Italian&radius=500&limit=300'

In [40]:
results = requests.get(url).json()
#results

In [41]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [42]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

KeyError: 'groups'

#### Step 2.3 Create a function to get all the neighborhood data for New York

In [75]:
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            500,
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue_Category']
    
    return(nearby_venues)

#### Step 2.4 Now create a new dataframe called manhattan_venues

In [76]:
#this will print out a list of manhattan neighborhoods
manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


In [96]:
#Check the dataframe
print(manhattan_venues.shape)
manhattan_venues.head()

(2975, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue_Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop
4,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop


#### Step 2.5 Important - Limit the data to only Italian restarants

In [None]:
#Pizza places are also supplied by Italian restaurant supply !!!
italian_venues = manhattan_venues[manhattan_venues['Venue_Category'].str.contains('Pizza|Italian')]
#italian_venues = manhattan_venues[manhattan_venues.Venue_Category.contains(["Pizza", "Italian"])]


In [119]:
#check data
italian_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue_Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
30,Chinatown,40.715618,-73.994279,Scarr's Pizza,40.715335,-73.991649,Pizza Place
128,Washington Heights,40.851903,-73.9369,Saggio Restaurant,40.851423,-73.939761,Italian Restaurant
148,Washington Heights,40.851903,-73.9369,Fresco's Pizzeria,40.855202,-73.937216,Pizza Place
190,Washington Heights,40.851903,-73.9369,Exclusive Pizza,40.850989,-73.938635,Pizza Place


In [112]:
#See how many Italian locations there are per neighborhood
italian_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue_Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Battery Park City,3,3,3,3,3,3
Carnegie Hill,6,6,6,6,6,6
Central Harlem,1,1,1,1,1,1
Chelsea,4,4,4,4,4,4
Chinatown,1,1,1,1,1,1
Civic Center,1,1,1,1,1,1
Clinton,5,5,5,5,5,5
East Harlem,2,2,2,2,2,2
East Village,8,8,8,8,8,8
Financial District,4,4,4,4,4,4


In [113]:
#just checking my work, should be 2
print('There are {} uniques categories.'.format(len(italian_venues['Venue_Category'].unique())))

There are 2 uniques categories.


##### So to search for restaurant supply locations by neighborhood, the top five neighborhoods for Italian/Pizza places is: Noho, Greenwich Village, Lenox Hill, Upper East Side and Yorkville

## 3. Cluster Italian and Pizza Places

##### to find the best supply locations, use K means to find the most convenient central hub for 6 locations

In [114]:
# I sliced the orignial dataframe leaving only lat lon information for the restaurants
LatLon = italian_venues[['Venue Latitude', 'Venue Longitude']]

In [115]:
#checking work
LatLon.shape

(195, 2)

In [116]:
# set number of clusters
kclusters = 6

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(LatLon)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 4, 2, 2, 2, 2, 2, 2, 5, 5], dtype=int32)

In [117]:
# add clustering labels
LatLon.insert(0, 'Cluster Labels', kmeans.labels_)

italian_venues_merged = italian_venues

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
italian_venues_merged = italian_venues_merged.join(italian_venues.set_index('Neighborhood'), on='Neighborhood')

LatLon.head() # check the data

Unnamed: 0,Cluster Labels,Venue Latitude,Venue Longitude
0,2,40.874412,-73.910271
30,4,40.715335,-73.991649
128,2,40.851423,-73.939761
148,2,40.855202,-73.937216
190,2,40.850989,-73.938635


In [122]:
LatLon.shape

(195, 3)

#### Step 3.1 Create a Map of the Clusters

In [123]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, cluster in zip(LatLon['Venue Latitude'], LatLon['Venue Longitude'],  LatLon['Cluster Labels']):
    #label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        #popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Step 3.2 Find the centroids of the clusters - that will be the most central location for restaurant supply distribution

In [175]:
arr = np.array(kmeans.cluster_centers_)

Centroids = pd.DataFrame(data=arr)

Centroids.head(9)

Unnamed: 0,0,1
0,40.724716,-74.003523
1,40.774666,-73.957426
2,40.861437,-73.927276
3,40.754432,-73.983737
4,40.728416,-73.985932
5,40.816091,-73.953878


#### Step 3.3 Map the Centroids

In [177]:
# create map of New York using centroid latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng in zip(Centroids[0], Centroids[1]):
    #label = '{}, {}'.format(neighborhood, borough)
    #label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        #popup=label,
        color='purple',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [140]:
type(centers)

numpy.ndarray

#### Step 3.4 Reverse Geocode the Centroids

##### This code initially worked, but then started giving me incorrect addresses. I was not able to solve the problem

In [156]:
rows = centers.shape[0]
cols = centers.shape[1]

for i in range(0, rows):
        print(centers[i, 0], centers[i,1])

40.724716345267396 -74.00352308221301
40.77466568143069 -73.95742617448214
40.86143683422034 -73.92727560110009
40.75443247023935 -73.9837369509181
40.72841580966651 -73.98593219832586
40.81609097346036 -73.95387773951502


In [183]:
#from geopy.geocoders import Nominatim
#geolocator = Nominatim
    
rows = centers.shape[0]

#for i in range(0, rows):
location = geolocator.reverse(40.87, -73.91)
print(location.address)
#print(centers[0, 0], centers[0,1])

Las Artigas, Ráfales / Ràfels, Matarraña / Matarranya, Teruel, Aragón, 44589, España


#### Step 3.5 Final Step - Hand real estate agent the centroid map as a starting point for supply distribution location