### Import libraries and declare Foursquare credentials

In [1]:
import pandas as pd # library for data analsysis
import json # library to handle JSON files
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import numpy as np # library to handle data in a vectorized manner
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium # map rendering library

In [2]:
CLIENT_ID = '*redacted*' # your Foursquare ID
CLIENT_SECRET = '*redacted*' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version



### Get foursquare data

### Create functions to search API
'getNearbyVenues' explores venues nearby the lat/long area, returns Dataframe with appended venue location and type
'getVenues' explicitly looks for transit/hotels near lat/long area, returns Dataframe with appended venue location and type

In [3]:
def getVenues(names, latitudes, longitudes, radius, categoryId):
    i= 0
    venues_list=[]
    #for x in range (0): 
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        #LIMIT,
        categoryId        
        )

        # make the GET request
        results = requests.get(url).json()["response"]['venues']#['groups'][0]['items']
        i = i+1
        # return only relevant information for each nearby venue
        venues_list.append([(
        name, 
        lat, 
        lng,
        categoryId,
        v['name'], 
        v['location']['lat'], 
        v['location']['lng'],  
        v['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
              'Neighborhood Latitude', 
              'Neighborhood Longitude', 
              'Category',  
              'Venue',
              'Venue Latitude', 
              'Venue Longitude', 
              'Venue Category']
    
    return(nearby_venues)

In [4]:
def getNearbyVenues(names, latitudes, longitudes, radius, section):
    i= 0
    venues_list=[]
    for x in range (0,1): 
        for name, lat, lng in zip(names, latitudes, longitudes):
            print(name)
            
            # create the API request URL
            url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&section={}&offset={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET, 
                VERSION, 
                lat, 
                lng, 
                radius, 
               # LIMIT,
                section,
                i
                )
            
            # make the GET request
            results = requests.get(url).json()["response"]['groups'][0]['items']
            i = i+1
            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng,
                section,
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
             
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Category',  
                  'Venue',
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Define query data
(borough data obtained via google. Category IDs from the Foursquare API dev page)

In [5]:
#Put NYC borough data into a DF
city_data = [['manhattan', 40.7831, -73.9712],['staten island',40.5795, -74.1502],['queens',40.7282, -73.7949],['the bronx',40.8448, -73.8648],['brooklyn', 40.6782, -73.9442] ]
borough_values = pd.DataFrame(city_data, columns=['Name', 'Lat', 'Long'])

#Set category ID for hotels and transport since API requires ID for searching
hotel_id = '4bf58dd8d48988d1fa931735'
trans_id = '4d4b7105d754a06379d81259'


### Run the functions
Generate Dataframes for comparision

In [6]:
City_venues_transport = getVenues(names = borough_values['Name'],
                                   latitudes = borough_values['Lat'],
                                   longitudes = borough_values['Long'],
                                   radius = 15000,
                                   categoryId = trans_id
                                  )

City_venues_hotel = getVenues(names = borough_values['Name'],
                                   latitudes = borough_values['Lat'],
                                   longitudes = borough_values['Long'],
                                   radius = 15000,
                                   categoryId = hotel_id
                                  )



manhattan
staten island
queens
the bronx
brooklyn
manhattan
staten island
queens
the bronx
brooklyn


In [7]:
#Get from API into dataframes.
#Food and Activities use getNearbyVenues function as their categories can be searched using the API's 'explore' value
#Hotel and Transport use getVenues function as their categories need to be explicitly searched for using the API's 'search' value
City_venues_food = getNearbyVenues(names = borough_values['Name'],
                                   latitudes = borough_values['Lat'],
                                   longitudes = borough_values['Long'],
                                   radius = 15000,
                                   section = 'food'
                                  )


City_venues_activities = getNearbyVenues(names = borough_values['Name'],
                                   latitudes = borough_values['Lat'],
                                   longitudes = borough_values['Long'],
                                   radius = 15000,
                                   section = 'Arts & Entertainment'
                                  )

manhattan
staten island
queens
the bronx
brooklyn
manhattan
staten island
queens
the bronx
brooklyn


### Drop duplicate data

In [8]:
#Drop any duplicates that might have returned
City_venues_food2  = City_venues_food.drop_duplicates()
City_venues_activities2 = City_venues_activities.drop_duplicates()
City_venues_transport2 = City_venues_transport.drop_duplicates()
City_venues_hotel2 = City_venues_hotel.drop_duplicates()

### Plot venues on Map

In [9]:
#Map data points on map
address = 'New York City, New York'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_NYC = folium.Map(location=[latitude, longitude], zoom_start=10)


In [10]:
for lat, lng, borough, neighborhood in zip(City_venues_transport['Venue Latitude'], City_venues_transport['Venue Longitude'], City_venues_transport['Venue'], City_venues_transport['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NYC)  
  
map_NYC

### Prep data for grouping to determine most common venue by type:
Start with food venue

In [11]:
City_venues_food2.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(City_venues_food2['Venue Category'].unique())))


There are 44 uniques categories.


In [12]:
Food_onehot = pd.get_dummies(City_venues_food2[['Venue Category']], prefix="", prefix_sep="")
Food_onehot['Neighborhood'] = City_venues_food2['Neighborhood'] 
fixed_columns = [Food_onehot.columns[-1]] + list(Food_onehot.columns[:-1])
Food_onehot = Food_onehot[fixed_columns]

In [13]:

Food_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Arepa Restaurant,Bagel Shop,Bakery,Bistro,Burger Joint,Café,Cantonese Restaurant,Caribbean Restaurant,...,Restaurant,Salad Place,Seafood Restaurant,Southern / Soul Food Restaurant,Spanish Restaurant,Sushi Restaurant,Taco Place,Tapas Restaurant,Thai Restaurant,Vietnamese Restaurant
0,manhattan,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,manhattan,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,manhattan,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,manhattan,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,manhattan,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [14]:
food_grouped = Food_onehot.groupby('Neighborhood').mean().reset_index()
food_grouped

food_grouped.shape

(5, 45)

In [15]:
num_top_venues = 5
for hood in food_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = food_grouped[food_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----brooklyn----
                     venue  freq
0              Pizza Place  0.20
1  New American Restaurant  0.10
2       Italian Restaurant  0.10
3     Caribbean Restaurant  0.10
4               Donut Shop  0.07


----manhattan----
                venue  freq
0              Bakery  0.17
1  Seafood Restaurant  0.10
2    Sushi Restaurant  0.10
3         Pizza Place  0.10
4     Thai Restaurant  0.07


----queens----
                 venue  freq
0          Pizza Place  0.30
1    Korean Restaurant  0.10
2               Bakery  0.10
3  Dumpling Restaurant  0.10
4     Sushi Restaurant  0.07


----staten island----
                 venue  freq
0          Pizza Place  0.23
1   Italian Restaurant  0.17
2               Bakery  0.17
3  Japanese Restaurant  0.07
4         Burger Joint  0.07


----the bronx----
                 venue  freq
0          Pizza Place  0.17
1        Deli / Bodega  0.13
2   Italian Restaurant  0.10
3               Bakery  0.10
4  American Restaurant  0.07




In [16]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]	

### Declare varible to sort values, then sort

In [17]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
cv_food2 = pd.DataFrame(columns=columns)
cv_food2['Neighborhood'] = food_grouped['Neighborhood']

for ind in np.arange(cv_food2.shape[0]):
    cv_food2.iloc[ind, 1:] = return_most_common_venues(food_grouped.iloc[ind, :], num_top_venues)

cv_food2.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,brooklyn,Pizza Place,Caribbean Restaurant,Italian Restaurant,New American Restaurant,Donut Shop,Vietnamese Restaurant,Food Court,Israeli Restaurant,Japanese Restaurant,Latin American Restaurant
1,manhattan,Bakery,Sushi Restaurant,Seafood Restaurant,Pizza Place,French Restaurant,Thai Restaurant,Greek Restaurant,North Indian Restaurant,Café,Comfort Food Restaurant
2,queens,Pizza Place,Korean Restaurant,Bakery,Dumpling Restaurant,Greek Restaurant,Sushi Restaurant,Bagel Shop,Seafood Restaurant,Café,Tapas Restaurant
3,staten island,Pizza Place,Bakery,Italian Restaurant,Japanese Restaurant,Burger Joint,Fast Food Restaurant,Diner,Mexican Restaurant,Deli / Bodega,Indian Restaurant
4,the bronx,Pizza Place,Deli / Bodega,Italian Restaurant,Bakery,American Restaurant,Diner,Mexican Restaurant,Japanese Restaurant,Cuban Restaurant,Latin American Restaurant


### Run again for activities

In [18]:
Act_onehot = pd.get_dummies(City_venues_activities[['Venue Category']], prefix="", prefix_sep="")
Act_onehot['Neighborhood'] = City_venues_activities['Neighborhood'] 
fixed_columns = [Act_onehot.columns[-1]] + list(Act_onehot.columns[:-1])
Act_onehot = Act_onehot[fixed_columns]

In [19]:
Act_grouped = Act_onehot.groupby('Neighborhood').mean().reset_index()
Act_grouped

Act_grouped.shape

(5, 24)

In [20]:
num_top_venues = 5
for hood in Act_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Act_grouped[Act_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----brooklyn----
                 venue  freq
0          Music Venue  0.23
1              Theater  0.13
2            Rock Club  0.13
3        Movie Theater  0.10
4  Indie Movie Theater  0.07


----manhattan----
                   venue  freq
0             Art Museum  0.20
1           Concert Hall  0.13
2                Theater  0.10
3              Jazz Club  0.07
4  Performing Arts Venue  0.07


----queens----
           venue  freq
0  Movie Theater  0.30
1    Music Venue  0.27
2   Concert Hall  0.07
3         Museum  0.07
4     Art Museum  0.07


----staten island----
            venue  freq
0   Movie Theater  0.17
1          Museum  0.13
2         Theater  0.10
3     Music Venue  0.10
4  History Museum  0.07


----the bronx----
           venue  freq
0     Art Museum  0.23
1        Exhibit  0.20
2        Theater  0.10
3         Museum  0.10
4  Movie Theater  0.07




In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
City_venues_activities = pd.DataFrame(columns=columns)
City_venues_activities['Neighborhood'] = Act_grouped['Neighborhood']

for ind in np.arange(City_venues_activities.shape[0]):
    City_venues_activities.iloc[ind, 1:] = return_most_common_venues(Act_grouped.iloc[ind, :], num_top_venues)

City_venues_activities.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,brooklyn,Music Venue,Theater,Rock Club,Movie Theater,Opera House,History Museum,Indie Movie Theater,Art Gallery,Performing Arts Venue,Street Art
1,manhattan,Art Museum,Concert Hall,Theater,Planetarium,Performing Arts Venue,Jazz Club,Exhibit,Dance Studio,Indie Movie Theater,Movie Theater
2,queens,Movie Theater,Music Venue,Museum,Art Museum,Concert Hall,Multiplex,Exhibit,Indie Movie Theater,Mini Golf,Street Art
3,staten island,Movie Theater,Museum,Theater,Music Venue,Art Museum,Concert Hall,History Museum,Art Gallery,Performing Arts Venue,Street Art
4,the bronx,Art Museum,Exhibit,Theater,Museum,Planetarium,Movie Theater,Concert Hall,History Museum,Street Art,Art Gallery


In [22]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
City_venues_food = pd.DataFrame(columns=columns)
City_venues_food['Neighborhood'] = food_grouped['Neighborhood']

for ind in np.arange(City_venues_activities.shape[0]):
    City_venues_food.iloc[ind, 1:] = return_most_common_venues(food_grouped.iloc[ind, :], num_top_venues)

City_venues_food.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,brooklyn,Pizza Place,Caribbean Restaurant,Italian Restaurant,New American Restaurant,Donut Shop,Vietnamese Restaurant,Food Court,Israeli Restaurant,Japanese Restaurant,Latin American Restaurant
1,manhattan,Bakery,Sushi Restaurant,Seafood Restaurant,Pizza Place,French Restaurant,Thai Restaurant,Greek Restaurant,North Indian Restaurant,Café,Comfort Food Restaurant
2,queens,Pizza Place,Korean Restaurant,Bakery,Dumpling Restaurant,Greek Restaurant,Sushi Restaurant,Bagel Shop,Seafood Restaurant,Café,Tapas Restaurant
3,staten island,Pizza Place,Bakery,Italian Restaurant,Japanese Restaurant,Burger Joint,Fast Food Restaurant,Diner,Mexican Restaurant,Deli / Bodega,Indian Restaurant
4,the bronx,Pizza Place,Deli / Bodega,Italian Restaurant,Bakery,American Restaurant,Diner,Mexican Restaurant,Japanese Restaurant,Cuban Restaurant,Latin American Restaurant


### Use K means clustering to see if there is a pattern in data
Declare variables and prep the data for clustering

In [23]:
kclusters = 5

food_grouped_clustering = food_grouped.drop('Neighborhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(food_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 2, 1, 4, 0], dtype=int32)

In [24]:
# add clustering labels
food_grouped.insert(0, 'Cluster Labels', kmeans.labels_)


### Take data and plot the points onto a map to show the clustering.

In [25]:
Food_merged = City_venues_food2
Food_merged = Food_merged.join(food_grouped.set_index('Neighborhood'), on='Neighborhood')
Food_merged.head() # check the last columns!


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Category,Venue,Venue Latitude,Venue Longitude,Venue Category,Cluster Labels,American Restaurant,...,Restaurant,Salad Place,Seafood Restaurant,Southern / Soul Food Restaurant,Spanish Restaurant,Sushi Restaurant,Taco Place,Tapas Restaurant,Thai Restaurant,Vietnamese Restaurant
0,manhattan,40.7831,-73.9712,food,Jacob's Pickles,40.786653,-73.975622,Southern / Soul Food Restaurant,2,0.033333,...,0.0,0.033333,0.1,0.033333,0.033333,0.1,0.033333,0.0,0.066667,0.0
1,manhattan,40.7831,-73.9712,food,Orwasher's Bakery,40.784555,-73.977389,Bakery,2,0.033333,...,0.0,0.033333,0.1,0.033333,0.033333,0.1,0.033333,0.0,0.066667,0.0
2,manhattan,40.7831,-73.9712,food,Levain Bakery,40.781513,-73.97926,Bakery,2,0.033333,...,0.0,0.033333,0.1,0.033333,0.033333,0.1,0.033333,0.0,0.066667,0.0
3,manhattan,40.7831,-73.9712,food,Levain Bakery,40.779836,-73.980519,Bakery,2,0.033333,...,0.0,0.033333,0.1,0.033333,0.033333,0.1,0.033333,0.0,0.066667,0.0
4,manhattan,40.7831,-73.9712,food,The Mermaid Inn,40.788744,-73.974243,Seafood Restaurant,2,0.033333,...,0.0,0.033333,0.1,0.033333,0.033333,0.1,0.033333,0.0,0.066667,0.0


In [26]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Food_merged['Venue Latitude'], Food_merged['Venue Longitude'], Food_merged['Neighborhood'], Food_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters