# RESTAURANT NEIGHBORHOOD ANALYSIS - ARLINGTON, VA

### Import Packages

In [1]:
!pip install folium==0.5.0
!pip install geopy
!pip install beautifulsoup4

Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 9.6 MB/s  eta 0:00:01
[?25hCollecting branca
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76240 sha256=f4819a341af31c9b0b085d998a0cae8bbb793756adf488425a2b8702fbca1c6e
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/b2/2f/2c/109e446b990d663ea5ce9b078b5e7c1a9c45cca91f377080f8
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.5.0


In [2]:
#General
import pandas as pd
import numpy as np
import requests

# Web Scraping
import bs4 as bs

# Geospacial Info - convert address to latitude and longitude
from geopy.geocoders import Nominatim

# Mapping
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# Clustering 
from sklearn.cluster import KMeans

### Import Data and Build Dataframe(s)

#### Web Scraping

In [3]:
# Scrape neighborhood names from internet

sauce = requests.get('https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Arlington_County,_Virginia').text
soup = bs.BeautifulSoup(sauce, 'lxml')

neighborhood_names = [neighborhood.text for neighborhood in soup.find('tbody').find_all('li')]

print('There are {} neigbhorhoods in Arlington, VA.'.format(len(neighborhood_names)))

neighborhood_names[0:15]

There are 73 neigbhorhoods in Arlington, VA.


['Alcova Heights',
 'Arlington Forest',
 'Arlington Heights',
 'Arlington Ridge',
 "Arlington View / Johnson's Hill",
 'Ashton Heights',
 'Aurora Highlands',
 'Aurora Hills',
 'Ballston',
 'Barcroft',
 'Bellevue Forest',
 'Bluemont',
 'Bon Air',
 'Boulevard Manor',
 'Brandon Village']

In [4]:
#Format neighborhood names as neeeded and determine latitude/longitude
latitude = []
longitude = []

for i, neighborhood in enumerate(neighborhood_names):
    if ' /' in neighborhood:
        neighborhood_names[i] = neighborhood.split(' /')[0]
    elif ' (' in neighborhood:
        neighborhood_names[i] = neighborhood.split(' (')[0]
    
    try:
        geolocator = Nominatim(user_agent='arlington_explorer')
        location = geolocator.geocode('{}, Arlington, VA'.format(neighborhood))
        latitude.append(location.latitude)
        longitude.append(location.longitude)
    except:
        latitude.append('N/A')
        longitude.append('N/A')
        
# Neighborhoods with additional names or info (post formatting)
for i in [4, 40, 49, 64]:
    print(neighborhood_names[i])

Arlington View
High View Park
Nauck
Waycroft-Woodlawn


In [5]:
ava_neighborhoods = pd.DataFrame(list(zip(neighborhood_names, latitude, longitude)), columns=['Neighborhood', 'Latitude', 'Longitude'])
pd.set_option('display.max_rows', None)

ava_neighborhoods

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Alcova Heights,38.8646,-77.0972
1,Arlington Forest,38.8689,-77.1131
2,Arlington Heights,41.0062,-75.2126
3,Arlington Ridge,40.9841,-81.4939
4,Arlington View,,
5,Ashton Heights,,
6,Aurora Highlands,38.8528,-77.0684
7,Aurora Hills,38.8515,-77.0641
8,Ballston,38.882,-77.1115
9,Barcroft,38.8559,-77.1039


In [6]:
#For simplity will filter neighborhoods down to those where Nominatim returned good latitudes/longitudes

ava_neighborhoods.drop(ava_neighborhoods[ava_neighborhoods['Latitude'] == "N/A"].index, inplace = True)

m = (ava_neighborhoods['Latitude'].between(38,39)) & (ava_neighborhoods['Longitude'].between(-78,-77))

ava_filtered = ava_neighborhoods[m].reset_index(drop=True)

ava_filtered

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Alcova Heights,38.8646,-77.0972
1,Arlington Forest,38.8689,-77.1131
2,Aurora Highlands,38.8528,-77.0684
3,Aurora Hills,38.8515,-77.0641
4,Ballston,38.882,-77.1115
5,Barcroft,38.8559,-77.1039
6,Bellevue Forest,38.9143,-77.1136
7,Bluemont,38.8747,-77.133
8,Bon Air,38.8732,-77.1266
9,Brandon Village,38.8757,-77.1158


In [7]:
print('The dataframe has {} neighborhoods.'.format(ava_filtered.shape[0]))

The dataframe has 50 neighborhoods.


#### Foursquare API

In [8]:
# The code was removed by Watson Studio for sharing.

In [9]:
VERSION = '20201208'
LIMIT = 100
radius = 500 #meters

In [10]:
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
        
    return(nearby_venues)

In [11]:
ava_venues = getNearbyVenues(names = ava_filtered['Neighborhood'], latitudes = ava_filtered['Latitude'], longitudes = ava_filtered['Longitude'])
print(ava_venues.shape)
ava_venues.head()

(924, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Alcova Heights,38.864557,-77.097201,Redbox,38.868374,-77.097198,Video Store
1,Alcova Heights,38.864557,-77.097201,Burger King,38.860737,-77.094868,Fast Food Restaurant
2,Alcova Heights,38.864557,-77.097201,7-Eleven,38.868449,-77.097067,Convenience Store
3,Alcova Heights,38.864557,-77.097201,El Ranchero Migueleno,38.86071,-77.095183,Mexican Restaurant
4,Alcova Heights,38.864557,-77.097201,Alcova Heights,38.861586,-77.10147,Basketball Court


In [12]:
ava_restaurant = ava_venues[ava_venues['Venue Category'].str.contains('Restaurant')]

ava_restaurant.shape

(181, 7)

In [14]:
# Use only neighborhoods where there are 5 or more restaurants
ava_restaurant_count = ava_restaurant.groupby('Neighborhood').count()
temp = ava_restaurant_count[ava_restaurant_count['Venue']>=5]
select_neighborhoods = temp.index.tolist()

ava_restaurant_filtered = ava_restaurant[ava_restaurant['Neighborhood'].isin(select_neighborhoods)]

### Restructure Data

In [15]:
#one hot encoding to convert categorial values into numerical ones
ava_onehot = pd.get_dummies(ava_restaurant_filtered[['Venue Category']], prefix="", prefix_sep="")

ava_onehot['Neighborhood'] = ava_restaurant['Neighborhood']

fixed_columns = [ava_onehot.columns[-1]] + list(ava_onehot.columns[:-1])
ava_onehot = ava_onehot[fixed_columns]

ava_onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,American Restaurant,Caribbean Restaurant,Chinese Restaurant,Eastern European Restaurant,Ethiopian Restaurant,Fast Food Restaurant,Filipino Restaurant,French Restaurant,...,Restaurant,Russian Restaurant,Seafood Restaurant,South American Restaurant,Spanish Restaurant,Sushi Restaurant,Szechuan Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
27,Ballston,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
28,Ballston,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,Ballston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,Ballston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32,Ballston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
print('There are {} different venues in these Arlington neighborhoods, and a total of {} different restaurant categories.'.format(ava_onehot.shape[0], ava_onehot.shape[1]))

There are 144 different venues in these Arlington neighborhoods, and a total of 34 different restaurant categories.


In [17]:
# calculate the mean of each restaurant type for each neighborhood
ava_grouped = ava_onehot.groupby('Neighborhood').mean().reset_index()
ava_grouped

Unnamed: 0,Neighborhood,Afghan Restaurant,American Restaurant,Caribbean Restaurant,Chinese Restaurant,Eastern European Restaurant,Ethiopian Restaurant,Fast Food Restaurant,Filipino Restaurant,French Restaurant,...,Restaurant,Russian Restaurant,Seafood Restaurant,South American Restaurant,Spanish Restaurant,Sushi Restaurant,Szechuan Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,Ballston,0.0,0.166667,0.0,0.055556,0.0,0.0,0.055556,0.055556,0.0,...,0.111111,0.055556,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0
1,Buckingham,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
2,Carlin Springs,0.0,0.3,0.0,0.0,0.0,0.0,0.1,0.0,0.0,...,0.1,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
3,Claremont,0.0,0.2,0.0,0.2,0.0,0.0,0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Clarendon,0.0,0.214286,0.0,0.0,0.071429,0.0,0.0,0.0,0.071429,...,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.142857
5,Columbia Heights,0.0,0.222222,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.0,0.0
6,Garden City,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.111111,0.0,0.0
7,High View Park,0.166667,0.166667,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Lyon Park,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
9,Pentagon City,0.0,0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,...,0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111


In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    
    
    
    return row_categories_sorted.index.values[0: num_top_venues]

In [19]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ava_grouped['Neighborhood']

for ind in np.arange(ava_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ava_grouped.iloc[ind, :], num_top_venues)
    
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Ballston,American Restaurant,Mexican Restaurant,Mediterranean Restaurant,Restaurant,Indian Restaurant
1,Buckingham,Latin American Restaurant,Mexican Restaurant,Chinese Restaurant,Middle Eastern Restaurant,Mediterranean Restaurant
2,Carlin Springs,American Restaurant,Mexican Restaurant,Fast Food Restaurant,Mediterranean Restaurant,New American Restaurant
3,Claremont,Fast Food Restaurant,American Restaurant,Chinese Restaurant,Latin American Restaurant,Vietnamese Restaurant
4,Clarendon,American Restaurant,Vietnamese Restaurant,Persian Restaurant,Eastern European Restaurant,French Restaurant
5,Columbia Heights,Thai Restaurant,American Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Fast Food Restaurant
6,Garden City,Indian Restaurant,Mexican Restaurant,Thai Restaurant,Szechuan Restaurant,Sushi Restaurant
7,High View Park,Fast Food Restaurant,Afghan Restaurant,American Restaurant,Indian Restaurant,Italian Restaurant
8,Lyon Park,Korean Restaurant,Chinese Restaurant,South American Restaurant,Indian Restaurant,Mediterranean Restaurant
9,Pentagon City,Vietnamese Restaurant,Seafood Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Portuguese Restaurant


### Clustering

In [20]:
# K-Means Clustering

kclusters = 5

ava_grouped_clustering = ava_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ava_grouped_clustering)

In [21]:
# Build final dataframe that contains cluster info

neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ava_final = ava_filtered[ava_filtered['Neighborhood'].isin(select_neighborhoods)]

ava_final = ava_final.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

ava_final.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
4,Ballston,38.882,-77.1115,1,American Restaurant,Mexican Restaurant,Mediterranean Restaurant,Restaurant,Indian Restaurant
10,Buckingham,38.8734,-77.1066,4,Latin American Restaurant,Mexican Restaurant,Chinese Restaurant,Middle Eastern Restaurant,Mediterranean Restaurant
11,Carlin Springs,38.8772,-77.1118,1,American Restaurant,Mexican Restaurant,Fast Food Restaurant,Mediterranean Restaurant,New American Restaurant
13,Claremont,38.8432,-77.1047,0,Fast Food Restaurant,American Restaurant,Chinese Restaurant,Latin American Restaurant,Vietnamese Restaurant
14,Clarendon,38.8871,-77.0952,1,American Restaurant,Vietnamese Restaurant,Persian Restaurant,Eastern European Restaurant,French Restaurant


## Discussion

### Map Cluster

In [22]:
# Arlington,VA Lat/Long
latitude = 38.8816
longitude = -77.0910 

map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 13)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(ava_final['Latitude'], ava_final['Longitude'], ava_final['Neighborhood'], ava_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color = rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters

### Examine Clusters

In [27]:
# First Cluster
ava_final.loc[ava_final['Cluster Labels']==0, ava_final.columns[[0] + list(range(4, ava_final.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
13,Claremont,Fast Food Restaurant,American Restaurant,Chinese Restaurant,Latin American Restaurant,Vietnamese Restaurant
30,High View Park,Fast Food Restaurant,Afghan Restaurant,American Restaurant,Indian Restaurant,Italian Restaurant


In [28]:
# Second Cluster
ava_final.loc[ava_final['Cluster Labels']==1, ava_final.columns[[0] + list(range(4, ava_final.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
4,Ballston,American Restaurant,Mexican Restaurant,Mediterranean Restaurant,Restaurant,Indian Restaurant
11,Carlin Springs,American Restaurant,Mexican Restaurant,Fast Food Restaurant,Mediterranean Restaurant,New American Restaurant
14,Clarendon,American Restaurant,Vietnamese Restaurant,Persian Restaurant,Eastern European Restaurant,French Restaurant
37,Pentagon City,Vietnamese Restaurant,Seafood Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Portuguese Restaurant
47,Westover,Thai Restaurant,American Restaurant,Middle Eastern Restaurant,Chinese Restaurant,Italian Restaurant


In [29]:
# Third Cluster
ava_final.loc[ava_final['Cluster Labels']==2, ava_final.columns[[0] + list(range(4, ava_final.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
42,Rosslyn,Mediterranean Restaurant,Portuguese Restaurant,Vegetarian / Vegan Restaurant,Japanese Restaurant,Mexican Restaurant


In [30]:
# Fourth Cluster
ava_final.loc[ava_final['Cluster Labels']==3, ava_final.columns[[0] + list(range(4, ava_final.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
28,Garden City,Indian Restaurant,Mexican Restaurant,Thai Restaurant,Szechuan Restaurant,Sushi Restaurant
32,Lyon Park,Korean Restaurant,Chinese Restaurant,South American Restaurant,Indian Restaurant,Mediterranean Restaurant
45,Virginia Square,Afghan Restaurant,Middle Eastern Restaurant,Chinese Restaurant,Peruvian Restaurant,Fast Food Restaurant


In [31]:
# Fifth Cluster
ava_final.loc[ava_final['Cluster Labels']==4, ava_final.columns[[0] + list(range(4, ava_final.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
10,Buckingham,Latin American Restaurant,Mexican Restaurant,Chinese Restaurant,Middle Eastern Restaurant,Mediterranean Restaurant
16,Columbia Heights,Thai Restaurant,American Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Fast Food Restaurant
40,Randolph Square,American Restaurant,Mexican Restaurant,Ramen Restaurant,Italian Restaurant,Indian Restaurant
43,Shirlington,American Restaurant,Mexican Restaurant,Ramen Restaurant,Italian Restaurant,Indian Restaurant
46,Westmont,Mexican Restaurant,Thai Restaurant,Fast Food Restaurant,American Restaurant,Ethiopian Restaurant
