# Capstone Project
# Determining a suitable location for a new Chinese restaurant in Lincoln, Nebraska

In [1]:
import math
import pandas as pd
import numpy as np
import json, requests #these will help us to parse json responses from the FourSquare api
import folium #map builder
import matplotlib
import matplotlib.pyplot as plt #used to visualize the data
from sklearn.cluster import KMeans #for clustering our data
from geopy.geocoders import Nominatim #used to turn addresses into coordinate values

## Define variables and functions

In [2]:
import os

# SECURITY NOTE: the literal values below are credentials committed to source.
# They remain only as fallbacks so the notebook keeps running unchanged; rotate
# them and supply FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET through the
# environment instead.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'AOYVG5XV3PJOIB35CYPLZFPP1KY0E3VR01UJQELALMEM5PFB') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'UMBXGRZVABRHGXFWOHQ3UKHSILED4PZZOOTVESVR3CXXICK3') # your Foursquare Secret
VERSION = '20180604'

In [21]:
#function to convert address to latitude and longitude
def get_coords(address):
    """Geocode an address string with Nominatim.

    Parameters:
        address: the address text handed to ``Nominatim.geocode``.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the geocoder's result.

    Raises:
        ValueError: if the geocoder finds no match for ``address``.
    """
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    #geocode() yields None when nothing matches; fail with a readable message
    #instead of an AttributeError on the attribute access below
    if location is None:
        raise ValueError('Could not geocode address: {!r}'.format(address))
    return location.latitude, location.longitude

#function to return FourSquare api response results
def get_results(lat, lon, radius):
    """Query the Foursquare ``venues/explore`` endpoint around one point.

    Parameters:
        lat, lon: coordinates sent as the ``ll`` parameter.
        radius: value sent as the ``radius`` parameter.

    Returns:
        The ``items`` list of the first entry in the response's ``groups``,
        or an empty list when the reply carries no groups (for example an
        error reply).
    """
    url = 'https://api.foursquare.com/v2/venues/explore'

    #NOTE: the version date is pinned here; the module-level VERSION constant
    #is not used by this request
    params = dict(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        v='20180323',
        ll=str(lat) + ', ' + str(lon),
        limit=300,
        radius=radius,
    )
    #a timeout keeps one stalled connection from hanging a long scan of points
    resp = requests.get(url=url, params=params, timeout=30)
    payload = resp.json()

    #tolerate a missing 'response', a missing 'groups', or an empty groups list
    groups = (payload.get('response') or {}).get('groups') or []
    if not groups:
        return []
    return groups[0].get('items', [])

#function to retrieve counts of venues by category from FourSquare api
#populates a given dictionary with the region name and counts for all venue categories returned from FourSquare explore api
def get_category_counts(region_name, lat, lon, radius, region_dict):
    """Record one region's location and per-category venue counts.

    Parameters:
        region_name: key under which the region is stored in ``region_dict``.
        lat, lon, radius: passed to ``get_results`` and stored with the region.
        region_dict: dictionary updated in place; ``region_dict[region_name]``
            is set to a dict holding 'latitude', 'longitude', 'radius' and one
            entry per category ``shortName`` with its venue count.

    Returns:
        None.
    """
    results = get_results(lat, lon, radius)
    region_dict[region_name] = {'latitude':lat, 'longitude':lon, 'radius':radius }

    #tally the first category of every venue in a single pass
    counts = {}
    for result in results:
        categories = result['venue'].get('categories') or []
        if not categories:
            #a venue without any category cannot be counted under a name
            continue
        cat = categories[0]['shortName']
        counts[cat] = counts.get(cat, 0) + 1
    region_dict[region_name].update(counts)

#function to build a grid of coordinates in a square around a center coordinate pair
#returns a list of all grid point coordinates and two opposite corners of the overall grid
def get_grid(latitude, longitude, grid_pts, spread):
    """Build a ``grid_pts`` x ``grid_pts`` grid of points centered on a coordinate.

    Parameters:
        latitude, longitude: center of the grid, in degrees.
        grid_pts: number of points along each side of the grid.
        spread: spacing between neighboring points, in miles.

    Returns:
        A tuple ``(grid_coords, bounds)``:
        - ``grid_coords`` is a list of ``[name, lat, long, radius]`` entries.
          Rows run from the highest latitude downward and are labelled
          A, B, ..., Z, AA, AB, ...; within a row the points start at
          ``longitude + offset`` and step toward ``longitude - offset`` while
          the numeric suffix counts down from ``grid_pts`` to 1. ``radius`` is
          half the point spacing converted to whole meters.
        - ``bounds`` is ``[[max_lat, longitude - offset], [min_lat, longitude + offset]]``,
          i.e. two opposite corners of the grid.
    """
    alphas = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def row_label(row):
        #spreadsheet-style label so grids taller than 26 rows still get names
        label = ''
        row += 1
        while row > 0:
            row, rem = divmod(row - 1, 26)
            label = alphas[rem] + label
        return label

    steps = grid_pts - 1
    width = steps * spread
    #0.014493 is roughly 1/69: degrees of latitude per mile
    lat_offset = (width * 0.014493) / 2
    #degrees of longitude per mile shrink with the cosine of the latitude
    long_offset = (width/(math.cos(math.radians(latitude)) * 69.172))/2
    min_lat = latitude - lat_offset
    max_lat = latitude + lat_offset
    start_long = longitude + long_offset
    end_long = longitude - long_offset
    if steps == 0:
        #a single point: nothing to step over, so avoid dividing by zero
        lat_degs = 0.0
        long_degs = 0.0
        radius = int((spread * 1609.34)/2)
    else:
        lat_degs = (max_lat - min_lat)/steps
        long_degs = (end_long - start_long)/steps
        #half the point spacing, converted from miles to meters
        radius = int(((width * 1609.34)/steps)/2)
    grid_coords = []
    for i in range(0,grid_pts):
        for j in range(0,grid_pts):
            name = row_label(i) + str(grid_pts - j)
            cur_lat = max_lat - (i * lat_degs)
            cur_long = start_long + (j * long_degs)
            grid_coords.append([name, cur_lat, cur_long, radius])
    return grid_coords,[[max_lat,end_long],[min_lat,start_long]]

## Retrieve geographic and FourSquare data

In [4]:
#geocode the address used as the geographic center of the target city
address = '2100 Randolph Street, Lincoln, NE'
city_center = get_coords(address)
latitude, longitude = city_center
print('Latitude = {}, Longitude={}'.format(*city_center))

Latitude = 40.806209, Longitude=-96.6823237


In [40]:
#build a 26x26 grid of points 0.5 miles apart, centered on the city center coordinates
#(25 intervals per side, so the outermost points span 12.5mi x 12.5mi)
pts, bounds = get_grid(latitude, longitude, 26, 0.5)

#collect the venue counts per category for every grid point
#each query uses 1.5x the grid radius, so neighboring search circles overlap
region_dict = {}
for pt_name, pt_lat, pt_lon, pt_radius in pts:
    get_category_counts(pt_name, pt_lat, pt_lon, pt_radius * 1.5, region_dict)

In [32]:
#turn the per-region venue counts into a dataframe with one row per grid region
grid_df = pd.DataFrame.from_dict(region_dict).T

#a category never seen in a region comes through as NaN; treat it as a zero count
grid_df = grid_df.fillna(0)

#name the index, then show the shape and the first rows
grid_df.index.name = 'region'
print(grid_df.shape)
grid_df.head()

(676, 306)


Unnamed: 0_level_0,ATM,Accessories,Adult Boutique,Airport,Airport Service,American,Antiques,Apparel,Arcade,Art Gallery,...,Wine Shop,Wings,Women's Store,Yoga Studio,Yogurt,Zoo,Zoo Exhibit,latitude,longitude,radius
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.562953,603.0
A25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.572503,603.0
A24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.582052,603.0
A23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.591602,603.0
A22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.601152,603.0


In [33]:
#columns that are not venue-category counts (or that later steps leave out)
strip_cols = ['latitude','longitude','radius','total_venues','cluster','Intersection']

#total the remaining category columns row by row and store the result per region
venue_cols = grid_df.columns.difference(strip_cols)
grid_df['total_venues'] = grid_df[venue_cols].sum(axis=1)
grid_df.head()

Unnamed: 0_level_0,ATM,Accessories,Adult Boutique,Airport,Airport Service,American,Antiques,Apparel,Arcade,Art Gallery,...,Wings,Women's Store,Yoga Studio,Yogurt,Zoo,Zoo Exhibit,latitude,longitude,radius,total_venues
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.562953,603.0,0.0
A25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.572503,603.0,1.0
A24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.582052,603.0,1.0
A23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.591602,603.0,0.0
A22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.89679,-96.601152,603.0,1.0


## Analyze the data

In [34]:
#sum every category over the whole grid and show the 20 largest totals
category_totals = grid_df[grid_df.columns.difference(strip_cols)].sum(axis=0)
category_totals.sort_values(ascending=False).head(20)

Fast Food            169.0
Sandwiches           140.0
Pizza                128.0
Convenience Store    110.0
Mexican               87.0
Coffee Shop           87.0
Park                  86.0
Bar                   83.0
American              77.0
Hotel                 70.0
Pharmacy              67.0
Gym / Fitness         64.0
Grocery Store         58.0
Chinese               57.0
Gas Station           54.0
Construction          48.0
Spa                   44.0
Gym                   42.0
Burgers               40.0
Ice Cream             39.0
dtype: float64

### Chinese venues are the sixth most frequent food-related venues in Lincoln:
Fast Food  
Sandwiches  
Pizza  
Mexican  
American
#### Chinese
Burgers  
Ice Cream


## Clustering the data based on FourSquare venue counts
Clustering the data should provide us with a useful way to categorize the different areas of the city.  
We can use this information to determine which areas should be most suited to starting a Chinese restaurant.

In [35]:
# set number of clusters
kclusters = 6
# leave out the derived total and any 'cluster' labels left behind by an earlier
# run of this cell, so re-running does not feed old labels back in as a feature
# NOTE: latitude, longitude and radius are still part of the feature set
vens_df = grid_df[grid_df.columns.difference(['total_venues', 'cluster'])]
# run k-means clustering
# n_init is pinned to 10 because its default changed to 'auto' in scikit-learn 1.4
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(vens_df)

# store the cluster label generated for each row in the dataframe
grid_df['cluster'] = kmeans.labels_

### Analyzing the clusters

In [36]:
#average every category count within each cluster, leaving out the location
#columns, the total, and the 'Intersection' and 'Construction' categories
non_venue_cols = ['latitude', 'longitude', 'radius',
                  'total_venues', 'Intersection', 'Construction']
cluster_cols = grid_df.columns.difference(non_venue_cols)
group_means = grid_df[cluster_cols].groupby('cluster').mean().T

#show the 10 categories with the highest average in each cluster
for cluster_id in range(kclusters):
    top_ten = group_means[[cluster_id]].nlargest(10, cluster_id)
    print(top_ten)
    print()

cluster                   0
Fast Food          1.524590
Sandwiches         0.819672
Pizza              0.721311
American           0.606557
Pharmacy           0.573770
Hotel              0.557377
Convenience Store  0.540984
Chinese            0.491803
Mexican            0.491803
Grocery Store      0.442623

cluster                1
Park               0.068
Bar                0.066
Fast Food          0.056
Baseball Field     0.054
Pizza              0.054
Gym / Fitness      0.048
Convenience Store  0.046
Golf Course        0.046
Lake               0.046
Sandwiches         0.038

cluster         2
Bar           7.0
Sandwiches    6.5
Pizza         3.5
Brewery       2.5
Burgers       2.5
Coffee Shop   2.5
Mexican       2.5
Cocktail      2.0
Concert Hall  2.0
Hotel         2.0

cluster             3
Apparel           6.5
Women's Store     3.0
Department Store  2.0
Lingerie          2.0
Mexican           2.0
Shoes             2.0
Accessories       1.5
Gift Shop         1.5
Pharmacy          

### We can make some observations about the clusters based on their most common venues

### Cluster 0:
Tier 3 commercial<br>
Low-Medium venue density<br>
Primarily Food and Service venues

### Cluster 1:
Tier 2 Residential/Rural<br>
Low venue density<br>
Primarily recreation/housing

### Cluster 2:
Tier 1 commercial<br>
High venue density<br>
Primarily Bars/Food/Entertainment

### Cluster 3:
Tier 1 commercial<br>
High venue density<br>
Primarily Shopping/Food

### Cluster 4:
Tier 2 commercial<br>
Medium venue density<br>
Primarily Food

### Cluster 5:
Tier 1 Residential<br>
Low venue density<br>
Primarily Housing

### Because of the low venue density and primary venue types in clusters 1 and 5, they most likely won't provide good locations for a Chinese restaurant.  We will exclude these clusters from consideration.

## Map the grid regions included in the remaining clusters

In [37]:
#build the map
map_grid = folium.Map(location=[latitude, longitude], zoom_start=11)
map_grid.fit_bounds(bounds)

#define cluster colors (clusters 1 and 5 are never drawn, so their entries are placeholders)
clrs = ['green','white','blue','yellow','purple','white']
rects = []
#add the grid to the map and color code based on the cluster (exclude clusters 1 and 5)
#the columns are walked in lockstep rather than indexed with grid_df[col][ix]:
#integer keys on a string-labelled Series are positional only through a deprecated fallback
grid_rows = zip(grid_df.index,
                grid_df['latitude'],
                grid_df['longitude'],
                grid_df['radius'],
                grid_df['total_venues'],
                grid_df['cluster'],
                grid_df['Chinese'])
for region, lat, lng, radius, ven_count, cluster, chinese_count in grid_rows:
    if cluster in (1,5):
        continue
    cluster = int(cluster)
    color = clrs[cluster]
    #draw a border only when the region has at least one Chinese venue
    stroke = bool(chinese_count != 0)
    #radius is in meters; convert to miles and build a 2-point grid to get the rectangle corners
    _,bnds = get_grid(lat,lng,2,radius*0.000621371 * 1.3)
    label = folium.Popup(region + ' Venues:' + str(ven_count) + ' Cluster:' + str(cluster), parse_html=True)
    rect = folium.Rectangle(
        bounds=bnds,
        popup=label,
        color='red',
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        stroke=stroke,
        parse_html=False)
    rect.add_to(map_grid)
    rects.append(rect)

map_grid

## Analyzing the map  
The map makes it easy to see the grid regions in the remaining clusters.   
Regions with a Red border are those which already have at least one Chinese venue.  
The Blue regions identify the downtown area which is mostly bars, restaurants, and entertainment.  
The largest mall in the city is located within the Yellow area.  
The Green and Purple regions indicate smaller commercial areas; Purple regions have higher venue density than Green regions.

The resulting map clearly shows that the Southwest corner of town has the fewest Chinese venues.  
The area also has a large group of grid regions which fall into Cluster 0 (Green) which means that they would likely support a Chinese venue.  
On closer inspection, we can see that the area on Pine Lake Road between 14th Street and 27th Street looks most promising.  
Within this area there is a mid-sized shopping mall, grocery, and fast food.  
It appears to be a thriving commercial area that is only missing one thing, a new Chinese restaurant.