Let's begin by loading in libraries necessary for our data analysis

In [16]:
#import standard libraries
import pandas as pd
import numpy as np

#import urllib and beautifulsoup for wiki webcrawler
import urllib.request
from bs4 import BeautifulSoup

#library to handle JSON files
import urllib.request, json

#geocoders to convert address into latitude and longitude
from geopy.geocoders import Nominatim 

#import library to compute max and min values of our multipolygon JSON
from shapely.geometry import shape, mapping

#library to handle requests
import requests 
#tranform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

#Import map rendering library
import folium 

#Import k-means clustering 
from sklearn.cluster import KMeans


print('Libraries imported.')

Libraries imported.


Starting with New York City, lets import the neighborhood location data 

In [2]:
#load in the data from the download url
with urllib.request.urlopen("https://cocl.us/new_york_dataset") as url:
    jdata = json.loads(url.read().decode())
    #print(data)
    
#initialize dataframe
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
nycnb = pd.DataFrame(columns=column_names)

#loop through JSON and fill out data
for data in jdata['features']:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    nycnb = nycnb.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
    
nycnb.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


Next, lets narrow down our search to manhattan and brooklyn

In [3]:
nyc_man = nycnb[nycnb['Borough'] == 'Manhattan'].reset_index(drop=True)
nyc_bk = nycnb[nycnb['Borough'] == 'Brooklyn'].reset_index(drop=True)
nyc_data = nyc_man.append(nyc_bk)
nyc_data.reset_index(drop = True, inplace = True)
nyc_data

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.910660
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.936900
3,Manhattan,Inwood,40.867684,-73.921210
4,Manhattan,Hamilton Heights,40.823604,-73.949688
...,...,...,...,...
105,Brooklyn,Dumbo,40.703176,-73.988753
106,Brooklyn,Homecrest,40.598525,-73.959185
107,Brooklyn,Highland Park,40.681999,-73.890346
108,Brooklyn,Madison,40.609378,-73.948415


Now lets load in the data location data for Toronto Neighborhoods. To do this we will web-crawl a wikipedia page

In [4]:
#define the website to scrape 
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urllib.request.urlopen(url)

#load the html
soup = BeautifulSoup(page, "lxml")

#locate the table we want to retrieve 
tables = soup.find_all("table")
table = soup.find('table', class_ = 'wikitable sortable')

A = []
B = []
C = []

#loop through table and store columns in arrays
for row in table.findAll('tr'):
    cells = row.findAll('td')
    if len(cells) == 3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

#create dataframe and transfer data        
df = pd.DataFrame(A,columns=['Postal Code'])
df['Borough'] = B
df['Neighborhood'] = C

#drop rows with columns 'not assigned'
missing = df[df['Borough']=='Not assigned\n'].index
missing
df.drop(missing, inplace = True)

#load in the neighborhood coordinate data
coords = pd.read_csv('http://cocl.us/Geospatial_data')
coords.shape

df['Postal Code'] = df['Postal Code'].str.strip()
coords['Postal Code'] = coords['Postal Code'].str.strip()
tnt_data = pd.merge(df,coords, on='Postal Code')

#drop the postal code column
tnt_data.drop(columns = ['Postal Code'], inplace = True)
tnt_data

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York\n,Parkwoods\n,43.753259,-79.329656
1,North York\n,Victoria Village\n,43.725882,-79.315572
2,Downtown Toronto\n,"Regent Park, Harbourfront\n",43.654260,-79.360636
3,North York\n,"Lawrence Manor, Lawrence Heights\n",43.718518,-79.464763
4,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n",43.662301,-79.389494
...,...,...,...,...
98,Etobicoke\n,"The Kingsway, Montgomery Road, Old Mill North\n",43.653654,-79.506944
99,Downtown Toronto\n,Church and Wellesley\n,43.665860,-79.383160
100,East Toronto\n,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,Etobicoke\n,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


Now lets load in the data location data for Chicago Neighborhoods. To do this we import a csv file with all the coordinates

In [35]:
#first lets load the txt file with the multipolygon coordinate data for chicago neighborhoods
cdata = pd.read_csv('/Users/kbrannon/Downloads/parse-chicago-neighborhoods-master/community_to_gps.txt',sep ="\t")

#rename the columns of our dataframe
ccolumns = ['CA','Neighborhood','Latitude','Longitude']
cdata.columns = ccolumns
cdata.head()

Unnamed: 0,CA,Neighborhood,Latitude,Longitude
0,1,Rogers Park,42.003801,-87.657651
1,1,Rogers Park,42.002439,-87.657809
2,1,Rogers Park,41.99839,-87.657676
3,1,Rogers Park,42.009069,-87.661341
4,1,Rogers Park,42.00568,-87.660129


In [59]:
#define the dataframe columns
chi_columns = ['Neighborhood', 'Latitude', 'Longitude'] 

#create the empty dataframe
chc_data = pd.DataFrame(columns=chi_columns)

In [60]:
#loop through dataframe and find the center of each neighborhood
x = 1
while x < 78:
    #select the specific neighborhood
    df = cdata.loc[cdata['CA'] == x]
    name = df['Neighborhood'].unique()[0]
    
    #find the upper,lower,left, and right bounds of the neighborhood
    upper = max(df['Longitude'])
    lower = min(df['Longitude'])
    left = min(df['Latitude'])
    right = max(df['Latitude'])
    
    #calculate the center of each neighborhood
    neighborhood_lon = (upper + lower)/2
    neighborhood_lat = (left + right)/2
    
    #append the new data to our dataframe
    chc_data = chc_data.append({'Neighborhood': name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
    x = x + 1
    
chc_data.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Rogers Park,42.009377,-87.670152
1,West Ridge,42.001962,-87.690662
2,Uptown,41.965113,-87.657
3,Lincoln Square,41.972125,-87.687907
4,North Center,41.946898,-87.684309


Now that we have the neighborhood location data for all three cities, we need to merge them all into one dataframe. To do this, we will drop the borough column for New York City and Toronto, and will add a City column so we can keep track of which neighborhood is in which city.

In [61]:
#Drop the borough column for New York City and Toronto
nyc_data.drop(columns = ['Borough'], inplace = True)
tnt_data.drop(columns = ['Borough'], inplace = True)

In [65]:
#Add City column to each data frame 
nyc_data['City'] = 'New York'
tnt_data['City'] = 'Toronto'
chc_data['City'] = 'Chicago'

In [77]:
#Append the dataframe to create a master dataframe with all neighborhoods and locations
mdf = nyc_data.append([tnt_data,chc_data])
mdf.shape

(290, 4)

Now that we have a masterdataframe with all the neighborhood names, origin city, latitude, and longitude, we can make requests to the Foursquare API to get venues in each neighborhood. For this project, we are looking for venues within a 0.5 miles of each neighborhood (roughly 800 meters). We will set a limit of 100 venues for each request

In [94]:
#Initialize API Call with credentials
CLIENT_ID = '0FJGT5DZROG0WMMK3SOGFBSFJDFR3KFYSBKSBWKWRM3YDVLZ' 
CLIENT_SECRET = 'M203EQNGYAT3EBBBG0UC3FM5RXRNLIZIVYBZ0NHACC5GAEYJ' 
VERSION = '20180605' 

#Set radius and venue limit
LIMIT = 100 
radius = 500

#Define a function for retrieving venue data for each neighborhood
def Venues(names, latitudes, longitudes, city):
    #create empty array for venues
    venues_list=[]
    for name, lat, lng, cit in zip(names, latitudes, longitudes, city):
        print(name)    
        # structure request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make GET request
        datav = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng,
            cit,
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in datav])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Latitude', 
                  'Longitude',
                    'City',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [97]:
#Now lets call our function, retrieve the data, and upload it to a new dataframe
neighborhood_venues = Venues(names=mdf['Neighborhood'],
                                   latitudes=mdf['Latitude'],
                                   longitudes=mdf['Longitude'], 
                                    city=mdf['City']
                                  )

#Examine new dataframe
neighborhood_venues.head()

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker Heights
Gerritsen Beach
Marine

Unnamed: 0,Neighborhood,Latitude,Longitude,City,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,New York,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,New York,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,New York,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,New York,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,40.876551,-73.91066,New York,Dunkin',40.877136,-73.906666,Donut Shop


Now that he have up to 100 venues for each neighborhood, lets check the number of venues in each as well as the number of unique venue categories.

In [125]:
neighborhood_venues['Neighborhood'].unique()

array(['Marble Hill', 'Chinatown', 'Washington Heights', 'Inwood',
       'Hamilton Heights', 'Manhattanville', 'Central Harlem',
       'East Harlem', 'Upper East Side', 'Yorkville', 'Lenox Hill',
       'Roosevelt Island', 'Upper West Side', 'Lincoln Square', 'Clinton',
       'Midtown', 'Murray Hill', 'Chelsea', 'Greenwich Village',
       'East Village', 'Lower East Side', 'Tribeca', 'Little Italy',
       'Soho', 'West Village', 'Manhattan Valley', 'Morningside Heights',
       'Gramercy', 'Battery Park City', 'Financial District',
       'Carnegie Hill', 'Noho', 'Civic Center', 'Midtown South',
       'Sutton Place', 'Turtle Bay', 'Tudor City', 'Stuyvesant Town',
       'Flatiron', 'Hudson Yards', 'Bay Ridge', 'Bensonhurst',
       'Sunset Park', 'Greenpoint', 'Gravesend', 'Brighton Beach',
       'Sheepshead Bay', 'Manhattan Terrace', 'Flatbush', 'Crown Heights',
       'East Flatbush', 'Kensington', 'Windsor Terrace',
       'Prospect Heights', 'Brownsville', 'Williamsburg', 'B

In [113]:
#check the number of venues in each neighborhood
neighborhood_venues.groupby('Neighborhood').count()

#check how many unique venue categories there are
print('There are {} uniques categories.'.format(len(neighborhood_venues['Venue Category'].unique())))

There are 445 uniques categories.


Now lets calculate the mean freuqency occurence of each type of venue for each neighborhood

In [132]:
# one hot encoding
venues_onehot = pd.get_dummies(neighborhood_venues[['Venue Category']], prefix="", prefix_sep="")

#drop neighborhood column
venues_onehot.drop(columns =['Neighborhood'], inplace = True)

#add neighborhood column back to dataframe
venues_onehot['Neighborhood'] = neighborhood_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [venues_onehot.columns[-1]] + list(venues_onehot.columns[:-1])
venues_onehot = venues_onehot[fixed_columns]

venues_onehot.shape

(9549, 445)

In [133]:
#calculate the mean frequency occurence 
venue_freq = venues_onehot.groupby('Neighborhood').mean().reset_index()
venue_freq

Unnamed: 0,Neighborhood,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Warehouse Store,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,Agincourt\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
1,Albany Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
2,"Alderwood, Long Branch\n",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
3,Archer Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
4,Armour Square,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,Woburn\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
273,Woodbine Heights\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
274,Woodlawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
275,York Mills West\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0


Now that we have the venue frequency for each neighborhood, lets define each neighborhood by its top 10 venue cateogries and put that information into a dataframe

In [201]:
#first, lets define a function to get the top 10 venues
def most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#now lets call the function and create the new dataframe
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = venue_freq['Neighborhood']

for ind in np.arange(venue_freq.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = most_common_venues(venue_freq.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt\n,Breakfast Spot,Lounge,Latin American Restaurant,Clothing Store,Skating Rink,Exhibit,Egyptian Restaurant,Electronics Store,Empanada Restaurant,English Restaurant
1,Albany Park,Mexican Restaurant,Bus Station,Bakery,Korean Restaurant,Chinese Restaurant,Grocery Store,Park,Asian Restaurant,Taco Place,Karaoke Bar
2,"Alderwood, Long Branch\n",Pizza Place,Pub,Sandwich Place,Coffee Shop,Gym,Pool,Dance Studio,Ethiopian Restaurant,Entertainment Service,Empanada Restaurant
3,Archer Heights,Mexican Restaurant,Bakery,Hotel,Discount Store,Fast Food Restaurant,Nightclub,Electronics Store,Bar,Bank,Sandwich Place
4,Armour Square,Chinese Restaurant,Cosmetics Shop,Italian Restaurant,Sandwich Place,Asian Restaurant,Gas Station,Grocery Store,Indian Restaurant,Hot Dog Joint,Filipino Restaurant


We have the top 10 venues for each neighborhood. Before we run a cluster analysis, we know that the executives would like both bars and a gym to be present nearby. Lets locate all of the neighborhoods that do not have bars in their top 10 most common venues.

In [192]:
neighborhoods_venues_sorted.columns

Index(['Neighborhood', '1st Most Common Venue', '2nd Most Common Venue',
       '3rd Most Common Venue', '4th Most Common Venue',
       '5th Most Common Venue', '6th Most Common Venue',
       '7th Most Common Venue', '8th Most Common Venue',
       '9th Most Common Venue', '10th Most Common Venue'],
      dtype='object')

In [226]:
#identify which neighborhoods have bars in the top 10 venues
refined_neighborhoods = neighborhoods_venues_sorted[((neighborhoods_venues_sorted['1st Most Common Venue'] == 'Bar')|(neighborhoods_venues_sorted['2nd Most Common Venue'] == 'Bar') | (neighborhoods_venues_sorted['3rd Most Common Venue'] == 'Bar') | (neighborhoods_venues_sorted['4th Most Common Venue'] == 'Bar') | (neighborhoods_venues_sorted['5th Most Common Venue'] == 'Bar')|(neighborhoods_venues_sorted['6th Most Common Venue'] == 'Bar')|(neighborhoods_venues_sorted['7th Most Common Venue'] == 'Bar')|(neighborhoods_venues_sorted['8th Most Common Venue'] == 'Bar')|(neighborhoods_venues_sorted['9th Most Common Venue'] == 'Bar')|(neighborhoods_venues_sorted['10th Most Common Venue'] == 'Bar'))]
indicies = pd.Series(refined_neighborhoods.index)

#Create a new frequency dataframe with only neighborhoods that have high prevalence of bars
bar_neighborhood = venue_freq.iloc[indicies].reset_index(drop = True)
bar_neighborhood


Unnamed: 0,Neighborhood,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Warehouse Store,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,Archer Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ashburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Avondale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bay Ridge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628
4,Bedford Stuyvesant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.033333,0.033333,0.0,0.0,0.0,0.0
5,Boerum Hill,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011494,0.0,0.0,0.0,0.022989
6,Bridgeport,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0
7,Bushwick,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.133333,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Carnegie Hill,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.011494,0.034483,0.0,0.0,0.0,0.034483


As we can see by the shape of our neighborhood, we have narrowed down our search to 57 neighborhoods. Now, lets segment these neighborhoods into clusters. Our client would like a recommendation of 3-5 neighborhoods so lets segment into 10 clusters

In [247]:
# set number of clusters
kclusters = 10

neighborhood_clusters = bar_neighborhood.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(neighborhood_clusters)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

#add cluster labels
refined_neighborhoods['Cluster Labels'] = kmeans.labels_


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [248]:
refined_neighborhoods.reset_index(drop = True, inplace = True)
refined_neighborhoods

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,7,Archer Heights,Mexican Restaurant,Bakery,Hotel,Discount Store,Fast Food Restaurant,Nightclub,Electronics Store,Bar,Bank,Sandwich Place
1,8,Ashburn,Light Rail Station,Snack Place,Bar,Pizza Place,Martial Arts Dojo,Automotive Shop,Egyptian Restaurant,Electronics Store,Empanada Restaurant,English Restaurant
2,1,Avondale,Chinese Restaurant,Park,Food Truck,Donut Shop,Supermarket,Bar,Light Rail Station,Electronics Store,Sandwich Place,Grocery Store
3,1,Bay Ridge,Italian Restaurant,Spa,Bagel Shop,Pizza Place,Greek Restaurant,American Restaurant,Bar,Grocery Store,Sandwich Place,Playground
4,4,Bedford Stuyvesant,Coffee Shop,Deli / Bodega,Café,Pizza Place,Bar,Boutique,BBQ Joint,Cocktail Bar,Tiki Bar,Thrift / Vintage Store
5,4,Boerum Hill,Coffee Shop,Dance Studio,Bar,Arts & Crafts Store,Bakery,Furniture / Home Store,French Restaurant,Sandwich Place,Spa,Gym / Fitness Center
6,1,Bridgeport,Chinese Restaurant,Mexican Restaurant,Bar,Grocery Store,Art Gallery,Bus Station,Coffee Shop,Basketball Court,Korean Restaurant,Automotive Shop
7,4,Bushwick,Deli / Bodega,Bar,Mexican Restaurant,Coffee Shop,Thrift / Vintage Store,Bakery,Discount Store,Pizza Place,Pharmacy,Vegetarian / Vegan Restaurant
8,4,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Sculpture Garden,Boat or Ferry,Coffee Shop,Airport,Airport Food Court,Airport Terminal,Bar,Rental Car Location
9,4,Carnegie Hill,Coffee Shop,Café,Italian Restaurant,Bookstore,Gym,Gym / Fitness Center,Yoga Studio,Wine Shop,Bar,Grocery Store


Now, let's examine all 10 clusters to determine which group has venues that our start-up executives would prefer for their office location. Remember, we are looking for neighborhoods with lots of fun venues (bars,restaurants, entertainment, gym, cafe, parks, etc). 

Cluster 1

In [238]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 0, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,East Garfield Park,Outdoor Sculpture,Stadium,Hockey Arena,Basketball Stadium,Sports Bar,Fried Chicken Joint,Sporting Goods Shop,Bar,Gastropub,Circus


Cluster 2

In [239]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 1, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Avondale,Chinese Restaurant,Park,Food Truck,Donut Shop,Supermarket,Bar,Light Rail Station,Electronics Store,Sandwich Place,Grocery Store
3,Bay Ridge,Italian Restaurant,Spa,Bagel Shop,Pizza Place,Greek Restaurant,American Restaurant,Bar,Grocery Store,Sandwich Place,Playground
6,Bridgeport,Chinese Restaurant,Mexican Restaurant,Bar,Grocery Store,Art Gallery,Bus Station,Coffee Shop,Basketball Court,Korean Restaurant,Automotive Shop
11,Central Harlem,African Restaurant,Gym / Fitness Center,Pizza Place,French Restaurant,Bar,Chinese Restaurant,Art Gallery,Seafood Restaurant,American Restaurant,Beer Bar
14,Ditmas Park,Chinese Restaurant,Pizza Place,Pharmacy,Donut Shop,Caribbean Restaurant,Burger Joint,Theater,Bagel Shop,Laundromat,Bar
21,Edison Park,Bar,Italian Restaurant,American Restaurant,Mexican Restaurant,Hot Dog Joint,Theater,Bakery,Liquor Store,French Restaurant,Breakfast Spot
23,Gerritsen Beach,Ice Cream Shop,Pizza Place,Bar,Bagel Shop,Harbor / Marina,Liquor Store,Gas Station,Event Space,Park,Department Store
24,Gowanus,Bar,Italian Restaurant,Furniture / Home Store,Chinese Restaurant,Mexican Restaurant,Food Truck,Coffee Shop,Pizza Place,Gym / Fitness Center,Art Gallery
25,Gramercy,Bar,Italian Restaurant,Coffee Shop,Pizza Place,Bagel Shop,Grocery Store,Mexican Restaurant,Playground,Cocktail Bar,American Restaurant
26,Gravesend,Lounge,Italian Restaurant,Bakery,Chinese Restaurant,Pizza Place,Liquor Store,Gym,Furniture / Home Store,Bar,Metro Station


Cluster 3

In [240]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 2, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
53,West Pullman,Convenience Store,Bar,Train Station,Grocery Store,Fish Market,Event Service,Duty-free Shop,Eastern European Restaurant,Egyptian Restaurant,Flea Market


Cluster 4

In [241]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 3, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
48,"Rouge Hill, Port Union, Highland Creek\n",Bar,Yoga Studio,Food & Drink Shop,Egyptian Restaurant,Electronics Store,Empanada Restaurant,English Restaurant,Entertainment Service,Ethiopian Restaurant,Event Service


Cluster 5

In [242]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 4, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Bedford Stuyvesant,Coffee Shop,Deli / Bodega,Café,Pizza Place,Bar,Boutique,BBQ Joint,Cocktail Bar,Tiki Bar,Thrift / Vintage Store
5,Boerum Hill,Coffee Shop,Dance Studio,Bar,Arts & Crafts Store,Bakery,Furniture / Home Store,French Restaurant,Sandwich Place,Spa,Gym / Fitness Center
7,Bushwick,Deli / Bodega,Bar,Mexican Restaurant,Coffee Shop,Thrift / Vintage Store,Bakery,Discount Store,Pizza Place,Pharmacy,Vegetarian / Vegan Restaurant
8,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Sculpture Garden,Boat or Ferry,Coffee Shop,Airport,Airport Food Court,Airport Terminal,Bar,Rental Car Location
9,Carnegie Hill,Coffee Shop,Café,Italian Restaurant,Bookstore,Gym,Gym / Fitness Center,Yoga Studio,Wine Shop,Bar,Grocery Store
10,Carroll Gardens,Italian Restaurant,Coffee Shop,Pizza Place,Cocktail Bar,Bakery,Bar,Wine Shop,Spa,Food & Drink Shop,Gym / Fitness Center
12,Cobble Hill,Playground,Bar,Coffee Shop,Pizza Place,Yoga Studio,Bakery,Cocktail Bar,Deli / Bodega,Italian Restaurant,Juice Bar
15,Downtown,Pizza Place,Coffee Shop,Burger Joint,Sandwich Place,Bar,French Restaurant,Middle Eastern Restaurant,Dance Studio,Performing Arts Venue,Cocktail Bar
19,East Village,Bar,Cocktail Bar,Pizza Place,Mexican Restaurant,Korean Restaurant,Coffee Shop,Ice Cream Shop,Wine Bar,Dessert Shop,Salon / Barbershop
20,East Williamsburg,Bar,Deli / Bodega,Cocktail Bar,Coffee Shop,Bakery,Concert Hall,Mexican Restaurant,Music Venue,Gym / Fitness Center,Vegetarian / Vegan Restaurant


Cluster 6

In [249]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 5, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,"Del Ray, Mount Dennis, Keelsdale and Silvertho...",Convenience Store,Bar,Sandwich Place,Coffee Shop,Discount Store,Exhibit,Egyptian Restaurant,Electronics Store,Empanada Restaurant,English Restaurant


Cluster 7

In [250]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 6, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
41,"Northwood Park, York University\n",Furniture / Home Store,Caribbean Restaurant,Bar,Miscellaneous Shop,Coffee Shop,Massage Studio,Metro Station,Yoga Studio,Eye Doctor,Empanada Restaurant


Cluster 8

In [251]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 7, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Archer Heights,Mexican Restaurant,Bakery,Hotel,Discount Store,Fast Food Restaurant,Nightclub,Electronics Store,Bar,Bank,Sandwich Place
18,East Side,Mexican Restaurant,ATM,Bar,Convenience Store,Pizza Place,Deli / Bodega,Pharmacy,Chinese Restaurant,Taco Place,Post Office


Cluster 9

In [252]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 8, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Ashburn,Light Rail Station,Snack Place,Bar,Pizza Place,Martial Arts Dojo,Automotive Shop,Egyptian Restaurant,Electronics Store,Empanada Restaurant,English Restaurant


Cluster 10

In [253]:
refined_neighborhoods.loc[refined_neighborhoods['Cluster Labels'] == 9, refined_neighborhoods.columns[[1] + list(range(2, refined_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,"Dufferin, Dovercourt Village\n",Bakery,Pharmacy,Supermarket,Bank,Bar,Grocery Store,Park,Café,Music Venue,Middle Eastern Restaurant


Now that we have broken our neighborhoods into clusters, we can see which cluster, and which neighborhoods within each cluster will be most appealing. 

Upon quick examination it is easy to see that clusters 2 and 5 are the best match, with the other clusters including undesired venues in their top 3 most common.

Before we dig through clusters 2 and 5, lets add some additional criteria. Executives would also like for there to be a park and cafe or coffee shop near the office. Lets narrow down our search 

In [271]:
#First, lets get only neighborhoods in cluster 2 and 5
neighborhood_final = refined_neighborhoods[((refined_neighborhoods['Cluster Labels'] == 4)|(refined_neighborhoods['Cluster Labels'] == 1))]
neighborhood_final.reset_index(drop=True, inplace = True)

In [274]:
#Next, lets get only neighborhoods with parks nearby
w_park = neighborhood_final[((neighborhood_final['1st Most Common Venue'] == 'Park')|(neighborhood_final['2nd Most Common Venue'] == 'Park') | (neighborhood_final['3rd Most Common Venue'] == 'Park') | (neighborhood_final['4th Most Common Venue'] == 'Park') | (neighborhood_final['5th Most Common Venue'] == 'Park')|(neighborhood_final['6th Most Common Venue'] == 'Park')|(neighborhood_final['7th Most Common Venue'] == 'Park')|(neighborhood_final['8th Most Common Venue'] == 'Park')|(neighborhood_final['9th Most Common Venue'] == 'Park')|(neighborhood_final['10th Most Common Venue'] == 'Park'))]
p_indicies = pd.Series(w_park.index)
p_indicies

0     0
1    16
2    17
3    23
4    38
dtype: int64

In [275]:
#Create a new frequency dataframe with only neighborhoods that have high prevalence of bars
park_n = neighborhood_final.iloc[p_indicies].reset_index(drop = True)
park_n

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,Avondale,Chinese Restaurant,Park,Food Truck,Donut Shop,Supermarket,Bar,Light Rail Station,Electronics Store,Sandwich Place,Grocery Store
1,4,Financial District,Coffee Shop,American Restaurant,Pizza Place,Bar,Sandwich Place,Cocktail Bar,Hotel,Park,Falafel Restaurant,Italian Restaurant
2,1,Gerritsen Beach,Ice Cream Shop,Pizza Place,Bar,Bagel Shop,Harbor / Marina,Liquor Store,Gas Station,Event Space,Park,Department Store
3,1,Irving Park,Bar,Breakfast Spot,Farmers Market,Latin American Restaurant,Park,Asian Restaurant,Donut Shop,Martial Arts Dojo,Café,Thai Restaurant
4,1,Red Hook,Seafood Restaurant,Art Gallery,Park,Bar,American Restaurant,Bagel Shop,Flower Shop,Farm,Café,Ice Cream Shop


Of all of the neighborhoods in both clusters, only 5 have parks in the top 10 most common venues. This greatly narrows down our search. Finally, taking into account that the executives also would like to have a cafe or coffee shop near the office, the nighborhoods Financial District in New York City, Gerritsen Beach in New York City, Irving Park in Chicago, and Red Hook in New York City take home the trophy in the battle of the neighborhoods. Toronto simply does not compete with Chicago or New York when it comes to what the start-up executives are looking for, and 3 or the 4 winning neighborhoods are in New York City. In conclusion of this project, we recommend New York City as their destination of choice due to the wider range of neighborhood selections to suit their tastes