# IBM Data Science Capstone Project Final Notebook

## Topic: Finding out which Neighborhoods of any two Cities are Similar to each other!

#### As maps are not visible on GitHub, here's the link to the same notebook on IBM Watson: [Notebook](https://dataplatform.cloud.ibm.com/analytics/notebooks/v2/3c6ced66-7877-44c3-9462-8bcc599966e7/view?access_token=3d96fa85bda97b0612a646c83bd3a91d23ae5404f52d6612a15d4bb37cd1c615)



In [1]:
import numpy as np
import pandas as pd
#pd.set_option("display.max_columns", None)
#pd.set_option("display.max_rows", None)
import json
from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import folium
print("Libraries imported.")

Libraries imported.


## Initiate FourSquare API Credentials

In [2]:
CLIENT_ID = 'FRUG1QP2AWAUDZ3VPAHTTYPTCI1SE2PDABPIXDZZ22N4MVII' # your Foursquare ID
CLIENT_SECRET = 'IWZMB4QOMBYPWEP4JFXENLBL3FX2LBRE2FQMBWFJOQYOMWAJ' # your Foursquare Secret
VERSION = '20190801'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FRUG1QP2AWAUDZ3VPAHTTYPTCI1SE2PDABPIXDZZ22N4MVII
CLIENT_SECRET:IWZMB4QOMBYPWEP4JFXENLBL3FX2LBRE2FQMBWFJOQYOMWAJ


# Toronto Data

## Scrape Toronto Zip Code Data From Wikipedia

In [3]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(data, 'html.parser')

In [4]:
postalCodeList = []
boroughList = []
neighborhoodList = []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighborhoodList.append(cells[2].text.rstrip('\n'))

toronto_df = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighborhood": neighborhoodList})

print(toronto_df.shape)
toronto_df.head()

(288, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Drop "Not Assigned" Boroughs and Group Neighborhoods in the same Borough

In [5]:
toronto_df = toronto_df[toronto_df.Borough != "Not assigned"].reset_index(drop=True)
toronto_df = toronto_df.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
for index, row in toronto_df.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
print(toronto_df.shape)
toronto_df.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Load and append the co-ordinates from the provided CSV file, drop Postal Code

In [6]:
coordinates = pd.read_csv("Geospatial_Coordinates.csv")
coordinates.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
toronto_df = toronto_df.merge(coordinates, on="PostalCode", how="left").drop('PostalCode',axis=1)
print(toronto_df.shape)
toronto_df.head()

(103, 4)


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


# New York City Data

## Load NYC Data from JSON and convert into Pandas DF

In [7]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

neighborhoods_data = newyork_data['features']

column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
nyc_df = pd.DataFrame(columns=column_names)
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    nyc_df = nyc_df.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

print(nyc_df.shape)
nyc_df.head()

(306, 4)


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


# Toronto Venues

## Query the FourSquare API to obtain 500 venues within 500m of each Toronto Neighborhood

In [8]:
radius = 500
LIMIT = 500
venues = []

for lat, long, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

toronto_venues_df = pd.DataFrame(venues)
toronto_venues_df.columns = ['Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

toronto_venues_df.to_csv('toronto_venues_df.csv',index=False)
print(toronto_venues_df.shape)
toronto_venues_df.head()

(2244, 8)


Unnamed: 0,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


## Find the Top 25 Venue Categories for all of Toronto, this will be used later for reducing dimensions in Clustering

In [9]:
# toronto_venues_df = pd.read_csv('toronto_venues_df.csv')
toronto_top25_col = (pd.get_dummies(toronto_venues_df[['VenueCategory']], prefix="", prefix_sep="").mean().reset_index().sort_values(by=0,ascending=False)[:25].reset_index(drop=True).transpose().reset_index(drop=True)[:1]).values.tolist()[0]
# toronto_top25_col

## Summarize Categories at the Neighborhood Level

In [10]:
toronto_catdf = pd.get_dummies(toronto_venues_df[['VenueCategory']], prefix="", prefix_sep="")

toronto_catdf['Borough'] = toronto_venues_df['Borough'] 
toronto_catdf['Neighborhoods'] = toronto_venues_df['Neighborhood'] 
toronto_catdf['Lat'] = toronto_venues_df['BoroughLatitude']
toronto_catdf['Long'] = toronto_venues_df['BoroughLongitude']

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_catdf.columns[-4:]) + list(toronto_catdf.columns[:-4])
# fixed_columns = list(toronto_catdf.columns[-4:]) + common_top
fixed_columns
toronto_catdf = toronto_catdf[fixed_columns]

toronto_catdf = toronto_catdf.groupby(["Borough", "Neighborhoods", "Lat", "Long"]).mean().reset_index()

print(toronto_catdf.shape)
toronto_catdf.head()

(101, 284)


Unnamed: 0,Borough,Neighborhoods,Lat,Long,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,Davisville,43.704324,-79.38879,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Central Toronto,Davisville North,43.712751,-79.390197,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0
3,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Toronto,Lawrence Park,43.72802,-79.38879,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# NYC Venues

## Query the FourSquare API to obtain 500 venues within 500m of each NYC Neighborhood

In [11]:
radius = 500
LIMIT = 500
venues = []

for lat, long, borough, neighborhood in zip(nyc_df['Latitude'], nyc_df['Longitude'], nyc_df['Borough'], nyc_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

nyc_venues_df = pd.DataFrame(venues)
nyc_venues_df.columns = ['Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

nyc_venues_df.to_csv('nyc_venues_df.csv',index=False)
print(nyc_venues_df.shape)
nyc_venues_df.head()

(10401, 8)


Unnamed: 0,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Bronx,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Bronx,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
2,Bronx,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Bronx,Wakefield,40.894705,-73.847201,Cooler Runnings Jamaican Restaurant Inc,40.898276,-73.850381,Caribbean Restaurant
4,Bronx,Wakefield,40.894705,-73.847201,Dunkin',40.890459,-73.849089,Donut Shop


## Find the Top 25 Venue Categories for all of NYC, this will be used later for reducing dimensions in Clustering

In [12]:
# nyc_venues_df = pd.read_csv('nyc_venues_df.csv')
nyc_top25_col = (pd.get_dummies(nyc_venues_df[['VenueCategory']], prefix="", prefix_sep="").mean().reset_index().sort_values(by=0,ascending=False)[:25].reset_index(drop=True).transpose().reset_index(drop=True)[:1]).values.tolist()[0]
# nyc_top25_col

## Summarize Categories at the Neighborhood Level

In [13]:
nyc_catdf = pd.get_dummies(nyc_venues_df[['VenueCategory']], prefix="", prefix_sep="")

nyc_catdf['Borough'] = nyc_venues_df['Borough'] 
nyc_catdf['Neighborhoods'] = nyc_venues_df['Neighborhood'] 
nyc_catdf['Lat'] = nyc_venues_df['BoroughLatitude']
nyc_catdf['Long'] = nyc_venues_df['BoroughLongitude']

# move postal, borough and neighborhood column to the first column
fixed_columns = list(nyc_catdf.columns[-4:]) + list(nyc_catdf.columns[:-4])
# fixed_columns = list(nyc_catdf.columns[-4:]) + common_top
fixed_columns
nyc_catdf = nyc_catdf[fixed_columns]

nyc_catdf = nyc_catdf.groupby(["Borough", "Neighborhoods", "Lat", "Long"]).mean().reset_index()

print(nyc_catdf.shape)
nyc_catdf.head()

(305, 432)


Unnamed: 0,Borough,Neighborhoods,Lat,Long,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,...,Warehouse Store,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Bronx,Allerton,40.865788,-73.859319,0.0,0.0,0.0,0.0,0.0,0.030303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bronx,Baychester,40.866858,-73.835798,0.0,0.0,0.0,0.0,0.0,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bronx,Bedford Park,40.870185,-73.885512,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bronx,Belmont,40.857277,-73.888452,0.0,0.0,0.0,0.0,0.0,0.010204,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010204,0.0,0.0
4,Bronx,Bronxdale,40.852723,-73.861726,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Use the Top 25 Venue Categories from both Cities and combine into a single list. Only keep these categories (39 unique) for both cities and combine the two datasets. Add city identifier.

In [14]:
common_top = list(set(nyc_top25_col+toronto_top25_col))
catdf = toronto_catdf.append(nyc_catdf)[list(nyc_catdf.columns[:4]) + common_top]
catdf['City'] = catdf['Lat'].apply(lambda x: 'Toronto' if x > 42 else 'NYC')
print(catdf.shape)
catdf.sort_values(by='Neighborhoods').head(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(406, 40)


Unnamed: 0,Borough,Neighborhoods,Lat,Long,Mexican Restaurant,Bank,Steakhouse,Sushi Restaurant,Grocery Store,American Restaurant,...,Donut Shop,Bagel Shop,Clothing Store,Deli / Bodega,Coffee Shop,Gastropub,Pub,Burger Joint,Hotel,City
9,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,0.0,0.0,0.04,0.02,0.0,0.03,...,0.0,0.0,0.01,0.01,0.08,0.02,0.0,0.02,0.03,Toronto
74,Scarborough,Agincourt,43.7942,-79.262029,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toronto
75,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toronto
37,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,0.0,0.0,0.0,0.0,0.222222,0.0,...,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,Toronto
38,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0,Toronto
0,Bronx,Allerton,40.865788,-73.859319,0.0,0.0,0.0,0.0,0.030303,0.030303,...,0.030303,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,NYC
243,Staten Island,Annadale,40.538114,-74.178549,0.0,0.0,0.0,0.0,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,NYC
244,Staten Island,Arden Heights,40.549286,-74.185887,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.0,NYC
245,Staten Island,Arlington,40.635325,-74.165104,0.0,0.0,0.0,0.0,0.0,0.166667,...,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,NYC
246,Staten Island,Arrochar,40.596313,-74.067124,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.055556,0.0,0.111111,0.0,0.0,0.0,0.0,0.055556,NYC


# Clustering

## Cluster neighborhoods in these 2 cities into 3 groups

In [15]:
kclusters = 3

clustering = catdf.drop(["Borough", "Neighborhoods", "Lat", "Long", "City"], 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(clustering)

kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [16]:
catdf_clust = catdf.copy()

# add clustering labels
catdf_clust["ClusterLabel"] = kmeans.labels_

fixed_columns = list(catdf_clust.columns[-2:]) + list(catdf_clust.columns[:-2])
catdf_clust = catdf_clust[fixed_columns]

print(catdf_clust.shape)
catdf_clust.head()

(406, 41)


Unnamed: 0,City,ClusterLabel,Borough,Neighborhoods,Lat,Long,Mexican Restaurant,Bank,Steakhouse,Sushi Restaurant,...,Bar,Donut Shop,Bagel Shop,Clothing Store,Deli / Bodega,Coffee Shop,Gastropub,Pub,Burger Joint,Hotel
0,Toronto,0,Central Toronto,Davisville,43.704324,-79.38879,0.0,0.0,0.0,0.057143,...,0.0,0.0,0.0,0.0,0.0,0.057143,0.0,0.0,0.0,0.0
1,Toronto,0,Central Toronto,Davisville North,43.712751,-79.390197,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111
2,Toronto,0,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,0.0,0.0,0.0,0.066667,...,0.0,0.0,0.066667,0.0,0.0,0.133333,0.0,0.133333,0.0,0.0
3,Toronto,0,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Toronto,0,Central Toronto,Lawrence Park,43.72802,-79.38879,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Show Borough distribution by City and Cluster

In [17]:
catdf_clust.groupby(['City','ClusterLabel']).count()['Borough']

City     ClusterLabel
NYC      0               185
         1               115
         2                 5
Toronto  0                77
         1                13
         2                11
Name: Borough, dtype: int64

## Plot the 3 clusters in these 2 cities on a single Map

In [18]:
avg_lat = 42.3
avg_long = -76.7
map_clusters = folium.Map(location=[avg_lat, avg_long], zoom_start=7)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, bor, poi, cluster in zip(catdf_clust['Lat'], catdf_clust['Long'], catdf_clust['Borough'], catdf_clust['Neighborhoods'], catdf_clust['ClusterLabel']):
    label = folium.Popup('{} : {} - Cluster {}'.format(bor, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Get the Top 10 Categories for each Area

In [19]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['City', 'ClusterLabel' , 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
catdf_sorted = pd.DataFrame(columns=columns)
catdf_sorted['City'] = catdf_clust['City']
catdf_sorted['ClusterLabel'] = catdf_clust['ClusterLabel']
catdf_sorted['Borough'] = catdf_clust['Borough']
catdf_sorted['Neighborhoods'] = catdf_clust['Neighborhoods']


for ind in np.arange(catdf_clust.shape[0]):
    row_categories = catdf_clust.iloc[ind, :].iloc[5:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    catdf_sorted.iloc[ind, 4:] = row_categories_sorted.index.values[0:num_top_venues]

# neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
print(catdf_sorted.shape)
catdf_sorted.sort_values(by='Neighborhoods').head(10)

(406, 14)


Unnamed: 0,City,ClusterLabel,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Toronto,0,Downtown Toronto,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,Steakhouse,Thai Restaurant,Hotel,American Restaurant,Breakfast Spot,Restaurant,Gym
74,Toronto,0,Scarborough,Agincourt,Chinese Restaurant,Sandwich Place,Breakfast Spot,Hotel,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Bakery
75,Toronto,2,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Hotel,Burger Joint,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot,Bakery
37,Toronto,1,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pharmacy,Coffee Shop,Sandwich Place,Fast Food Restaurant,Pizza Place,Bakery,Supermarket,Thai Restaurant,Restaurant
38,Toronto,1,Etobicoke,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Sandwich Place,Pharmacy,Pub,Bank,Steakhouse,Sushi Restaurant,Grocery Store
0,NYC,1,Bronx,Allerton,Pizza Place,Spa,Pharmacy,Supermarket,Fast Food Restaurant,Chinese Restaurant,Grocery Store,Donut Shop,American Restaurant,Deli / Bodega
243,NYC,0,Staten Island,Annadale,American Restaurant,Pub,Restaurant,Pizza Place,Park,Pharmacy,Bakery,Hotel,Breakfast Spot,Supermarket
244,NYC,1,Staten Island,Arden Heights,Pharmacy,Coffee Shop,Deli / Bodega,Pizza Place,Seafood Restaurant,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot
245,NYC,0,Staten Island,Arlington,American Restaurant,Coffee Shop,Deli / Bodega,Bakery,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot
246,NYC,1,Staten Island,Arrochar,Italian Restaurant,Deli / Bodega,Hotel,Bagel Shop,Sandwich Place,Pizza Place,Supermarket,Sushi Restaurant,Grocery Store,American Restaurant


# Cluster Analysis

## Cluster 1 mainly consists of places to hangout like Cafés, Coffee Shops and Bars

In [20]:
catdf_sorted.loc[catdf_sorted['ClusterLabel'] == 0].sort_values(by='Neighborhoods').head(20)

Unnamed: 0,City,ClusterLabel,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Toronto,0,Downtown Toronto,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,Steakhouse,Thai Restaurant,Hotel,American Restaurant,Breakfast Spot,Restaurant,Gym
74,Toronto,0,Scarborough,Agincourt,Chinese Restaurant,Sandwich Place,Breakfast Spot,Hotel,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Bakery
243,NYC,0,Staten Island,Annadale,American Restaurant,Pub,Restaurant,Pizza Place,Park,Pharmacy,Bakery,Hotel,Breakfast Spot,Supermarket
245,NYC,0,Staten Island,Arlington,American Restaurant,Coffee Shop,Deli / Bodega,Bakery,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot
162,NYC,0,Queens,Arverne,Sandwich Place,Thai Restaurant,Coffee Shop,Donut Shop,Pizza Place,Hotel,Bakery,Supermarket,Restaurant,Fast Food Restaurant
163,NYC,0,Queens,Astoria,Bar,Seafood Restaurant,Bakery,Gym / Fitness Center,Grocery Store,Chinese Restaurant,Ice Cream Shop,Café,Italian Restaurant,Gym
165,NYC,0,Queens,Auburndale,Pharmacy,Italian Restaurant,Supermarket,Ice Cream Shop,Deli / Bodega,Fast Food Restaurant,American Restaurant,Thai Restaurant,Restaurant,Breakfast Spot
49,Toronto,0,North York,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Pharmacy,Grocery Store,Chinese Restaurant,Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Supermarket,Sushi Restaurant
122,NYC,0,Manhattan,Battery Park City,Park,Coffee Shop,Hotel,Clothing Store,Italian Restaurant,Gym,Sushi Restaurant,Grocery Store,Pizza Place,Sandwich Place
53,NYC,0,Brooklyn,Bay Ridge,Italian Restaurant,Spa,Pizza Place,American Restaurant,Bagel Shop,Bar,Chinese Restaurant,Sandwich Place,Grocery Store,Sushi Restaurant


## Cluster 2 mainly consists of places to eat like Pizza Places, Restaurants and Delis

In [21]:
catdf_sorted.loc[catdf_sorted['ClusterLabel'] == 1].sort_values(by='Neighborhoods').head(20)

Unnamed: 0,City,ClusterLabel,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,Toronto,1,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pharmacy,Coffee Shop,Sandwich Place,Fast Food Restaurant,Pizza Place,Bakery,Supermarket,Thai Restaurant,Restaurant
38,Toronto,1,Etobicoke,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Sandwich Place,Pharmacy,Pub,Bank,Steakhouse,Sushi Restaurant,Grocery Store
0,NYC,1,Bronx,Allerton,Pizza Place,Spa,Pharmacy,Supermarket,Fast Food Restaurant,Chinese Restaurant,Grocery Store,Donut Shop,American Restaurant,Deli / Bodega
244,NYC,1,Staten Island,Arden Heights,Pharmacy,Coffee Shop,Deli / Bodega,Pizza Place,Seafood Restaurant,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot
246,NYC,1,Staten Island,Arrochar,Italian Restaurant,Deli / Bodega,Hotel,Bagel Shop,Sandwich Place,Pizza Place,Supermarket,Sushi Restaurant,Grocery Store,American Restaurant
164,NYC,1,Queens,Astoria Heights,Bakery,Italian Restaurant,Supermarket,Pizza Place,Burger Joint,Sushi Restaurant,Grocery Store,American Restaurant,Seafood Restaurant,Park
52,NYC,1,Brooklyn,Bath Beach,Chinese Restaurant,Pizza Place,Pharmacy,Italian Restaurant,Donut Shop,Sushi Restaurant,Fast Food Restaurant,Deli / Bodega,Clothing Store,Coffee Shop
50,Toronto,1,North York,Bayview Village,Bank,Japanese Restaurant,Café,Chinese Restaurant,Hotel,Fast Food Restaurant,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant
2,NYC,1,Bronx,Bedford Park,Deli / Bodega,Supermarket,Pizza Place,Sandwich Place,Mexican Restaurant,Chinese Restaurant,Pub,Park,Donut Shop,Bar
169,NYC,1,Queens,Beechhurst,Supermarket,Pizza Place,Chinese Restaurant,Spa,Deli / Bodega,Donut Shop,Gym / Fitness Center,Italian Restaurant,Thai Restaurant,Restaurant


## Cluster 3 mainly consists of places surrounded by Parks

In [22]:
catdf_sorted.loc[catdf_sorted['ClusterLabel'] == 2].sort_values(by='Neighborhoods').head(20)

Unnamed: 0,City,ClusterLabel,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
75,Toronto,2,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Hotel,Burger Joint,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot,Bakery
168,NYC,2,Queens,Bayswater,Park,Hotel,Burger Joint,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot,Bakery
52,Toronto,2,North York,"CFB Toronto, Downsview East",Park,Hotel,Burger Joint,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot,Bakery
96,Toronto,2,York,Caledonia-Fairbanks,Park,Fast Food Restaurant,Hotel,Burger Joint,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Breakfast Spot,Bakery
8,NYC,2,Bronx,Clason Point,Park,Grocery Store,Hotel,Breakfast Spot,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Bakery
32,Toronto,2,East York,East Toronto,Park,Coffee Shop,Pizza Place,Bakery,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot
43,Toronto,2,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",Park,Hotel,Burger Joint,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot,Bakery
65,Toronto,2,North York,"Newtonbrook, Willowdale",Park,Hotel,Burger Joint,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot,Bakery
67,Toronto,2,North York,Parkwoods,Park,Fast Food Restaurant,Hotel,Burger Joint,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Breakfast Spot,Bakery
285,NYC,2,Staten Island,Randall Manor,Park,Bagel Shop,Bakery,Gym / Fitness Center,Supermarket,Thai Restaurant,Restaurant,Fast Food Restaurant,Breakfast Spot,Hotel


## In conclusion, Toronto and New York City do have areas with a lot in common. Someone looking to move from either city to another can easily find areas in the city similar to the ones in their current city.

In [23]:
catdf_sorted[['ClusterLabel','City','Neighborhoods']].loc[catdf_sorted['ClusterLabel']==2].sort_values(by=['ClusterLabel','Neighborhoods']).drop_duplicates().reset_index(drop=True)

Unnamed: 0,ClusterLabel,City,Neighborhoods
0,2,Toronto,"Agincourt North, L'Amoreaux East, Milliken, St..."
1,2,NYC,Bayswater
2,2,Toronto,"CFB Toronto, Downsview East"
3,2,Toronto,Caledonia-Fairbanks
4,2,NYC,Clason Point
5,2,Toronto,East Toronto
6,2,Toronto,"Kingsview Village, Martin Grove Gardens, Richv..."
7,2,Toronto,"Newtonbrook, Willowdale"
8,2,Toronto,Parkwoods
9,2,NYC,Randall Manor


## So from the example above, Somerville neighborhood in NYC is similar to Parkwoods in Toronto