In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import json

In [2]:
#Scrape clean data with Beautiful Soup and request for the url

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

page = requests.get(url).text

cdf = BeautifulSoup(page, 'lxml')

In [3]:
#retrieve only the postal codes table from the wikipedia page
datatable = cdf.find('table')
datatable;

In [4]:
#create a new pandas dataframe with the data from the wikipedia table 
headers = ['Postal Code', 'Borough', 'Neighborhood']
new_cdf = pd.DataFrame(columns=headers)
new_cdf;

In [5]:
#convert the table from html code to a pandas dataframe

for tr_cell in datatable.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==3:
        new_cdf.loc[len(new_cdf)] = row_data

In [6]:
new_cdf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
#get rid of all the not assigned boroughs from our table 
new_cdf= new_cdf[new_cdf['Borough']!='Not assigned'].reset_index(drop=True)
new_cdf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
#check if any neighborhoods are not assigned inside an assigned borough
display = new_cdf.loc[new_cdf['Neighborhood']=='Not assigned']
display.head()
#none

Unnamed: 0,Postal Code,Borough,Neighborhood


In [9]:
#appropriate code if a neighborhood was unassigned inside of a borough
#this line of code would make the not assigned neighborhood take on the name of the borough

new_cdf['Neighborhood'].replace('Not assigned',new_cdf['Borough'],inplace=True)
new_cdf.head();

In [10]:
new_cdf.rename(columns={'Postal Code':'PostalCode'},inplace=True)
new_cdf.head();

In [11]:
# DATAFRAME SIZE
new_cdf.shape

(103, 3)

In [12]:
#loaded the csv file containing longitude and latitude 
cdf_cordinates = pd.read_csv('http://cocl.us/Geospatial_data')
cdf_cordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
#used the join statement to add the latitude and longitude columns to the dataframe created in part 1
new_cdf = new_cdf.join(cdf_cordinates.set_index('Postal Code'), on='PostalCode')
new_cdf.columns
new_cdf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [14]:
!conda install -c conda-forge geocoder --yes
import geocoder
from geopy.geocoders import Nominatim

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    openssl-1.1.1g             |       h516909a_1         2.1 MB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geocoder:        1.38.1-py_1       conda-forge
    python_abi:    

In [15]:
def get_geocode(postal_code):
    # initialized variable to None
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude,longitude

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

In [16]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent='Toronto')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The cordinates of Toronto are {}, {}.'.format(latitude,longitude))

#these are the longitude and latitude values used for the url link a few cells below

The cordinates of Toronto are 43.6534817, -79.3839347.


In [99]:
#map the dataframe on a folium map using the longitudes and latitudes
#a popup is built in to display the borough and neighborhoods of the point

map_toronto = folium.Map(location=[latitude,longitude], zoom_start=10)

for lat,long,borough,neighborhood in zip(new_cdf['Latitude'],new_cdf['Longitude'],new_cdf['Borough'],new_cdf['Neighborhood']):
    label = '{}, {}'.format(neighborhood,borough)
    label=folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,long],
        radius = 3,
        popup = label,
        color = 'blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,).add_to(map_toronto)
map_toronto

In [17]:
CLIENT_ID = '1WWVRT2FB434MDFGNM2JGOPIZ0TSY1TKCNTJ2TYK52JIMGQA' # your Foursquare ID
CLIENT_SECRET = '1EAHYWCXXASFPLZ1CLTPIBYBSXSR4QGT4GBZMAMCWARSTQRP' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1WWVRT2FB434MDFGNM2JGOPIZ0TSY1TKCNTJ2TYK52JIMGQA
CLIENT_SECRET:1EAHYWCXXASFPLZ1CLTPIBYBSXSR4QGT4GBZMAMCWARSTQRP


In [18]:
new_cdf.loc[3, 'Neighborhood']

'Lawrence Manor, Lawrence Heights'

In [19]:
neighborhood_latitude = new_cdf.loc[3, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = new_cdf.loc[3, 'Longitude'] # neighborhood longitude value

neighborhood_name = new_cdf.loc[3, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Manor, Lawrence Heights are 43.718517999999996, -79.46476329999999.


In [20]:
#This url searches the venues json for Toronto

Limit = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    Limit)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=1WWVRT2FB434MDFGNM2JGOPIZ0TSY1TKCNTJ2TYK52JIMGQA&client_secret=1EAHYWCXXASFPLZ1CLTPIBYBSXSR4QGT4GBZMAMCWARSTQRP&v=20180605&ll=43.6534817,-79.3839347&radius=500&limit=100'

In [21]:
#This is the # of venues retured by Foursquare for Toronto compared to 100 returned by Foursquare for New York

results = requests.get(url).json()
len(results['response']['groups'][0]['items'])

90

In [22]:
results['response']['groups'][0]['items'][0:2]

[{'reasons': {'count': 0,
   'items': [{'summary': 'This spot is popular',
     'type': 'general',
     'reasonName': 'globalInteractionReason'}]},
  'venue': {'id': '5227bb01498e17bf485e6202',
   'name': 'Downtown Toronto',
   'location': {'lat': 43.65323167517444,
    'lng': -79.38529600606677,
    'labeledLatLngs': [{'label': 'display',
      'lat': 43.65323167517444,
      'lng': -79.38529600606677}],
    'distance': 113,
    'cc': 'CA',
    'city': 'Toronto',
    'state': 'ON',
    'country': 'Canada',
    'formattedAddress': ['Toronto ON', 'Canada']},
   'categories': [{'id': '4f2a25ac4b909258e854f55f',
     'name': 'Neighborhood',
     'pluralName': 'Neighborhoods',
     'shortName': 'Neighborhood',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/neighborhood_',
      'suffix': '.png'},
     'primary': True}],
   'photos': {'count': 0, 'groups': []}},
  'referralId': 'e-0-5227bb01498e17bf485e6202-0'},
 {'reasons': {'count': 0,
   'items': [{'summar

In [23]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)
nearby_venues.head()

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.events.count,venue.events.summary,venue.id,venue.location.address,venue.location.cc,venue.location.city,...,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.neighborhood,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups,venue.venuePage.id
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-5227bb01498e17bf485e6202-0,"[{'id': '4f2a25ac4b909258e854f55f', 'name': 'N...",,,5227bb01498e17bf485e6202,,CA,Toronto,...,"[{'label': 'display', 'lat': 43.65323167517444...",43.653232,-79.385296,,,ON,Downtown Toronto,0,[],
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4ad4c05ef964a520a6f620e3-1,"[{'id': '4bf58dd8d48988d164941735', 'name': 'P...",,,4ad4c05ef964a520a6f620e3,100 Queen St W,CA,Toronto,...,"[{'label': 'display', 'lat': 43.65227047322295...",43.65227,-79.383516,,M5H 2N1,ON,Nathan Phillips Square,0,[],
2,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4ae7b27df964a52068ad21e3-2,"[{'id': '4bf58dd8d48988d1d2941735', 'name': 'S...",,,4ae7b27df964a52068ad21e3,122 Elizabeth St.,CA,Toronto,...,"[{'label': 'display', 'lat': 43.65526771691681...",43.655268,-79.385165,,M5G 1P5,ON,Japango,0,[],
3,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-57bcd3b7498e652a678d0378-3,"[{'id': '5bae9231bedf3950379f89d4', 'name': 'P...",,,57bcd3b7498e652a678d0378,112 Elizabeth St,CA,Toronto,...,"[{'label': 'display', 'lat': 43.65489527525682...",43.654895,-79.385052,,M5G 1P5,ON,Poke Guys,0,[],
4,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-537773d1498e74a75bb75c1e-4,"[{'id': '4bf58dd8d48988d143941735', 'name': 'B...",,,537773d1498e74a75bb75c1e,483 Bay Street,CA,Toronto,...,"[{'label': 'display', 'lat': 43.65314383888587...",43.653144,-79.38198,,M5G 2C9,ON,Eggspectation Bell Trinity Square,0,[],97507838.0


In [24]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [25]:
#this cell cleans up the json_normalized table above

# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['venue.name','venue.categories','venue.location.address'] #+ [col for col in nearby_venues.columns if col.startswith('location.')] + ['id']
venues_nearby = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
venues_nearby['venue.categories'] = venues_nearby.apply(get_category_type, axis=1)

# clean column names by keeping only last term
venues_nearby.columns = [column.split('.')[-1] for column in venues_nearby.columns]

venues_nearby.head(10)

Unnamed: 0,name,categories,address
0,Downtown Toronto,Neighborhood,
1,Nathan Phillips Square,Plaza,100 Queen St W
2,Japango,Sushi Restaurant,122 Elizabeth St.
3,Poke Guys,Poke Place,112 Elizabeth St
4,Eggspectation Bell Trinity Square,Breakfast Spot,483 Bay Street
5,Indigo,Bookstore,220 Yonge St
6,Chatime 日出茶太,Bubble Tea Shop,132 Dundas St W
7,Old City Hall,Monument / Landmark,60 Queen Street West
8,CF Toronto Eaton Centre,Shopping Mall,220 Yonge St
9,Textile Museum of Canada,Art Museum,55 Centre Avenue


In [26]:
print('{} venues were returned by Foursquare'.format(venues_nearby.shape[0]))

90 venues were returned by Foursquare


In [27]:
#the getNearbyVenues function collects the venues from each neighborhood in the city

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            Limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [28]:
toronto_venues = getNearbyVenues(names=new_cdf['Neighborhood'],
                                   latitudes=new_cdf['Latitude'],
                                   longitudes=new_cdf['Longitude']
                                  )
len(toronto_venues)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

2156

In [29]:
toronto_venues.shape

(2156, 7)

In [30]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,649 Variety,43.754513,-79.331942,Convenience Store
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [31]:
toronto_onehot = toronto_venues.groupby('Neighborhood')

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
cols=list(toronto_onehot.columns.values)
cols.pop(cols.index('Neighborhood'))
toronto_onehot=toronto_onehot[['Neighborhood']+cols]

toronto_onehot.shape

(2156, 272)

In [32]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 272 uniques categories.


In [33]:
from collections import Counter

In [36]:
toronto_venues_list = Counter(toronto_venues['Venue Category'])
tvl = list(toronto_venues_list.most_common())
tvl;

In [37]:
sum(dict(tvl).values())

2156

In [38]:
#pandas dataframe that will be used to compare to New York Venue data
tdf=pd.DataFrame(tvl, columns=['Venue','Occurences'])
tdf

Unnamed: 0,Venue,Occurences
0,Coffee Shop,187
1,Café,103
2,Restaurant,68
3,Park,53
4,Pizza Place,52
5,Italian Restaurant,44
6,Sandwich Place,43
7,Hotel,41
8,Japanese Restaurant,41
9,Bakery,39


In [39]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
num_top_venues = 5

for place in toronto_grouped['Neighborhood'].head():
    print("----"+place+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == place].T.reset_index()
    temp.columns = ['VENUE','FREQ']
    temp = temp.iloc[1:]
    temp['FREQ'] = temp['FREQ'].astype(float)
    temp = temp.round({'FREQ': 2})
    print(temp.sort_values('FREQ', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
    

----Agincourt----
                       VENUE  FREQ
0                     Lounge  0.25
1             Breakfast Spot  0.25
2  Latin American Restaurant  0.25
3               Skating Rink  0.25
4              Metro Station  0.00


----Alderwood, Long Branch----
            VENUE  FREQ
0     Pizza Place  0.25
1        Pharmacy  0.12
2             Pub  0.12
3             Gym  0.12
4  Sandwich Place  0.12


----Bathurst Manor, Wilson Heights, Downsview North----
                       VENUE  FREQ
0                       Bank  0.09
1                Coffee Shop  0.09
2              Shopping Mall  0.04
3  Middle Eastern Restaurant  0.04
4          Mobile Phone Shop  0.04


----Bayview Village----
                 VENUE  FREQ
0                 Café  0.25
1                 Bank  0.25
2   Chinese Restaurant  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park, Lawrence Manor East----
                     VENUE  FREQ
0       Italian Restaurant  0.11
1              Co

In [57]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [58]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Breakfast Spot,Skating Rink,Yoga Studio,Drugstore,Distribution Center,Dog Run,Doner Restaurant,Donut Shop
1,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Sandwich Place,Pharmacy,Pool,Pub,Gym,Airport Terminal,Farmers Market,Event Space
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Shopping Mall,Convenience Store,Supermarket,Ice Cream Shop,Sushi Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Diner
3,Bayview Village,Café,Bank,Chinese Restaurant,Japanese Restaurant,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Restaurant,Sandwich Place,Coffee Shop,Comfort Food Restaurant,Café,Pub,Indian Restaurant,Sushi Restaurant,Hobby Shop
5,Berczy Park,Coffee Shop,Restaurant,Beer Bar,Bakery,Cocktail Bar,Café,Farmers Market,Seafood Restaurant,Cheese Shop,Breakfast Spot
6,"Birch Cliff, Cliffside West",College Stadium,General Entertainment,Skating Rink,Café,Comic Shop,Dim Sum Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
7,"Brockton, Parkdale Village, Exhibition Place",Café,Yoga Studio,Bakery,Breakfast Spot,Coffee Shop,Stadium,Burrito Place,Restaurant,Climbing Gym,Pet Store
8,"Business reply mail Processing Centre, South C...",Light Rail Station,Yoga Studio,Garden Center,Recording Studio,Skate Park,Burrito Place,Auto Workshop,Fast Food Restaurant,Farmers Market,Spa
9,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Sculpture Garden,Harbor / Marina,Bar,Plane,Coffee Shop,Boat or Ferry,Rental Car Location


In [59]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:12]

array([2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2], dtype=int32)

In [60]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = new_cdf

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged['Cluster Labels']=toronto_merged['Cluster Labels'].fillna(0.0).astype(int)    
toronto_merged.tail() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,2,River,Smoke Shop,Pool,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
99,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,2,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Dance Studio,Bubble Tea Shop,Pub,Café,Yoga Studio
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,2,Light Rail Station,Yoga Studio,Garden Center,Recording Studio,Skate Park,Burrito Place,Auto Workshop,Fast Food Restaurant,Farmers Market,Spa
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,4,Baseball Field,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Fast Food Restaurant
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,2,Sandwich Place,Convenience Store,Discount Store,Tanning Salon,Burrito Place,Flower Shop,Burger Joint,Thrift / Vintage Store,Supplement Shop,Social Club


In [61]:
#the above dataframe gives the most popular venues for all 103 neighborhoods in Toronto that were initially downloaded
toronto_merged.shape

(103, 16)

In [92]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [103]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [46]:
# range is set to 6 so the cluster label is not included in the dataframe
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,"Islington Avenue, Humber Valley Village",,,,,,,,,,
8,"Parkview Hill, Woodbine Gardens",Pizza Place,Pharmacy,Athletics & Sports,Gastropub,Intersection,Café,Bank,Gym / Fitness Center,American Restaurant,Comfort Food Restaurant
11,"West Deane Park, Princess Gardens, Martin Grov...",,,,,,,,,,
35,"East Toronto, Broadview North (Old East York)",Intersection,Park,Convenience Store,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
45,"York Mills, Silver Hills",,,,,,,,,,
50,Humber Summit,Gym,Pizza Place,Grocery Store,Department Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
63,"Runnymede, The Junction North",Convenience Store,Grocery Store,Pizza Place,Brewery,Drugstore,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
70,Westmount,Pizza Place,Chinese Restaurant,Sandwich Place,Coffee Shop,Discount Store,Intersection,Yoga Studio,Diner,Distribution Center,Dog Run
72,"Willowdale, Willowdale West",Pharmacy,Pizza Place,Bank,Coffee Shop,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
82,"Clarks Corners, Tam O'Shanter, Sullivan",Pizza Place,Fast Food Restaurant,Convenience Store,Fried Chicken Joint,Thai Restaurant,Italian Restaurant,Chinese Restaurant,Gas Station,Noodle House,Bank


In [47]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,Food & Drink Shop,Park,Convenience Store,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop
21,Caledonia-Fairbanks,Park,Women's Store,Pool,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
52,"Willowdale, Newtonbrook",Park,Yoga Studio,Drugstore,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
64,Weston,Park,Yoga Studio,Drugstore,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
66,York Mills West,Park,Convenience Store,Yoga Studio,Drugstore,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop
85,"Milliken, Agincourt North, Steeles East, L'Amo...",Park,Playground,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
91,Rosedale,Park,Playground,Trail,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop


In [48]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + [2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,Victoria Village,French Restaurant,Pizza Place,Coffee Shop,Financial or Legal Service,Portuguese Restaurant,Hockey Arena,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner
2,Downtown Toronto,"Regent Park, Harbourfront",Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Beer Store,Shoe Store,Restaurant
3,North York,"Lawrence Manor, Lawrence Heights",Furniture / Home Store,Clothing Store,Vietnamese Restaurant,Boutique,Gift Shop,Accessories Store,Coffee Shop,Event Space,Concert Hall,Comic Shop
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",Coffee Shop,Diner,College Auditorium,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café,Portuguese Restaurant
7,North York,Don Mills,Gym,Japanese Restaurant,Beer Store,Restaurant,Coffee Shop,Café,Construction & Landscaping,Chinese Restaurant,Sandwich Place,Bike Shop
9,Downtown Toronto,"Garden District, Ryerson",Clothing Store,Coffee Shop,Café,Japanese Restaurant,Cosmetics Shop,Italian Restaurant,Bubble Tea Shop,Diner,Theater,Ramen Restaurant
10,North York,Glencairn,Park,Pub,Sushi Restaurant,Japanese Restaurant,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
12,Scarborough,"Rouge Hill, Port Union, Highland Creek",Bar,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Fast Food Restaurant
13,North York,Don Mills,Gym,Japanese Restaurant,Beer Store,Restaurant,Coffee Shop,Café,Construction & Landscaping,Chinese Restaurant,Sandwich Place,Bike Shop
14,East York,Woodbine Heights,Bus Stop,Park,Skating Rink,Beer Store,Video Store,Athletics & Sports,Dance Studio,Curling Ice,Doner Restaurant,Discount Store


In [49]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,"Malvern, Rouge",Fast Food Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Yoga Studio,Dessert Shop


In [50]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + [2] + list(range(6, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
57,North York,"Humberlea, Emery",Baseball Field,Food Service,Yoga Studio,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Dim Sum Restaurant
101,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",Baseball Field,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Fast Food Restaurant


In [51]:
string_name = 'New York Data'
print('-------'+string_name.upper()+'-------')

-------NEW YORK DATA-------


In [62]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent='jamundson@mtech.edu')
location = geolocator.geocode(address)
ny_latitude = location.latitude
ny_longitude = location.longitude
print('The cordinates of New York City are {}, {}.'.format(latitude,longitude))

The cordinates of New York City are 43.6534817, -79.3839347.


In [63]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [64]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [65]:
# 306 is the number of neighborhoods collected from the New York data
# 103 neighborhoods collected from Toronto
ny_d = newyork_data['features']
len(ny_d)

306

In [66]:
ny_d[0:2]

[{'type': 'Feature',
  'id': 'nyu_2451_34572.1',
  'geometry': {'type': 'Point',
   'coordinates': [-73.84720052054902, 40.89470517661]},
  'geometry_name': 'geom',
  'properties': {'name': 'Wakefield',
   'stacked': 1,
   'annoline1': 'Wakefield',
   'annoline2': None,
   'annoline3': None,
   'annoangle': 0.0,
   'borough': 'Bronx',
   'bbox': [-73.84720052054902,
    40.89470517661,
    -73.84720052054902,
    40.89470517661]}},
 {'type': 'Feature',
  'id': 'nyu_2451_34572.2',
  'geometry': {'type': 'Point',
   'coordinates': [-73.82993910812398, 40.87429419303012]},
  'geometry_name': 'geom',
  'properties': {'name': 'Co-op City',
   'stacked': 2,
   'annoline1': 'Co-op',
   'annoline2': 'City',
   'annoline3': None,
   'annoangle': 0.0,
   'borough': 'Bronx',
   'bbox': [-73.82993910812398,
    40.87429419303012,
    -73.82993910812398,
    40.87429419303012]}}]

In [67]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
ny_neighborhoods = pd.DataFrame(columns=column_names)

In [68]:
#assigning the json data to the pandas dataframe created above

for data in ny_d:
    borough = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    ny_neighborhoods = ny_neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [69]:
ny_neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [70]:
print('New York City has {} boroughs and {} neighborhoods.'.format(
        len(ny_neighborhoods['Borough'].unique()),
        ny_neighborhoods['Neighborhood'].count(),
        ny_neighborhoods.shape[0]
    )
)

New York City has 5 boroughs and 306 neighborhoods.


In [59]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(ny_neighborhoods['Latitude'], ny_neighborhoods['Longitude'], ny_neighborhoods['Borough'], ny_neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [71]:
#this call searches all the venues in the neighborhoods of New York City
Limit = 100
radius = 500

url_ny = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    ny_latitude, 
    ny_longitude, 
    radius, 
    Limit)

url_ny

'https://api.foursquare.com/v2/venues/explore?&client_id=1WWVRT2FB434MDFGNM2JGOPIZ0TSY1TKCNTJ2TYK52JIMGQA&client_secret=1EAHYWCXXASFPLZ1CLTPIBYBSXSR4QGT4GBZMAMCWARSTQRP&v=20180605&ll=40.7127281,-74.0060152&radius=500&limit=100'

In [72]:
results_ny = requests.get(url_ny).json()
results_ny['response'].keys()

dict_keys(['suggestedFilters', 'headerLocation', 'headerFullLocation', 'headerLocationGranularity', 'totalResults', 'suggestedBounds', 'groups'])

In [73]:
#breaks the json file down from response dictionary ->groups dictionary ->first element in list 'groups' ->items dictionary
results_ny['response']['groups'][0]['items'][:2]

[{'reasons': {'count': 0,
   'items': [{'summary': 'This spot is popular',
     'type': 'general',
     'reasonName': 'globalInteractionReason'}]},
  'venue': {'id': '57f0689d498e7d49d9189369',
   'name': 'The Bar Room at Temple Court',
   'location': {'address': '123 Nassau St',
    'lat': 40.7114477287544,
    'lng': -74.00680157032005,
    'labeledLatLngs': [{'label': 'display',
      'lat': 40.7114477287544,
      'lng': -74.00680157032005}],
    'distance': 157,
    'postalCode': '10038',
    'cc': 'US',
    'neighborhood': 'Financial District',
    'city': 'New York',
    'state': 'NY',
    'country': 'United States',
    'formattedAddress': ['123 Nassau St',
     'New York, NY 10038',
     'United States']},
   'categories': [{'id': '4bf58dd8d48988d1d5941735',
     'name': 'Hotel Bar',
     'pluralName': 'Hotel Bars',
     'shortName': 'Hotel Bar',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/travel/hotel_bar_',
      'suffix': '.png'},
     'primary': True}],

In [74]:
venues_ny = results_ny['response']['groups'][0]['items']    
nearby_venues_ny = json_normalize(venues_ny) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues_ny =nearby_venues_ny.loc[:, filtered_columns]

# filter the category for each row
nearby_venues_ny['venue.categories'] = nearby_venues_ny.apply(get_category_type, axis=1)

# clean columns
nearby_venues_ny.columns = [col.split(".")[-1] for col in nearby_venues_ny.columns]
nearby_venues_ny.head()

Unnamed: 0,name,categories,lat,lng
0,The Bar Room at Temple Court,Hotel Bar,40.711448,-74.006802
1,Alba Dry Cleaner & Tailor,Laundry Service,40.711434,-74.006272
2,"The Beekman, A Thompson Hotel",Hotel,40.711173,-74.006702
3,Gibney Dance Center Downtown,Dance Studio,40.713923,-74.005661
4,City Hall Park,Park,40.712415,-74.006724


In [75]:
print('{} venues were returned by Foursquare.'.format(nearby_venues_ny.shape[0]))
#100 venues compared to the 88 venues retured for Toronto 

100 venues were returned by Foursquare.


In [76]:
ny_venues = getNearbyVenues(names=ny_neighborhoods['Neighborhood'],
                                   latitudes=ny_neighborhoods['Latitude'],
                                   longitudes=ny_neighborhoods['Longitude']
                                  )

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [77]:
len(ny_venues)
#compared to the 2156 venues given by searching toronto neighborhoods

10047

In [78]:
print(ny_venues.shape)
ny_venues.head()

(10047, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
2,Wakefield,40.894705,-73.847201,Walgreens,40.896528,-73.8447,Pharmacy
3,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
4,Wakefield,40.894705,-73.847201,Shell,40.894187,-73.845862,Gas Station


In [79]:
ny_onehot = ny_venues.groupby('Neighborhood')

# one hot encoding
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ny_onehot['Neighborhood'] = ny_venues['Neighborhood'] 

# move neighborhood column to the first column
cols=list(ny_onehot.columns.values)
cols.pop(cols.index('Neighborhood'))
ny_onehot=ny_onehot[['Neighborhood']+cols]

ny_onehot.shape

(10047, 430)

In [80]:
print('There are {} uniques categories.'.format(len(ny_venues['Venue Category'].unique())))
# There are 430 different venue categories for New York compared to 272 for Toronto

There are 430 uniques categories.


In [81]:
ny_venues_occurences = Counter(ny_venues['Venue Category'])
ny_venues_occurences.most_common();

In [82]:
sum(ny_venues_occurences.values())

10047

In [83]:
ny_grouped = ny_onehot.groupby('Neighborhood').mean().reset_index()
print(ny_grouped.shape)
ny_grouped.head()

(302, 430)


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,...,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,Allerton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Annadale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arlington,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arrochar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
num_top_venues = 5

for place in ny_grouped['Neighborhood'].head():
    print("----"+place+"----")
    temp = ny_grouped[ny_grouped['Neighborhood'] == place].T.reset_index()
    temp.columns = ['VENUE','FREQ']
    temp = temp.iloc[1:]
    temp['FREQ'] = temp['FREQ'].astype(float)
    temp = temp.round({'FREQ': 2})
    print(temp.sort_values('FREQ', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Allerton----
                VENUE  FREQ
0         Pizza Place  0.14
1       Deli / Bodega  0.10
2         Supermarket  0.07
3  Chinese Restaurant  0.07
4            Pharmacy  0.03


----Annadale----
          VENUE  FREQ
0   Pizza Place  0.22
1      Pharmacy  0.11
2    Restaurant  0.11
3         Diner  0.11
4  Dance Studio  0.11


----Arden Heights----
                 VENUE  FREQ
0          Coffee Shop  0.25
1          Pizza Place  0.25
2  Rental Car Location  0.25
3             Pharmacy  0.25
4    Accessories Store  0.00


----Arlington----
                 VENUE  FREQ
0         Intersection   0.2
1             Bus Stop   0.2
2  American Restaurant   0.2
3          Coffee Shop   0.2
4        Deli / Bodega   0.2


----Arrochar----
                   VENUE  FREQ
0               Bus Stop  0.14
1          Deli / Bodega  0.10
2     Italian Restaurant  0.10
3  Outdoors & Recreation  0.05
4         Sandwich Place  0.05




In [85]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
ny_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
ny_neighborhoods_venues_sorted['Neighborhood'] = ny_grouped['Neighborhood']

for ind in np.arange(ny_grouped.shape[0]):
    ny_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

ny_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Allerton,Pizza Place,Deli / Bodega,Chinese Restaurant,Supermarket,Bus Station,Gas Station,Grocery Store,Check Cashing Service,Fried Chicken Joint,Pharmacy
1,Annadale,Pizza Place,Dance Studio,Diner,Train Station,Liquor Store,Food,Pharmacy,Restaurant,French Restaurant,Fish & Chips Shop
2,Arden Heights,Pharmacy,Rental Car Location,Coffee Shop,Pizza Place,Flea Market,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant
3,Arlington,Deli / Bodega,American Restaurant,Bus Stop,Intersection,Coffee Shop,Diner,Food,Farmers Market,Fast Food Restaurant,Field
4,Arrochar,Bus Stop,Italian Restaurant,Deli / Bodega,Polish Restaurant,Bagel Shop,Liquor Store,Sandwich Place,Taco Place,Middle Eastern Restaurant,Hotel


In [86]:
# set number of clusters
kclusters = 5
ny_grouped_clustering = ny_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:12]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [87]:
# will give error if run more than once
ny_neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [88]:
ny_merged = ny_neighborhoods

# merge ny_grouped with ny_data to add latitude/longitude for each neighborhood
ny_merged = ny_merged.join(ny_neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

#format the cluster labels as integers in order to group neighborhoods with NaN venue data
ny_merged['Cluster Labels']=ny_merged['Cluster Labels'].fillna(0.0).astype(int)
    
print(ny_merged.shape)
ny_merged.tail() # check the last columns!
# Most popular venues tabulated for all 306 neighborhoods in New York

(306, 15)


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
301,Manhattan,Hudson Yards,40.756658,-74.000111,0,Gym / Fitness Center,Hotel,American Restaurant,Italian Restaurant,Restaurant,Park,Boat or Ferry,Coffee Shop,Gym,Nightclub
302,Queens,Hammels,40.587338,-73.80553,0,Beach,Fried Chicken Joint,Fast Food Restaurant,Shoe Store,Gym / Fitness Center,Food Truck,Dog Run,Deli / Bodega,Diner,Bus Station
303,Queens,Bayswater,40.611322,-73.765968,4,Playground,Construction & Landscaping,Zoo Exhibit,Flower Shop,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service
304,Queens,Queensbridge,40.756091,-73.945631,0,Sandwich Place,Hotel,Hotel Bar,Roof Deck,Basketball Court,Baseball Field,Gym / Fitness Center,Performing Arts Venue,Park,Scenic Lookout
305,Staten Island,Fox Hills,40.617311,-74.08174,0,Grocery Store,Bus Stop,Playground,Sandwich Place,Flower Shop,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant


In [96]:
# create map
ny_map_clusters = folium.Map(location=[ny_latitude, ny_longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_merged['Latitude'], ny_merged['Longitude'], ny_merged['Neighborhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(ny_map_clusters)
       
ny_map_clusters

In [87]:
#the ny_merged dataframe does not included postal code like the toronto_merged dataframe, so the range is set to 5 not 6
#the first 5 columns are taken out of ny_merged to create this dataframe
ny_merged.loc[ny_merged['Cluster Labels'] == 0, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Wakefield,Pharmacy,Laundromat,Pizza Place,Sandwich Place,Donut Shop,Ice Cream Shop,Gas Station,Dessert Shop,Deli / Bodega,French Restaurant
1,Co-op City,Bus Station,Post Office,Fried Chicken Joint,Grocery Store,Park,Fast Food Restaurant,Pharmacy,Basketball Court,Baseball Field,Bagel Shop
2,Eastchester,Bus Stop,Diner,Deli / Bodega,Caribbean Restaurant,Bus Station,Convenience Store,Business Service,Bowling Alley,Food & Drink Shop,Chinese Restaurant
3,Fieldston,River,Bus Station,Business Service,Plaza,Zoo Exhibit,Flea Market,Farm,Farmers Market,Fast Food Restaurant,Field
4,Riverdale,Park,Bus Station,Bank,Food Truck,Plaza,Locksmith,Gym,Zoo Exhibit,Farmers Market,Fast Food Restaurant
5,Kingsbridge,Pizza Place,Bar,Bakery,Latin American Restaurant,Mexican Restaurant,Sandwich Place,Fried Chicken Joint,Liquor Store,Donut Shop,Pharmacy
6,Marble Hill,Gym,Coffee Shop,Sandwich Place,Department Store,Video Game Store,Miscellaneous Shop,Shopping Mall,Pharmacy,Seafood Restaurant,Tennis Stadium
7,Woodlawn,Deli / Bodega,Pub,Pizza Place,Playground,Indian Restaurant,Trail,Park,Bar,Donut Shop,Cosmetics Shop
8,Norwood,Pizza Place,Park,Bank,Pharmacy,Plaza,Coffee Shop,Mobile Phone Shop,Grocery Store,Pet Store,Chinese Restaurant
9,Williamsbridge,Caribbean Restaurant,Bar,Nightclub,Soup Place,Zoo Exhibit,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service


In [88]:
ny_merged.loc[ny_merged['Cluster Labels'] == 1, ny_merged.columns[[0,1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
192,Queens,Somerville,Park,Zoo Exhibit,Factory,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
203,Staten Island,Todt Hill,Park,Zoo Exhibit,Factory,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop


In [89]:
ny_merged.loc[ny_merged['Cluster Labels'] == 2, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
179,Neponsit,Beach,Zoo Exhibit,Flower Shop,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop


In [90]:
ny_merged.loc[ny_merged['Cluster Labels'] == 3, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
207,Port Ivory,Bar,Flower Shop,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop


In [91]:
ny_merged.loc[ny_merged['Cluster Labels'] == 4, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
255,Emerson Hill,Construction & Landscaping,Zoo Exhibit,Food,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
303,Bayswater,Playground,Construction & Landscaping,Zoo Exhibit,Flea Market,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant


In [71]:
print('TORONTO vs NEW YORK \n      VENUES')

TORONTO vs NEW YORK 
      VENUES


In [89]:
toronto_venues_list = Counter(toronto_venues['Venue Category'])
tvl=list(toronto_venues_list.most_common())
tvl;

In [90]:
tdf=pd.DataFrame(tvl,columns=['Venue','Toronto_total'])
tdf.head()

Unnamed: 0,Venue,Toronto_total
0,Coffee Shop,187
1,Café,103
2,Restaurant,68
3,Park,53
4,Pizza Place,52


In [91]:
ny_venues_occurences = Counter(ny_venues['Venue Category'])
nyvl=list(ny_venues_occurences.most_common())
nyvl;

In [92]:
nydf=pd.DataFrame(nyvl,columns=['Venue','NY_total'])
nydf.head()

Unnamed: 0,Venue,NY_total
0,Pizza Place,440
1,Coffee Shop,314
2,Italian Restaurant,311
3,Deli / Bodega,253
4,Bakery,226


In [93]:
NYCvTOR = pd.merge(nydf,tdf,how='outer',on='Venue')
NYCvTOR.head()

Unnamed: 0,Venue,NY_total,Toronto_total
0,Pizza Place,440.0,52.0
1,Coffee Shop,314.0,187.0
2,Italian Restaurant,311.0,44.0
3,Deli / Bodega,253.0,12.0
4,Bakery,226.0,39.0


In [96]:
#ratio that could be used to make for equal comparison between Toronto and New York venues counts
ratio = 10050/2156
ratio = float("{:.3f}".format(ratio))
ratio

4.661

In [97]:
# ratios that could have been used to compare the amount of venues in NYC to Toronto 
print('The population of greater NYC is 3.42 times the amount of Greater Toronto')
print('NYC has 2.87 times as many people as Toronto')
print('Venues returned ratio NYC/TOR:', ratio)

The population of greater NYC is 3.42 times the amount of Greater Toronto
NYC has 2.87 times as many people as Toronto
Venues returned ratio NYC/TOR: 4.661


In [98]:
#numbers in millions
Toronto_annual_visitors = 27.5 
Toronto_population = 2.93

In [99]:
NYC_annual_visitors = 67
NYC_population = 8.4

In [100]:
#the greater city areas were not included in this adjustment 
#adjustment ratio used for datframe comparison
adjustment = (NYC_annual_visitors + NYC_population)/(Toronto_annual_visitors + Toronto_population)
adjustment = float("{:.3f}".format(adjustment))
adjustment

2.478

In [102]:
# a population and tourism adjustment was used to more accuretly compare the number of venues in each city 
# a more precise adjustment should be used when singling out each venue
# different venue adjustment ratios could be used and displayed in the highlighted dataframe by changing the first line in this cell
NY_adjusted = NYCvTOR['NY_total']/adjustment
NY_adjusted = NY_adjusted.round(1)
NY_adjusted.head();

In [103]:
NYCvTOR_adjusted = pd.concat([NYCvTOR,NY_adjusted],axis=1)
df = NYCvTOR_adjusted.set_axis(['Venue','NY_total','Toronto_total','NY_adjusted'],axis=1,inplace=False)
df.head()

Unnamed: 0,Venue,NY_total,Toronto_total,NY_adjusted
0,Pizza Place,440.0,52.0,177.6
1,Coffee Shop,314.0,187.0,126.7
2,Italian Restaurant,311.0,44.0,125.5
3,Deli / Bodega,253.0,12.0,102.1
4,Bakery,226.0,39.0,91.2


In [104]:
df=df.fillna(0)
df.tail()

Unnamed: 0,Venue,NY_total,Toronto_total,NY_adjusted
457,Hospital,0.0,1.0,0.0
458,Bed & Breakfast,0.0,1.0,0.0
459,Airport Food Court,0.0,1.0,0.0
460,Plane,0.0,1.0,0.0
461,Auto Workshop,0.0,1.0,0.0


In [105]:
#function used to color code the dataframe for easier comparison
def highlight_values(df):
    
    if df['NY_adjusted'] < 2 and df['Toronto_total'] == 0:
        return ['background-color: white']*4
    
    elif df['NY_adjusted'] < 1 and df['Toronto_total'] < 2:
        return ['background-color: white']*4
    
    elif df['NY_adjusted'] > 5 * df['Toronto_total'] and df['NY_adjusted'] >= 5:
        return ['background-color: red']*4
    
    elif df['NY_adjusted'] > 2 * df['Toronto_total']:
        return ['background-color: orange']*4
    
    elif df['Toronto_total'] > 5 * df['NY_adjusted'] and df['Toronto_total'] >=5:
        return ['background-color: blue']*4
    
    elif df['NY_adjusted'] < 0.5 * df['Toronto_total']:
        return ['background-color: aqua']*4 
    
    else:
        return ['background-color: white']*4

In [106]:
# using different parameters for the adjustment constant would change how the venues are highlighted
# error in the adjustment likely explains why there is way more red than blue 
final_df=df.style.apply(highlight_values, axis=1)

In [107]:
color_legend = pd.DataFrame({'Venue Presence:':["NYC >>>","NYC >",'Similar',"Toronto >","Toronto >>>"], 'Color:':["RED","ORANGE",'WHITE',"LIGHT BLUE","BLUE"]})
color_legend;

In [108]:
print('TORONTO vs NEW YORK \n     COMPARISON \n')
print(color_legend)
final_df

TORONTO vs NEW YORK 
     COMPARISON 

  Venue Presence:      Color:
0         NYC >>>         RED
1           NYC >      ORANGE
2         Similar       WHITE
3       Toronto >  LIGHT BLUE
4     Toronto >>>        BLUE


Unnamed: 0,Venue,NY_total,Toronto_total,NY_adjusted
0,Pizza Place,440,52,177.6
1,Coffee Shop,314,187,126.7
2,Italian Restaurant,311,44,125.5
3,Deli / Bodega,253,12,102.1
4,Bakery,226,39,91.2
5,Bar,220,33,88.8
6,Chinese Restaurant,209,17,84.3
7,Grocery Store,195,27,78.7
8,Sandwich Place,186,43,75.1
9,Mexican Restaurant,178,14,71.8
