In [7]:
!pip install beautifulsoup4
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from sklearn.cluster import KMeans
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 15.7MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/02/fb/1c65691a9aeb7bd6ac2aa505b84cb8b49ac29c976411c6ab3659425e045f/soupsieve-2.1-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 soupsieve-2.1


In [8]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes and
# locate the first table element on it.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, "html.parser")
# soup.find returns the first matching element, or None when there is none.
table = soup.find('table')
if table is None:
    raise ValueError('No table element found at {}'.format(url))

In [9]:
#df: PostalCode, Borough, Neighborhood
col_names = ['Postalcode','Borough','Neighborhood']

# Collect the stripped cell text of every table row that has exactly one cell
# per column (header rows contain no 'td' cells and are skipped), then build
# the frame in a single step instead of growing it one row at a time.
rows = []
for tr in table.find_all('tr'):
    row = [td.text.strip() for td in tr.find_all('td')]
    if len(row) == len(col_names):
        rows.append(row)

df = pd.DataFrame(rows, columns=col_names)

In [10]:
# Preview the first rows of the scraped table (rows marked 'Not assigned' are still present at this point).
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [11]:
# Keep only the postal codes whose borough is something other than 'Not assigned'.
# The original row labels are kept (the index is not reset).
df = df.loc[df['Borough'].ne('Not assigned')]
# Preview the filtered table.
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
# (rows, columns) of the table after the 'Not assigned' boroughs were removed
df.shape

(103, 3)

In [13]:
# Look up the coordinates of a postal code through the `geocoder` package.
# NOTE(review): `geocoder` is not imported in this notebook's import cell; it
# has to be imported before this function is called. The notebook reads the
# coordinates from a CSV file instead because this lookup was unsuccessful.
def get_geocode(postal_code, max_attempts=5, retry_delay=1.0):
    """Return ``(latitude, longitude)`` for a Toronto postal code.

    The lookup is retried while the geocoder returns no coordinates, but only
    up to ``max_attempts`` times, so a persistently failing service can no
    longer block the notebook forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up; it is queried as
        ``'<postal_code>, Toronto, Ontario'``.
    max_attempts : int, optional
        Maximum number of lookups before giving up.
    retry_delay : float, optional
        Seconds to wait between two attempts; ``0`` disables the wait.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder's ``latlng`` result.

    Raises
    ------
    RuntimeError
        If no coordinates were returned after ``max_attempts`` lookups.
    """
    import time

    query = '{}, Toronto, Ontario'.format(postal_code)
    for attempt in range(max_attempts):
        geo = geocoder.google(query)
        coords = geo.latlng
        if coords is not None:
            return coords[0], coords[1]
        # Pause before the next attempt, but not after the final one.
        if retry_delay and attempt < max_attempts - 1:
            time.sleep(retry_delay)

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(postal_code, max_attempts)
    )

In [14]:
# Load the latitude/longitude of each postal code from the published CSV file.
df_geo = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
# Preview the coordinates table.
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [54]:
# Give the coordinates table the same key column name as the scraped table.
df_geo.rename(columns={'Postal Code': 'Postalcode'}, inplace=True)
# Inner-join both tables on the postal code (pandas' default join type).
geo_merged = df_geo.merge(df, on='Postalcode')
# Put the descriptive columns first and the coordinates last.
geo_data = geo_merged[[
    'Postalcode',
    'Borough',
    'Neighborhood',
    'Latitude',
    'Longitude',
]]
geo_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [16]:
# Foursquare API credentials are read from the environment so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Credentials that were previously committed in
# plain text should be treated as exposed and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604' # sent to the API as the `v` query parameter

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn right away rather than failing later with an opaque API error.
    print('Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.')

In [17]:
# I'm planning a bar crawl with some friends and want to find the Toronto boroughs with the most bars.
def getNearbyBars(names, latitudes, longitudes, search_query='Bar', radius=100, limit=10):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    One request is made per ``(name, latitude, longitude)`` triple, using the
    module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION`` values. Each
    name is printed before its request as a progress indicator.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences describing the locations to search around.
    search_query : str, optional
        Value sent as the ``query`` parameter.
    radius : int, optional
        Value sent as the ``radius`` parameter.
    limit : int, optional
        Value sent as the ``limit`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still has these columns, when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'll': '{},{}'.format(lat, lng),
            'v': VERSION,
            'query': search_query,
            'radius': radius,
            'limit': limit,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        # Tolerate a payload without groups/items: it simply contributes no rows.
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # A venue may come without any category; record None in that case.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names here keeps the schema even when there are no rows.
    nearby_bars = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_bars)

In [18]:
# Fetch the nearby venues for every row of geo_data (one request per row).
bars = getNearbyBars(
    names=geo_data['Neighborhood'],
    latitudes=geo_data['Latitude'],
    longitudes=geo_data['Longitude'],
)

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

In [19]:
# Preview the first venues that were returned.
bars.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Davisville,43.704324,-79.38879,Zee Grill,43.704985,-79.388476,Seafood Restaurant
1,Davisville,43.704324,-79.38879,Crescendo Oil & Vinegar + Spices,43.703663,-79.388281,Winery
2,Church and Wellesley,43.66586,-79.38316,Fox & Fiddle,43.665031,-79.383098,Pub
3,Church and Wellesley,43.66586,-79.38316,Pride Toronto HQ,43.666203,-79.383833,Performing Arts Venue
4,"Regent Park, Harbourfront",43.65426,-79.360636,Henrietta Lane,43.653911,-79.361395,Cocktail Bar


In [24]:
# Count the non-null entries of every column per neighborhood, i.e. how many venues each one has.
bars.groupby(by='Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Church and Wellesley,2,2,2,2,2,2
"Commerce Court, Victoria Hotel",2,2,2,2,2,2
Davisville,2,2,2,2,2,2
"First Canadian Place, Underground city",2,2,2,2,2,2
"Garden District, Ryerson",1,1,1,1,1,1
"Kensington Market, Chinatown, Grange Park",4,4,4,4,4,4
"Little Portugal, Trinity",3,3,3,3,3,3
"Regent Park, Harbourfront",1,1,1,1,1,1
"Richmond, Adelaide, King",6,6,6,6,6,6
St. James Town,4,4,4,4,4,4


In [25]:
# One-hot encode the venue category: one indicator column per distinct category,
# named after the bare category (empty prefix and separator).
toronto_onehot = pd.get_dummies(
    bars[['Venue Category']],
    prefix_sep="",
    prefix="",
)
# Put the neighborhood name back as the first column so rows can be grouped by it.
toronto_onehot.insert(0, 'Neighborhood', bars['Neighborhood'])
toronto_onehot.shape

(34, 19)

In [26]:
# Preview the one-hot encoded venues (one row per venue).
toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Bar,Breakfast Spot,Brewery,Cocktail Bar,Coffee Shop,Gastropub,Gay Bar,Hotel Bar,Italian Restaurant,Karaoke Bar,Liquor Store,Performing Arts Venue,Pub,Restaurant,Seafood Restaurant,Winery
0,Davisville,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Davisville,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,Church and Wellesley,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,Church and Wellesley,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# Average the indicator columns per neighborhood: each value becomes the share
# of that neighborhood's venues that fall into the category.
per_neighborhood = toronto_onehot.groupby('Neighborhood')
toronto_grouped = per_neighborhood.mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Bar,Breakfast Spot,Brewery,Cocktail Bar,Coffee Shop,Gastropub,Gay Bar,Hotel Bar,Italian Restaurant,Karaoke Bar,Liquor Store,Performing Arts Venue,Pub,Restaurant,Seafood Restaurant,Winery
0,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0
1,"Commerce Court, Victoria Hotel",0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
2,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5
3,"First Canadian Place, Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
4,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first entry of ``row`` is skipped (callers in this notebook pass rows
    whose first entry is the neighborhood name). The remaining entries are
    sorted in descending order and the index labels of the leading
    ``num_top_venues`` entries are returned as an array.
    """
    return (
        row.iloc[1:]
        .sort_values(ascending=False)
        .index.values[:num_top_venues]
    )

In [55]:
# Number of top categories to report per neighborhood.
num_top_venues = 3

# Ordinal suffixes for the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Choose the suffix with an explicit bounds check rather than a bare
    # `except:`, which would also hide unrelated errors.
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns = columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# Fill the ranking columns of each row with that neighborhood's top categories.
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Church and Wellesley,Pub,Performing Arts Venue,Winery
1,"Commerce Court, Victoria Hotel",American Restaurant,Pub,Gastropub
2,Davisville,Winery,Seafood Restaurant,Asian Restaurant
3,"First Canadian Place, Underground city",Liquor Store,Coffee Shop,Winery
4,"Garden District, Ryerson",Gastropub,Winery,Seafood Restaurant


In [57]:
# set number of clusters
kclusters = 4

# Cluster on the category frequencies only; the neighborhood name is a label,
# not a feature. `columns=` is used because passing the axis positionally
# (`drop('Neighborhood', 1)`) is no longer accepted by current pandas.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (random_state is fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([2, 2, 1, 1, 3, 1, 1, 0, 1, 1], dtype=int32)

In [58]:
# Preview the feature matrix that was passed to k-means (category frequencies only).
toronto_grouped_clustering.head()

Unnamed: 0,American Restaurant,Asian Restaurant,Bar,Breakfast Spot,Brewery,Cocktail Bar,Coffee Shop,Gastropub,Gay Bar,Hotel Bar,Italian Restaurant,Karaoke Bar,Liquor Store,Performing Arts Venue,Pub,Restaurant,Seafood Restaurant,Winery
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0
1,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
#add clusters
# DataFrame.insert raises when the column already exists, so drop a label
# column left over from an earlier run to keep this cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
# kmeans.labels_ is aligned row by row with toronto_grouped, which is also the
# row order of neighborhoods_venues_sorted.
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,2,Church and Wellesley,Pub,Performing Arts Venue,Winery
1,2,"Commerce Court, Victoria Hotel",American Restaurant,Pub,Gastropub
2,1,Davisville,Winery,Seafood Restaurant,Asian Restaurant
3,1,"First Canadian Place, Underground city",Liquor Store,Coffee Shop,Winery
4,3,"Garden District, Ryerson",Gastropub,Winery,Seafood Restaurant
5,1,"Kensington Market, Chinatown, Grange Park",Bar,Karaoke Bar,Cocktail Bar
6,1,"Little Portugal, Trinity",Asian Restaurant,Bar,Brewery
7,0,"Regent Park, Harbourfront",Cocktail Bar,Winery,Seafood Restaurant
8,1,"Richmond, Adelaide, King",Restaurant,Hotel Bar,Breakfast Spot
9,1,St. James Town,Bar,Gay Bar,Italian Restaurant


In [67]:
# Attach postal code, borough and coordinates to every clustered neighborhood.
# A left join keeps all clustered rows; a neighborhood name that occurs several
# times in geo_data produces one output row per match.
toronto_bars = neighborhoods_venues_sorted.merge(
    geo_data,
    how='left',
    on='Neighborhood',
)
toronto_bars.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,Postalcode,Borough,Latitude,Longitude
0,2,Church and Wellesley,Pub,Performing Arts Venue,Winery,M4Y,Downtown Toronto,43.66586,-79.38316
1,2,"Commerce Court, Victoria Hotel",American Restaurant,Pub,Gastropub,M5L,Downtown Toronto,43.648198,-79.379817
2,1,Davisville,Winery,Seafood Restaurant,Asian Restaurant,M4S,Central Toronto,43.704324,-79.38879
3,1,"First Canadian Place, Underground city",Liquor Store,Coffee Shop,Winery,M5X,Downtown Toronto,43.648429,-79.38228
4,3,"Garden District, Ryerson",Gastropub,Winery,Seafood Restaurant,M5B,Downtown Toronto,43.657162,-79.378937


In [79]:
#show bar venue clusters
map_clusters = folium.Map(location=[43.66, -79.38], zoom_start=12)

# set color scheme for the clusters: one hex color per cluster label, sampled
# evenly from the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_bars['Latitude'], toronto_bars['Longitude'], toronto_bars['Neighborhood'], toronto_bars['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly; the former
    # `cluster-1` only worked for label 0 through negative-index wraparound.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters