In [1]:
import urllib.request

fp = urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
mybytes = fp.read()

mystr = mybytes.decode("utf8")
fp.close()

<i>The following code parses HTML from the provided Wikipedia article. A number of assumptions are made, specifically that the table containing postal code information is the first table on the page and that the columns correspond to the expected values. The code iterates over each row in the table, extracting postcode, borough, and neighborhood information. These sets of information are concatenated into a single dataframe.</i>

In [2]:
from bs4 import BeautifulSoup
postcode_data = [];

from bs4 import BeautifulSoup

soup = BeautifulSoup(mystr,"html.parser")
for table in soup.find_all('table'):
    for table_row in table.find_all('tr'):
        cells = table_row.find_all('td')
        cells = [ele.text.strip() for ele in cells]
        if (len(cells)>2):
            postcode = cells[0]
            borough = cells[1]
            neighborhood = cells[2]
            if (borough != 'Not assigned'):
                if (neighborhood == 'Not assigned'):
                    neighborhood = borough
                postcode_found = False
                for e in postcode_data:
                    if (e[0] == postcode):
                        postcode_found = True
                        e[2] = e[2] + ", " + neighborhood
                if (postcode_found == False):
                    entry = [postcode,borough,neighborhood]
                    postcode_data.append(entry);
    break
    


In [3]:
import pandas as pd
neighborhoods = pd.DataFrame(postcode_data,columns=['Postal Code','Borough','Neighborhood'])
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [4]:
print(neighborhoods.shape)

(103, 3)


<i>Read latitude and longitude geospacial data from cocl.us and decode it into a dataframe</i>

In [5]:
import pandas as pd
import io
import requests
url="http://cocl.us/Geospatial_data"
s=requests.get(url).content
coords_df=pd.read_csv(io.StringIO(s.decode('utf-8')))

<i>Extract latitude and longitude information for each postcode and append it to the current dataframe</i>

In [6]:
latitudes = []
longitudes = []
for index,r in neighborhoods.iterrows():
    postal_code = r['Postal Code']
    coord_data = coords_df[coords_df['Postal Code'] == postal_code]
    latitudes.append(coord_data['Latitude'].iloc[0])
    longitudes.append(coord_data['Longitude'].iloc[0])
neighborhoods['Latitude'] = latitudes
neighborhoods['Longitude'] = longitudes
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [7]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  51.84 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  25.25 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  29.85 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  29.40 MB/s


In [8]:
address = 'Toronto, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [9]:
map_toronto = folium.Map(location=[latitude,longitude], zoom_start=13)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

In [10]:
CLIENT_ID = 'VUP1TBJLXVSHUMRWZJMSYBR43F4UNKOKI2UDJYI4YKHC0AKQ' # your Foursquare ID
CLIENT_SECRET = '0L55FZILWPVWS5D2KV2DAUSFCF0YHF0YG23GCWQL3MUTVRGG' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: VUP1TBJLXVSHUMRWZJMSYBR43F4UNKOKI2UDJYI4YKHC0AKQ
CLIENT_SECRET:0L55FZILWPVWS5D2KV2DAUSFCF0YHF0YG23GCWQL3MUTVRGG


In [11]:
neighborhood_latitude = neighborhoods.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighborhoods.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = neighborhoods.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat,lng, 
            radius, 
            100)
            
        print(url)
        # make the GET request
        result = requests.get(url)
        print(result)
        print(result.json())
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
neighborhoods
toronto_data = neighborhoods[neighborhoods['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data
toronto_venues = getNearbyVenues(toronto_data['Neighborhood'],toronto_data['Latitude'],toronto_data['Longitude'])
print(toronto_venues.shape)
toronto_venues.head()
toronto_venues.groupby('Neighborhood').count()
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

Harbourfront, Regent Park
https://api.foursquare.com/v2/venues/explore?&client_id=VUP1TBJLXVSHUMRWZJMSYBR43F4UNKOKI2UDJYI4YKHC0AKQ&client_secret=0L55FZILWPVWS5D2KV2DAUSFCF0YHF0YG23GCWQL3MUTVRGG&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100
<Response [200]>
{'response': {'suggestedBounds': {'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}, 'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486}}, 'suggestedFilters': {'header': 'Tap to show:', 'filters': [{'name': 'Open now', 'key': 'openNow'}]}, 'totalResults': 48, 'headerLocation': 'Corktown', 'headerLocationGranularity': 'neighborhood', 'groups': [{'name': 'recommended', 'type': 'Recommended Places', 'items': [{'reasons': {'items': [{'type': 'general', 'summary': 'This spot is popular', 'reasonName': 'globalInteractionReason'}], 'count': 0}, 'referralId': 'e-0-54ea41ad498e9a11e9e13308-0', 'venue': {'name': 'Roselle Desserts', 'id': '54ea41ad498e9a11e9e13308', 'location': {'crossStreet': 'Trinity St', 'distance': 143,

In [14]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
toronto_onehot.shape

(1697, 234)

In [16]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0,0.0
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.05,0.0,0.04,0.01,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011494,0.011494,0.011494,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011494,0.011494,0.0,0.011494,0.0


In [17]:
toronto_grouped.shape

(38, 234)

In [18]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2  American Restaurant  0.04
3      Thai Restaurant  0.04
4           Steakhouse  0.04


----Berczy Park----
          venue  freq
0   Coffee Shop  0.07
1  Cocktail Bar  0.05
2    Restaurant  0.05
3    Steakhouse  0.04
4           Pub  0.04


----Brockton, Exhibition Place, Parkdale Village----
               venue  freq
0               Café  0.11
1     Breakfast Spot  0.11
2        Coffee Shop  0.11
3  Convenience Store  0.06
4                Gym  0.06


----Business Reply Mail Processing Centre 969 Eastern----
           venue  freq
0     Smoke Shop  0.06
1  Garden Center  0.06
2         Garden  0.06
3  Auto Workshop  0.06
4     Restaurant  0.06


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0   Airport Service  0.14
1  Airport Terminal  0.14
2    Airport Lounge  0.14
3  

In [19]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [20]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,American Restaurant,Thai Restaurant,Asian Restaurant,Clothing Store,Gym,Bakery,Bar
1,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Steakhouse,Cheese Shop,Café,Pub,Farmers Market,Seafood Restaurant,Bakery
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Breakfast Spot,Café,Performing Arts Venue,Burrito Place,Bar,Stadium,Caribbean Restaurant,Furniture / Home Store,Climbing Gym
3,Business Reply Mail Processing Centre 969 Eastern,Park,Butcher,Skate Park,Smoke Shop,Farmers Market,Light Rail Station,Fast Food Restaurant,Spa,Brewery,Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Sculpture Garden,Plane
5,"Cabbagetown, St. James Town",Pizza Place,Restaurant,Coffee Shop,Italian Restaurant,Indian Restaurant,Bakery,Pub,Café,Market,Park
6,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Chinese Restaurant,Burger Joint,Ice Cream Shop,Bar,Bubble Tea Shop,Spa,Sandwich Place
7,"Chinatown, Grange Park, Kensington Market",Café,Bar,Vegetarian / Vegan Restaurant,Bakery,Dumpling Restaurant,Coffee Shop,Vietnamese Restaurant,Mexican Restaurant,Chinese Restaurant,Gaming Cafe
8,Christie,Café,Grocery Store,Park,Baby Store,Italian Restaurant,Diner,Nightclub,Restaurant,Coffee Shop,Convenience Store
9,Church and Wellesley,Japanese Restaurant,Sushi Restaurant,Coffee Shop,Gay Bar,Restaurant,Burger Joint,Pub,Gastropub,Men's Store,Bubble Tea Shop


In [21]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 3, 0, 3, 3, 3, 3, 3, 3], dtype=int32)

In [22]:
toronto_merged = toronto_data

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,3,Coffee Shop,Café,Park,Bakery,Pub,Mexican Restaurant,Breakfast Spot,Theater,Beer Store,Chocolate Shop
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,3,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Restaurant,Ramen Restaurant,Plaza,Diner,Pizza Place
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Coffee Shop,Restaurant,Café,Hotel,Clothing Store,Bakery,Gastropub,Italian Restaurant,Breakfast Spot,Cocktail Bar
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Coffee Shop,Pub,Music Venue,Women's Store,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3,Coffee Shop,Cocktail Bar,Restaurant,Steakhouse,Cheese Shop,Café,Pub,Farmers Market,Seafood Restaurant,Bakery


In [31]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters