# PART I: Create dataframe with Toronto postal codes, boroughs and neighborhoods

In [6]:
# import libraries
import numpy as np
import pandas as pd
import requests
# import beautiful soup library for scraping
from bs4 import BeautifulSoup
print('libraries imported')

libraries imported


Now let's get the content of the Wikipedia page:

In [24]:
# Download the Wikipedia page that lists the postal codes starting with "M"
from io import StringIO  # local import: read_html expects a file-like object for literal HTML

page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
page.raise_for_status()  # fail here on an HTTP error instead of parsing an error page

# Parse the HTML, collect every <table> element and load the first one into a dataframe
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all('table')
df = pd.read_html(StringIO(str(table)))[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [26]:
# Keep only the rows whose borough is assigned, then renumber the index from 0
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Now check for duplicates in postal code:

In [37]:
# Flag every postal code that occurs more than once (keep=False marks all copies),
# then count how many rows carry each flag value
x = df['Postal Code'].duplicated(keep=False).to_frame()
dups_postal = x.groupby(['Postal Code']).size()
print(dups_postal)

Postal Code
False    103
dtype: int64


So there are no duplicates: every postal code in the dataframe `df` occurs exactly once.
Now check for unassigned neighborhoods:

In [41]:
# show the unassigned neighborhoods in the dataframe
# (match the marker 'Not assigned' exactly, ignoring any surrounding whitespace;
#  nothing is printed when every neighbourhood is assigned)
is_unassigned = df['Neighbourhood'].str.strip().eq('Not assigned')
unassigned_rows = df.loc[is_unassigned, ['Postal Code', 'Neighbourhood']]
for postal_code, neighbourhood in unassigned_rows.itertuples(index=False):
    print(postal_code, neighbourhood)

So that means that we don't have any unassigned neighborhoods

In [110]:
# Once again, the dataframe: display up to the first 20 rows
df.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [44]:
# And the shape of the dataframe, as a (rows, columns) tuple:
df.shape

(103, 3)

# PART II: Get coordinates into dataframe

In [132]:
#conda install -c conda-forge geocoder

Now that geocoder is installed, let's check that it works:

In [90]:
# import geocoder and look up a single postal code as a smoke test
import geocoder

g = geocoder.arcgis('M7A')
g.latlng

[43.66253000000006, -79.39187999999996]

It works, so let's get the coordinates into the dataframe

In [125]:
# Number of rows in the dataframe
len(df)

103

In [124]:
# create lists for latitudes and longitudes and fill them with the coordinates of each postal code
lat = []
lng = []
# Iterate over the column by name: the loop then follows the dataframe's actual
# length and does not depend on 'Postal Code' being the first column.
for postcode in df['Postal Code']:
    g = geocoder.arcgis(postcode)
    # values are stored exactly as geocoder reports them
    # NOTE(review): check lat/lng for missing entries in case a lookup fails
    lat.append(g.lat)
    lng.append(g.lng)

Now append the arrays to the dataframe

In [134]:
# Attach the geocoded coordinates as two new columns and preview the result
df = df.assign(Latitude=lat, Longitude=lng)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.66263,-79.52831
6,M1B,Scarborough,"Malvern, Rouge",43.81139,-79.19662
7,M3B,North York,Don Mills,43.74923,-79.36186
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.70718,-79.31192
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804


# PART III: Explore and cluster the neighborhoods in Toronto

#### Let's import key libraries and define the key variables for foursquare request

In [133]:
# import key libraries
import json
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
print('libraries imported')

libraries imported


In [147]:
import os  # local import: the credentials are read from the environment

# Foursquare API credentials come from the environment so that no secret is stored
# in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# running this cell; a missing variable raises KeyError.
# NOTE(review): any credentials that were ever committed to this notebook should be rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180605'  # sent to Foursquare as the `v` query parameter
LIMIT = 100 # we want 100 results per neighbourhood
radius = 1000 # we want to limit the radius from neighbourhood centre to 1km

## Create function to get nearby venues for a given neighborhood

In [148]:
def getNearbyVenues(names, latitudes, longitudes, radius=radius):
    """Query the Foursquare `venues/explore` endpoint around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighbourhood name and centre coordinates; they are iterated in
        parallel with ``zip``, so the shortest one determines how many
        requests are made.
    radius : number, optional
        Search radius sent as the ``radius`` query parameter. Defaults to the
        value of the module-level ``radius`` at the time this function is defined.

    Returns
    -------
    pandas.DataFrame
        One row per venue returned by the API, with the neighbourhood name and
        coordinates repeated on each row. When no venue is returned at all the
        frame is empty but still has the seven expected columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and encode the query string instead of formatting
        # the credentials into the URL by hand
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps one stalled call from hanging the whole loop
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets a missing value instead of raising IndexError
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

## Now for each neighbourhood, we get nearby venues and put that into a new dataframe:

In [149]:
# Collect the venues around every neighbourhood centre into one dataframe
toronto_venues = getNearbyVenues(df['Neighbourhood'], df['Latitude'], df['Longitude'])
print(toronto_venues.shape)
toronto_venues.head(10)

(5062, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.75245,-79.32991,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.75245,-79.32991,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.75245,-79.32991,Tim Hortons,43.760668,-79.326368,Café
3,Parkwoods,43.75245,-79.32991,Bruno's valu-mart,43.746143,-79.32463,Grocery Store
4,Parkwoods,43.75245,-79.32991,A&W,43.760643,-79.326865,Fast Food Restaurant
5,Parkwoods,43.75245,-79.32991,Shoppers Drug Mart,43.745315,-79.3258,Pharmacy
6,Parkwoods,43.75245,-79.32991,High Street Fish & Chips,43.74526,-79.324949,Fish & Chips Shop
7,Parkwoods,43.75245,-79.32991,Food Basics,43.760549,-79.326045,Supermarket
8,Parkwoods,43.75245,-79.32991,Variety Store,43.751974,-79.333114,Food & Drink Shop
9,Parkwoods,43.75245,-79.32991,Pizza Pizza,43.760231,-79.325666,Pizza Place


In [150]:
# For each neighbourhood name, count the non-null entries in every other column
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,46,46,46,46,46,46
"Alderwood, Long Branch",29,29,29,29,29,29
"Bathurst Manor, Wilson Heights, Downsview North",31,31,31,31,31,31
Bayview Village,8,8,8,8,8,8
"Bedford Park, Lawrence Manor East",35,35,35,35,35,35
...,...,...,...,...,...,...
"Willowdale, Willowdale West",16,16,16,16,16,16
Woburn,17,17,17,17,17,17
Woodbine Heights,68,68,68,68,68,68
York Mills West,20,20,20,20,20,20


#### Now do one-hot encoding and clustering by category

In [176]:
# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called 'Neighborhood' would collide with the
# neighbourhood-name column added below, so that indicator column is dropped
# explicitly (errors='ignore' makes this a no-op when no such category exists).
toronto_onehot = toronto_onehot.drop(columns='Neighborhood', errors='ignore')

# add the neighborhood name as the first column; inserting at position 0 avoids
# depending on how many category columns the encoding happened to produce
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

toronto_onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,Arcade,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now group by neighborhood once again

In [181]:
# Average every indicator column per neighbourhood name; 'Neighborhood' stays a
# regular column and the rows get a fresh 0..n-1 index
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
for frame in (toronto_onehot, toronto_grouped):
    print(frame.shape)
toronto_grouped.head()

(5062, 337)
(98, 337)


Unnamed: 0,Neighborhood,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,Arcade,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.021739,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.032258,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,...,0.0,0.028571,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0


## Extract the most frequent venue types for a given neighborhood:

In [288]:
# For every row of toronto_grouped, keep the names of the venue-category columns
# with the highest values, largest first. Keys of the dictionary are the row positions.
top_n = 10  # number of categories kept per neighbourhood
popular_places = {}
for index in range(len(toronto_grouped)):
    # single-row frame holding every column except the leading 'Neighborhood' one
    row = toronto_grouped.iloc[[index], 1:]
    # sort the columns by that row's values; the row is addressed by its own label
    ranked = row.sort_values(by=toronto_grouped.index[index], axis=1, ascending=False)
    popular_places[index] = np.asarray(ranked.columns[:top_n])

Convert dictionary to dataframe

In [292]:
# Build a dataframe with one row per dictionary key (orient='index');
# the array stored under each key supplies that row's values
pd_popular_places=pd.DataFrame.from_dict(popular_places, orient='index')
pd_popular_places.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Chinese Restaurant,Shopping Mall,Sandwich Place,Coffee Shop,Print Shop,Skating Rink,Cantonese Restaurant,Mediterranean Restaurant,Shanghai Restaurant,Filipino Restaurant
1,Pizza Place,Coffee Shop,Pharmacy,Bank,Seafood Restaurant,Toy / Game Store,Sandwich Place,Bar,Café,Beer Store
2,Gas Station,Bank,Coffee Shop,Bus Line,Sushi Restaurant,Baseball Field,Mediterranean Restaurant,Shopping Mall,Middle Eastern Restaurant,Mobile Phone Shop
3,Park,Bank,Café,Chinese Restaurant,Trail,Flower Shop,Japanese Restaurant,Noodle House,Paintball Field,Pastry Shop
4,Italian Restaurant,Coffee Shop,Sandwich Place,Juice Bar,Park,Liquor Store,Locksmith,Cosmetics Shop,Pharmacy,Comfort Food Restaurant


In [302]:
# Insert the neighbourhood names from toronto_grouped as the first column.
# insert() modifies the frame in place, so running this cell a second time fails
# because the 'Neighborhood' column already exists.
# NOTE(review): relies on pd_popular_places and toronto_grouped sharing the same row labels - confirm
pd_popular_places.insert(loc=0, column='Neighborhood', value=toronto_grouped['Neighborhood'])
pd_popular_places.head()

Unnamed: 0,Neighborhood,0,1,2,3,4,5,6,7,8,9
0,Agincourt,Chinese Restaurant,Shopping Mall,Sandwich Place,Coffee Shop,Print Shop,Skating Rink,Cantonese Restaurant,Mediterranean Restaurant,Shanghai Restaurant,Filipino Restaurant
1,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pharmacy,Bank,Seafood Restaurant,Toy / Game Store,Sandwich Place,Bar,Café,Beer Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Gas Station,Bank,Coffee Shop,Bus Line,Sushi Restaurant,Baseball Field,Mediterranean Restaurant,Shopping Mall,Middle Eastern Restaurant,Mobile Phone Shop
3,Bayview Village,Park,Bank,Café,Chinese Restaurant,Trail,Flower Shop,Japanese Restaurant,Noodle House,Paintball Field,Pastry Shop
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Sandwich Place,Juice Bar,Park,Liquor Store,Locksmith,Cosmetics Shop,Pharmacy,Comfort Food Restaurant


In [313]:
# Rename the columns: the neighbourhood name followed by ordinal labels
# ('1st' .. '10th') for the ten ranked venue-category columns
pd_popular_places.columns=['Neighborhood','1st','2nd','3rd','4th','5th','6th','7th','8th','9th','10th']
pd_popular_places.head()

Unnamed: 0,Neighborhood,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th
0,Agincourt,Chinese Restaurant,Shopping Mall,Sandwich Place,Coffee Shop,Print Shop,Skating Rink,Cantonese Restaurant,Mediterranean Restaurant,Shanghai Restaurant,Filipino Restaurant
1,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pharmacy,Bank,Seafood Restaurant,Toy / Game Store,Sandwich Place,Bar,Café,Beer Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Gas Station,Bank,Coffee Shop,Bus Line,Sushi Restaurant,Baseball Field,Mediterranean Restaurant,Shopping Mall,Middle Eastern Restaurant,Mobile Phone Shop
3,Bayview Village,Park,Bank,Café,Chinese Restaurant,Trail,Flower Shop,Japanese Restaurant,Noodle House,Paintball Field,Pastry Shop
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Sandwich Place,Juice Bar,Park,Liquor Store,Locksmith,Cosmetics Shop,Pharmacy,Comfort Food Restaurant


## Now cluster the neighborhoods

In [315]:
# number of clusters to form
kclusters = 5

# feature matrix: the per-neighbourhood frequencies without the name column
toronto_grouped_clustering = toronto_grouped.drop(columns=['Neighborhood'])

# fit k-means with a fixed random_state
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(toronto_grouped_clustering)

# peek at the labels assigned to the first ten rows
kmeans.labels_[:10]

array([1, 1, 1, 4, 1, 1, 4, 1, 1, 1], dtype=int32)

##### Create a dataframe that includes neighborhood, cluster as well as coordinates

In [316]:
# add clustering labels as the first column
# insert() modifies the frame in place, so running this cell a second time fails
# because the 'Cluster Labels' column already exists.
pd_popular_places.insert(0, 'Cluster Labels', kmeans.labels_)

In [317]:
# Index the venue table by neighbourhood name, then look up each row of df by its
# 'Neighbourhood' value to attach the cluster label and the ranked venue categories
venues_by_neighborhood = pd_popular_places.set_index('Neighborhood')
toronto_merged = df.join(venues_by_neighborhood, on='Neighbourhood')

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th
0,M3A,North York,Parkwoods,43.75245,-79.32991,4.0,Park,Bus Stop,Convenience Store,Café,Caribbean Restaurant,Food & Drink Shop,Supermarket,Chinese Restaurant,Grocery Store,Fish & Chips Shop
1,M4A,North York,Victoria Village,43.73057,-79.31306,1.0,Bus Line,Middle Eastern Restaurant,Thai Restaurant,Thrift / Vintage Store,Coffee Shop,Mediterranean Restaurant,Pizza Place,Hockey Arena,Park,Portuguese Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264,1.0,Coffee Shop,Café,Restaurant,Park,Theater,Bakery,Thai Restaurant,Pub,Diner,Gym / Fitness Center
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042,1.0,Clothing Store,Coffee Shop,Restaurant,Dessert Shop,Fast Food Restaurant,Furniture / Home Store,Greek Restaurant,Fried Chicken Joint,Grocery Store,Cosmetics Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188,1.0,Coffee Shop,Restaurant,Italian Restaurant,Sushi Restaurant,Café,Park,Gastropub,Japanese Restaurant,Pizza Place,Clothing Store


In [341]:
# Rows whose neighbourhood has no match in the venue table come out of the join
# with a missing cluster label. Drop them instead of filling with 0: 0 is a real
# cluster id, so filling would silently assign those rows to that cluster.
toronto_merged = toronto_merged.dropna(subset=["Cluster Labels"]).reset_index(drop=True)
toronto_merged["Cluster Labels"] = toronto_merged["Cluster Labels"].astype(int)
toronto_merged.dtypes

Postal Code        object
Borough            object
Neighbourhood      object
Latitude          float64
Longitude         float64
Cluster Labels      int64
1st                object
2nd                object
3rd                object
4th                object
5th                object
6th                object
7th                object
8th                object
9th                object
10th               object
dtype: object

## Finally, let's create a map to visualize all this:

In [343]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters: one evenly spaced colour of the rainbow
# colormap per cluster, converted to a hex string
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per row of toronto_merged, coloured by its cluster label
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index the palette with the label itself rather than relying on negative-index wrap-around
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters