# read wiki page to dataframe

In [191]:
import os
import warnings

# Silence library warnings for the whole notebook (kept from the original setup).
warnings.filterwarnings("ignore")

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

In [192]:
# Parse every HTML table on the Wikipedia page; the postal-code table is the first one.
tables = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)
df = tables[0]

# Rows x columns of the raw table.
df.shape

(180, 3)

### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [157]:
# Drop every row whose borough is the placeholder "Not assigned".
has_borough = df["Borough"].ne("Not assigned")
df = df[has_borough]
df.shape

(103, 3)

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [158]:
# After removing the "Not assigned" boroughs, count the rows whose neighbourhood is still
# "Not assigned"; zero means no neighbourhood has to be back-filled from its borough.
# The result is deliberately not called `list`: that name would shadow the built-in
# `list` type for every later cell in the kernel.
unassigned_neighbourhoods = df[df["Neighbourhood"] == 'Not assigned']

len(unassigned_neighbourhoods)


0

### After data cleaning, there are 103 rows left in the table.

In [159]:
# Confirm the dimensions of the cleaned table.
df.shape

(103, 3)

In [160]:
# Preview the first rows of the cleaned table.
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Add geo info of postal code

In [161]:
# Location of the postal-code -> coordinates CSV. Set the GEO_CSV_PATH environment
# variable to point at your own copy; the original author's local path is kept only as a
# fallback so existing setups keep working.
GEO_CSV_PATH = os.environ.get(
    "GEO_CSV_PATH", "C:/Users/jingh/Downloads/Geospatial_Coordinates.csv"
)
df_geo = pd.read_csv(GEO_CSV_PATH)

In [162]:
# Preview the coordinate table (one latitude/longitude pair per postal code).
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [163]:
# Attach the coordinates to each postal code by joining on the shared 'Postal Code' column.
df_merge = df.merge(df_geo, on='Postal Code')

In [164]:
# Preview the merged table: borough, neighbourhood and coordinates per postal code.
df_merge.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Explore and cluster the neighborhoods in Toronto

### Data set explore

In [165]:
#!conda install -c conda-forge folium=0.5.0 --yes 
import folium 

In [166]:
address = 'Toronto, Ontario'

# Look up the city's coordinates with the Nominatim geocoder.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message instead of an
# opaque AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [167]:
# Keep only the boroughs whose name contains "Toronto" and rename Neighbourhood -> Neighborhood.
# rename() is chained (not inplace) so it produces a new frame instead of mutating a slice
# of df_merge; na=False treats a missing borough as "no match" rather than breaking the mask.
df_Toronto = (
    df_merge[df_merge['Borough'].str.contains('Toronto', na=False)]
    .rename(columns={'Neighbourhood': 'Neighborhood'})
)

In [168]:
# Display the frame with a fresh 0..n-1 index. The result is not assigned, so df_Toronto
# itself keeps its original index labels.
df_Toronto.reset_index(drop=True)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


#### Neighborhood distribution in Toronto

In [169]:
# Base map centred on the geocoded Toronto coordinates.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one blue circle per neighborhood row, using the neighborhood name as the popup text.
marker_rows = zip(df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Neighborhood'])
for row_lat, row_lng, hood_name in marker_rows:
    marker = folium.CircleMarker(
        [row_lat, row_lng],
        radius=5,
        popup=folium.Popup(hood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a CircleMarker
        # one; kept as in the original, confirm whether it has any effect here.
        parse_html=False,
    )
    marker.add_to(map_Toronto)

map_Toronto

### Define Foursquare Credentials and Version

In [170]:
# Foursquare credentials are read from the environment so that no secret is stored in, or
# printed by, the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE: the key pair that used to be hard-coded here has been exposed and should be
# treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # maximum number of venues requested per API call
radius = 500 # search radius around each point; presumably metres, confirm against the Foursquare docs

# Report only whether the credentials are present; never echo their values.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('WARNING: FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET is not set; '
          'Foursquare requests will fail.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


### Explore Neighborhoods in Toronto

In [171]:
def getNearbyVenues(borough, names, latitudes, longitudes, radius=500, timeout=30):
    """Query the Foursquare "explore" endpoint around each neighborhood and tabulate the venues.

    Parameters
    ----------
    borough, names, latitudes, longitudes : iterables
        Parallel sequences with one entry per neighborhood; consumed together via ``zip``.
    radius : int, default 500
        Search radius forwarded to the API as the ``radius`` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, with the eight columns listed in ``columns`` below.
        When no venue is returned (or the inputs are empty) the frame is empty but still
        carries all eight columns.

    Raises
    ------
    RuntimeError
        If Foursquare answers with a non-success HTTP status (for example bad credentials).

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``.
    """
    columns = ['Borough',
               'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for bor, name, lat, lng in zip(borough, names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Deliberately not raise_for_status(): its message embeds the full request URL,
        # which contains the client secret.
        if not response.ok:
            raise RuntimeError(
                'Foursquare request for {!r} failed with HTTP status {}'.format(
                    name, response.status_code))

        # A successful reply may still contain no groups/items; treat that as "no venues".
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                bor,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Some venues come back without any category.
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps the schema even when `rows` is
    # empty; assigning `.columns` afterwards fails on an empty frame.
    return pd.DataFrame(rows, columns=columns)

In [172]:
# Collect the venues around every Toronto neighborhood into one dataframe, Toronto_venues.
Toronto_venues = getNearbyVenues(
    borough=df_Toronto['Borough'],
    names=df_Toronto['Neighborhood'],
    latitudes=df_Toronto['Latitude'],
    longitudes=df_Toronto['Longitude'],
)

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
R

In [173]:
# Report the size of the venue table, then preview its first rows.
print(Toronto_venues.shape)
Toronto_venues.head()

(1647, 8)


Unnamed: 0,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [174]:
# Count rows per venue category (count() reports non-null values for each column) and
# list the categories from most to least frequent, ordered by the Neighborhood count.
Toronto_venues.groupby("Venue Category").count().sort_values(by="Neighborhood",ascending=False)

Unnamed: 0_level_0,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Coffee Shop,155,155,155,155,155,155,155
Café,92,92,92,92,92,92,92
Restaurant,54,54,54,54,54,54,54
Italian Restaurant,38,38,38,38,38,38,38
Hotel,38,38,38,38,38,38,38
...,...,...,...,...,...,...,...
Garden Center,1,1,1,1,1,1,1
Fruit & Vegetable Store,1,1,1,1,1,1,1
Frozen Yogurt Shop,1,1,1,1,1,1,1
Flea Market,1,1,1,1,1,1,1


### Keep only the coffee shops

In [175]:
# Keep only the venues categorised as "Coffee Shop". The now-constant category column is
# removed with a chained, non-inplace drop() so a slice of Toronto_venues is never mutated.
Toronto_coffee = (
    Toronto_venues[Toronto_venues['Venue Category'] == "Coffee Shop"]
    .drop(columns="Venue Category")
)
Toronto_coffee.shape

(155, 7)

### Check how many coffee shop brands

In [176]:
# Number of rows per coffee-shop name, most frequent first.
Toronto_coffee['Venue'].value_counts()

Starbucks                            53
Tim Hortons                          25
Pilot Coffee Roasters                10
Dineen @CommerceCourt                 5
Balzac's Coffee                       4
M Square Coffee Co                    4
Everyday Gourmet (Teas & Coffees)     3
Second Cup                            3
Bulldog On The Block                  3
Dark Horse Espresso Bar               2
Versus Coffee                         2
Jimmy's Coffee                        2
Hailed Coffee                         2
Neo Coffee Bar                        2
Aroma Espresso Bar                    2
Mos Mos                               2
Cafe Frappe                           1
Tandem Coffee                         1
Tokyo Smoke                           1
I Deal Coffee                         1
Rooster Coffee House                  1
Arvo                                  1
Balzac’s Coffee Roasters              1
Little Pebbles                        1
Sumach Espresso                       1


In [177]:
# We want to avoid opening our coffee shop near a Starbucks, so Starbucks venues are
# excluded from the target set. .copy() makes Toronto_target an independent frame, so
# adding columns to it later never acts on a slice of Toronto_coffee.
Toronto_target = Toronto_coffee[Toronto_coffee['Venue'] != "Starbucks"].copy()
Toronto_target.shape

(102, 7)

In [178]:
# Summarise the borough column: row count, number of distinct boroughs, and the most common one.
Toronto_target['Borough'].describe()

count                  102
unique                   4
top       Downtown Toronto
freq                    84
Name: Borough, dtype: object

### Cluster the coffee shops

In [179]:
# Run k-means to cluster the existing coffee shops into 5 clusters.
kclusters = 5

# Cluster on the venue coordinates only. The two feature columns are selected by name
# instead of dropping the others with a positional axis argument (`drop([...], 1)`),
# which pandas 2.0 no longer accepts. Selecting by name also keeps a previously added
# 'Cluster Labels' column out of the features if this cell is re-run.
Toronto_grouped_clustering = Toronto_target[['Venue Latitude', 'Venue Longitude']]

Toronto_grouped_clustering

 

Unnamed: 0,Venue Latitude,Venue Longitude
1,43.653559,-79.361809
13,43.658135,-79.359515
14,43.651900,-79.365609
16,43.649963,-79.361442
35,43.653081,-79.357078
...,...,...
1544,43.644373,-79.383065
1576,43.668286,-79.382520
1593,43.669654,-79.379871
1620,43.664167,-79.380149


In [180]:
# Fit k-means on the coordinate features; random_state=0 fixes the random initialisation.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(Toronto_grouped_clustering)

# Peek at the cluster labels assigned to the first ten rows.
kmeans.labels_[:10]

array([4, 4, 4, 4, 4, 4, 1, 0, 1, 1])

In [181]:
# Attach each shop's cluster label at column position 6. A label column left over from a
# previous run is dropped first: DataFrame.insert raises ValueError on a duplicate column
# name, so the cell could not otherwise be re-run. drop() also returns a new frame, so
# the insert never acts on a slice of another dataframe.
Toronto_target = Toronto_target.drop(columns='Cluster Labels', errors='ignore')
Toronto_target.insert(6, 'Cluster Labels', kmeans.labels_)
# Display with a fresh index; the result is not assigned back.
Toronto_target.reset_index(drop=True)

Unnamed: 0,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Cluster Labels,Venue Longitude
0,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,Tandem Coffee,43.653559,4,-79.361809
1,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,Sumach Espresso,43.658135,4,-79.359515
2,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,Rooster Coffee,43.651900,4,-79.365609
3,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,Arvo,43.649963,4,-79.361442
4,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,Dark Horse Espresso Bar,43.653081,4,-79.357078
...,...,...,...,...,...,...,...,...
97,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.382280,Balzac's Coffee,43.644373,1,-79.383065
98,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,Piedmont Coffee Bar,43.668286,1,-79.382520
99,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,Rooster Coffee House,43.669654,1,-79.379871
100,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,The Drink,43.664167,1,-79.380149


In [182]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster, as hex
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(Toronto_target['Venue Latitude'],
                                  Toronto_target['Venue Longitude'],
                                  Toronto_target['Venue'],
                                  Toronto_target['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels are 0-based, so they index the palette directly. The previous
    # `cluster-1` made label 0 wrap around to the last colour via negative indexing.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Examine Clusters

#### Cluster 1:

In [183]:
# Shops assigned to k-means label 0 (headed "Cluster 1" in this notebook).
Cluster1 = Toronto_target[Toronto_target['Cluster Labels'].eq(0)]
Cluster1.shape

(11, 8)

#### Cluster 2:

In [184]:
# Shops assigned to k-means label 1 (headed "Cluster 2" in this notebook).
Cluster2 = Toronto_target[Toronto_target['Cluster Labels'].eq(1)]
Cluster2.shape

(70, 8)

#### Cluster 3: 

In [185]:
# Shops assigned to k-means label 2 (headed "Cluster 3" in this notebook).
Cluster3 = Toronto_target[Toronto_target['Cluster Labels'].eq(2)]
Cluster3.shape

(4, 8)

#### Cluster 4:

In [186]:
# Shops assigned to k-means label 3 (headed "Cluster 4" in this notebook).
Cluster4 = Toronto_target[Toronto_target['Cluster Labels'].eq(3)]
Cluster4.shape

(3, 8)

#### Cluster 5

In [187]:
# Shops assigned to k-means label 4 (headed "Cluster 5" in this notebook).
Cluster5 = Toronto_target[Toronto_target['Cluster Labels'].eq(4)]
Cluster5.shape

(14, 8)

### Find out the neighborhoods of Cluster 2 (the largest cluster)

In [188]:
# Distinct boroughs that the shops in Cluster2 belong to.
Cluster2['Borough'].unique()

array(['Downtown Toronto'], dtype=object)

In [189]:
# Distinct neighborhoods that the shops in Cluster2 belong to.
Cluster2['Neighborhood'].unique()

array(["Queen's Park, Ontario Provincial Government",
       'Garden District, Ryerson', 'St. James Town', 'Berczy Park',
       'Central Bay Street', 'Richmond, Adelaide, King',
       'Harbourfront East, Union Station, Toronto Islands',
       'Toronto Dominion Centre, Design Exchange',
       'Commerce Court, Victoria Hotel',
       'CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport',
       'Stn A PO Boxes', 'First Canadian Place, Underground city',
       'Church and Wellesley'], dtype=object)

In [190]:
# Count the Cluster2 rows per neighborhood (count() reports non-null values for each
# column) and order the neighborhoods by their Venue count, highest first.
Cluster2.groupby("Neighborhood").count().sort_values(by="Venue",ascending=False)

Unnamed: 0_level_0,Borough,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Cluster Labels,Venue Longitude
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Central Bay Street,8,8,8,8,8,8,8
"Commerce Court, Victoria Hotel",8,8,8,8,8,8,8
"First Canadian Place, Underground city",8,8,8,8,8,8,8
"Harbourfront East, Union Station, Toronto Islands",7,7,7,7,7,7,7
"Toronto Dominion Centre, Design Exchange",7,7,7,7,7,7,7
Stn A PO Boxes,6,6,6,6,6,6,6
"Garden District, Ryerson",5,5,5,5,5,5,5
"Richmond, Adelaide, King",5,5,5,5,5,5,5
St. James Town,5,5,5,5,5,5,5
Church and Wellesley,4,4,4,4,4,4,4
