In [379]:
import numpy as np
import pandas as pd
import json
from geopy.geocoders import Nominatim 
import requests
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
print('Libraries imported.')

Libraries imported.


# 1. Build Toronto Postal Codes Dataframe

In [169]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

### Extract Wikipedia table using Beautiful Soup

In [423]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(website_url,'lxml')
#Build table object
My_table = soup.find('table',{'class':'wikitable sortable'})
whole_list=My_table.findAll("tr")
postalcode=[]
borough=[]
neighborhood=[]

### Loop to copy and paste each beautiful soup cell object into lists|

In [424]:
i=0
for tr in My_table.findAll("tr"):
    sub_list=whole_list[i].findAll("td")
    if sub_list :
            postalcode.append(sub_list[0].get_text())
            borough.append(sub_list[1].get_text()) 
            spec_list=sub_list[2].string
            if spec_list!="Not assigned\n" :
                    spec_list=sub_list[2].findAll("a")
                    if spec_list :
                        neighborhood.append(spec_list[0].get('title'))
                    else:
                        neighborhood.append(sub_list[2].get_text().strip()) 
            else:
                    neighborhood.append(sub_list[2].get_text().strip())
    i=i+1

### Create and format dataframe from previous extracted data

In [425]:
#Create dataframe with data from previous 
df=pd.DataFrame()
df['PostalCode']=postalcode
df['Borough']=borough
df['Neighborhood']=neighborhood

#Remove from dataframe (Toronto) and ", Toronto"
df['Neighborhood']=df.loc[ : , 'Neighborhood' ].str.replace(r"\(.*\)","").str.replace(', Toronto','')

#Remove from dataframe "Not assigned" Borough rows
dt=df[df.Borough!= 'Not assigned']
dt = dt.reset_index(drop=True)

#Loop to copy Borough in "Not assigned" Neighborhood cells 
for idx, row in dt.iterrows():
    if dt['Neighborhood'].iloc[idx]=="Not assigned":
        dt.loc[idx,'Neighborhood']=dt.loc[idx,'Borough']

### Return Toronto Postal Codes Dataframe combining together Neighborhood with same postal code

In [426]:
dt = dt.groupby(['PostalCode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
dt.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek , Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Woburn
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [427]:
dt.shape

(103, 3)

# 2. Add the Latitude and Longitude to Toronto Postal Codes Dataframe

### Load geospatial csv , Geocoder Python package was not working 

In [386]:
Toronto_Lat_Lon = pd.read_csv('http://cocl.us/Geospatial_data')
dt['Latitude'] = ''
dt['Longitude'] = ''

In [387]:
!wget -q -O 'toronto.json' https://cocl.us/Geospatial_data
print('Data downloaded!')

/bin/sh: wget: command not found
Data downloaded!


### Function returning Latitude based on postal code 

In [388]:
def Geosp_Lat(Coord):
    for idx, row in Toronto_Lat_Lon.iterrows():
        if Toronto_Lat_Lon['Postal Code'].iloc[idx]==Coord:
            Lat=Toronto_Lat_Lon.loc[idx,'Latitude']
            return Lat;

### Function returning Longitude based on postal code 

In [389]:

def Geosp_Lon(Coord):
    for idx, row in Toronto_Lat_Lon.iterrows():
        if Toronto_Lat_Lon['Postal Code'].iloc[idx]==Coord:
            Lon=Toronto_Lat_Lon.loc[idx,'Longitude']
            return Lon;

### Assign Latitude and Longitude to each postal code using previous functions

In [390]:
for idx, row in dt.iterrows():
    PC_toSearch=(dt['PostalCode'].iloc[idx])
    dt.loc[idx,'Latitude']=Geosp_Lat(PC_toSearch)
    dt.loc[idx,'Longitude']=Geosp_Lon(PC_toSearch)

In [391]:
dt.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek , Rouge Hill, Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Woburn,43.7731,-79.2395


# 3.Build a Map of Toronto with markers

### Return latitude, longitude of Toronto

In [392]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Ontario are 43.653963, -79.387207.


In [256]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, PostalCode,borough in zip(dt['Latitude'], dt['Longitude'], dt['PostalCode'], dt['Borough']):
    label = '{}, {}'.format(PostalCode, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# 4.Build a Map of Downtown Toronto with markers 

### Build Downtown Toronto dataframe

In [395]:
Downtown_Toronto_data=dt[dt['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
Downtown_Toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.6796,-79.3775
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.668,-79.3677
2,M4Y,Downtown Toronto,Church and Wellesley,43.6659,-79.3832
3,M5A,Downtown Toronto,"Harbourfront , Regent Park",43.6543,-79.3606
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789
5,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754
6,M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733
7,M5G,Downtown Toronto,Central Bay Street,43.658,-79.3874
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.6506,-79.3846
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Stat...",43.6408,-79.3818


### Return latitude, longitude of Downtown Toronto

In [396]:
address = 'Downtown Toronto, Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.3808116451341.


### Create map of Downtown Toronto with markers using latitude and longitude values

In [None]:
map_DowntownToronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, PostalCode,borough in zip(Downtown_Toronto_data['Latitude'], Downtown_Toronto_data['Longitude'], Downtown_Toronto_data['PostalCode'], Downtown_Toronto_data['Borough']):
    label = '{}, {}'.format(PostalCode, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_DowntownToronto)  
    
map_DowntownToronto

# 5.Explore Downtown Toronto using foursquare

### Foursquare user credentials

In [526]:
CLIENT_ID = '5DF0SUHJHNTIYJQZRRDQTPFZBF3L4MG1BPX14MUTWOGECNSQ' # your Foursquare ID
CLIENT_SECRET = 'BHAFZLAHSXXXUPUQJSY0OJISSCJOULN2KHHG41ORN0POBOUV' # your Foursquare Secret
VERSION = '20190104'
LIMIT = 100
radius=400

### Function returning venues from Foursquare 

In [527]:
 def getNearbyVenues(names, latitudes, longitudes, radius=300):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Building Downtown Toronto venues dataframe using previous function

In [528]:
DowntownToronto_venues = getNearbyVenues(names=Downtown_Toronto_data['PostalCode'],
                                   latitudes=Downtown_Toronto_data['Latitude'],
                                   longitudes=Downtown_Toronto_data['Longitude']
                                  )

In [529]:
DowntownToronto_venues.head()

Unnamed: 0,PostalCode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4W,43.679563,-79.377529,Park Drive Reservation Lands,43.679822,-79.377787,Park
1,M4W,43.679563,-79.377529,Mooredale House,43.678631,-79.380091,Building
2,M4X,43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner
3,M4X,43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
4,M4X,43.667967,-79.367675,F'Amelia,43.667536,-79.368613,Italian Restaurant


### Top 10 Downtown Toronto (by postal code) venues dataframe 

In [518]:
# one hot encoding
DowntownToronto_onehot = pd.get_dummies(DowntownToronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add pc column back to dataframe
DowntownToronto_onehot['PostalCode'] = DowntownToronto_venues['PostalCode'] 

# move pc column to the first column
fixed_columns = [DowntownToronto_onehot.columns[-1]] + list(DowntownToronto_onehot.columns[:-1])
DowntownToronto_onehot = DowntownToronto_onehot[fixed_columns]


In [519]:
DowntownToronto_grouped = DowntownToronto_onehot.groupby('PostalCode').mean().reset_index()

In [520]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [521]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
PostalCode_venues_sorted = pd.DataFrame(columns=columns)
PostalCode_venues_sorted['PostalCode'] = DowntownToronto_grouped['PostalCode']

for ind in np.arange(DowntownToronto_grouped.shape[0]):
    PostalCode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(DowntownToronto_grouped.iloc[ind, :], num_top_venues)

In [522]:
PostalCode_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Park,Building,Yoga Studio,Donut Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Ethiopian Restaurant,Dumpling Restaurant
1,M4X,Pizza Place,Coffee Shop,Restaurant,Café,Outdoor Sculpture,Japanese Restaurant,Butcher,Sandwich Place,Chinese Restaurant,Liquor Store
2,M4Y,Gay Bar,Burger Joint,Japanese Restaurant,Coffee Shop,Nightclub,Middle Eastern Restaurant,Mexican Restaurant,Park,Men's Store,General Entertainment
3,M5A,Gym / Fitness Center,Food Truck,History Museum,Breakfast Spot,Spa,Coffee Shop,Bus Stop,Theater,Furniture / Home Store,Bakery
4,M5B,Coffee Shop,Café,Middle Eastern Restaurant,Clothing Store,Hotel,Sandwich Place,Movie Theater,Tea Room,Burrito Place,Restaurant


# 6.Run k-means to cluster Downtown Toronto Postal Codes into 5 clusters.

In [523]:
# set number of clusters
kclusters = 5

DowntownToronto_grouped_clustering = DowntownToronto_grouped.drop('PostalCode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(DowntownToronto_grouped_clustering)


In [524]:
# add clustering labels
PostalCode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

DowntownToronto_merged = Downtown_Toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each pc
DowntownToronto_merged = DowntownToronto_merged.join(PostalCode_venues_sorted.set_index('PostalCode'), on='PostalCode')

DowntownToronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.6796,-79.3775,2,Park,Building,Yoga Studio,Donut Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Ethiopian Restaurant,Dumpling Restaurant
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.668,-79.3677,1,Pizza Place,Coffee Shop,Restaurant,Café,Outdoor Sculpture,Japanese Restaurant,Butcher,Sandwich Place,Chinese Restaurant,Liquor Store
2,M4Y,Downtown Toronto,Church and Wellesley,43.6659,-79.3832,1,Gay Bar,Burger Joint,Japanese Restaurant,Coffee Shop,Nightclub,Middle Eastern Restaurant,Mexican Restaurant,Park,Men's Store,General Entertainment
3,M5A,Downtown Toronto,"Harbourfront , Regent Park",43.6543,-79.3606,1,Gym / Fitness Center,Food Truck,History Museum,Breakfast Spot,Spa,Coffee Shop,Bus Stop,Theater,Furniture / Home Store,Bakery
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789,1,Coffee Shop,Café,Middle Eastern Restaurant,Clothing Store,Hotel,Sandwich Place,Movie Theater,Tea Room,Burrito Place,Restaurant


In [525]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(DowntownToronto_merged['Latitude'], DowntownToronto_merged['Longitude'], DowntownToronto_merged['PostalCode'], DowntownToronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# 7.Analyse Clusters


### Cluster1 venue category shows evidence of residential area

In [530]:
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 0, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Downtown Toronto,0,Grocery Store,Café,Gym / Fitness Center,Coffee Shop,Candy Store,Yoga Studio,Donut Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


###  The selective venue category for Cluster2 is Eateries

In [531]:
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 1, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,1,Pizza Place,Coffee Shop,Restaurant,Café,Outdoor Sculpture,Japanese Restaurant,Butcher,Sandwich Place,Chinese Restaurant,Liquor Store
2,Downtown Toronto,1,Gay Bar,Burger Joint,Japanese Restaurant,Coffee Shop,Nightclub,Middle Eastern Restaurant,Mexican Restaurant,Park,Men's Store,General Entertainment
3,Downtown Toronto,1,Gym / Fitness Center,Food Truck,History Museum,Breakfast Spot,Spa,Coffee Shop,Bus Stop,Theater,Furniture / Home Store,Bakery
4,Downtown Toronto,1,Coffee Shop,Café,Middle Eastern Restaurant,Clothing Store,Hotel,Sandwich Place,Movie Theater,Tea Room,Burrito Place,Restaurant
5,Downtown Toronto,1,Coffee Shop,Italian Restaurant,Gastropub,Restaurant,BBQ Joint,Japanese Restaurant,Hotel,Yoga Studio,Pizza Place,Performing Arts Venue
7,Downtown Toronto,1,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Deli / Bodega,Spa,Bookstore,Modern European Restaurant,Thai Restaurant,Smoothie Shop
8,Downtown Toronto,1,Steakhouse,Asian Restaurant,American Restaurant,Coffee Shop,Japanese Restaurant,Seafood Restaurant,Bar,Thai Restaurant,Café,Pizza Place
9,Downtown Toronto,1,Coffee Shop,Café,Park,Hotel,Liquor Store,Sandwich Place,Restaurant,Boat or Ferry,Sports Bar,Italian Restaurant
10,Downtown Toronto,1,Coffee Shop,Deli / Bodega,Restaurant,Bakery,Café,American Restaurant,Thai Restaurant,Salad Place,Gluten-free Restaurant,Chinese Restaurant
11,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Deli / Bodega,Hotel,Gastropub,Salad Place,Burger Joint,Steakhouse,Bakery


###  The selective venue category for Cluster3 is Parks

In [532]:
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 2, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,2,Park,Building,Yoga Studio,Donut Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Ethiopian Restaurant,Dumpling Restaurant


###  The selective venue category for Cluster4 is Airport

In [533]:
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 3, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,3,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,Yoga Studio,Dumpling Restaurant,Food Court,Flower Shop,Fish Market,Fish & Chips Shop


###  The selective venue category for Cluster5 is Culture

In [534]:
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 4, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Downtown Toronto,4,Concert Hall,Liquor Store,Steakhouse,Pub,Italian Restaurant,Belgian Restaurant,Beer Bar,Dumpling Restaurant,Fish Market,Fish & Chips Shop
