<h1 style="text-align:center">Part 1: Web Scrapping and Data Cleaning</h1>

In [10]:
import selenium


from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
driver.get(url)
driver.page_source


Looking for [chromedriver 83.0.4103.39 win32] driver in cache 
File found in cache by path [C:\Users\Kevin\.wdm\drivers\chromedriver\83.0.4103.39\win32\chromedriver.exe]




In [97]:
wikiPage = driver.page_source

from bs4 import BeautifulSoup

soup = BeautifulSoup(wikiPage)

#print(soup.find("tbody"))
#print("--------------------")
#print(soup.prettify())

table = soup.findChildren("tbody")[0]
rows = table.findChildren(["tr"])

dictToronto = {"Postal Code":[], "Borough":[], "Neighborhood":[]}
columnNames = ["Postal Code", "Borough", "Neighborhood"]
for i in range(len(rows)):
    
    cells = rows[i].findChildren(["td"])
    
    for j in range(len(cells)):
        dictToronto[columnNames[j]].append(cells[j].next_element)
    
import pandas as pd

dfToronto = pd.DataFrame.from_dict(dictToronto)

In [117]:
"""Data Cleaning"""

# First we erase trailing spaces
dfToronto = dfToronto.apply(lambda x: x.str.strip())

In [162]:
import numpy as np

# We drop rows that have no Borough assigned
dfToronto = dfToronto.replace("Not assigned", np.nan).dropna(axis = 0, subset=["Borough"])

# Reset index
dfToronto.reset_index(drop=True, inplace = True)

# We check if there is any Neighborhood with value not assigned
print("Neighborhoods with no assigned value: ", (dfToronto["Neighborhood"].isnull()).sum())

Neighborhoods with no assigned value:  0


In [163]:
# We print the shape
print("Shape of the final DataFrame: ", dfToronto.shape)
dfToronto

Shape of the final DataFrame:  (103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


<h1 style="text-align: center">Part 2: Latitudes and Longitudes</h1>

In [238]:
# Geocoder didn't work with any address 21/June/20 13:20 GMT-5

df_latlng = pd.read_csv("Geospatial_Coordinates.csv")

dfToronto_latlng = pd.merge(dfToronto,df_latlng, how="inner", on ="Postal Code")
dfToronto_latlng

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


<h1 style="text-align:center">Part 3: Cluster Analysis</h1>

<h3>Map of Toronto divided by Postal Codes</h3>

In [241]:
# We first get latitude and longitude of Ontario
from geopy import geocoders

geolocator = geocoders.Nominatim(user_agent="Ontario")
location = geolocator.geocode("Toronto, Ontario")
latitude = location.latitude
longitude = location.longitude

In [261]:
import folium

map_toronto = folium.Map(location=[latitude,longitude], zoom_start=10)

for lat,lng,post,borough in zip(dfToronto_latlng["Latitude"],\
                                 dfToronto_latlng["Longitude"],\
                                 dfToronto_latlng["Postal Code"],\
                                 dfToronto_latlng["Borough"]):
    label = "{}: {}".format(post, borough)
    label = folium.Popup(label, parse_html=True)
    
    folium.CircleMarker(
    [lat,lng],
    radius=9,
    popup=label,
    color="blue",
    fill=True,
    fill_color="#3186cc",
    fill_opacity=0.5,
    parse_html=False).add_to(map_toronto)

map_toronto

<h3>Getting Nearby Venues of each Postal Code (within 700 m) using FourSquare API</h3>

In [274]:
CLIENT_ID = 'E10QK2PDE5PDM0RA0GKLMAB1WN2UNEV04ZQ2WD22CVQLXYLA' 
CLIENT_SECRET = 'BJAFBHWMB13LZAPNKWSD3XQT4RMS1YW14MJTF3D4WQYEIRWW' 
VERSION = '20180605'
LIMIT = 100

import requests
def getNearbyVenues(postalCode, latitudes, longitudes, radius = 700):
    
    venues_list = []
    
    for postal,lat,lng in zip(postalCode, latitudes,longitudes):
        print(postal)
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except:
            print("Error with: ", postal)
            continue
    
        # We append only useful information
        venues_list.append([(
            postal, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby.columns = ['Postal Code', 
                  'Postal Code Latitude', 
                  'Postal Code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby)
        
nearbyToronto = getNearbyVenues(postalCode = dfToronto_latlng["Postal Code"],
                              latitudes = dfToronto_latlng["Latitude"],
                              longitudes = dfToronto_latlng["Longitude"])

nearbyToronto

Unnamed: 0,Postal Code,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,M3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M3A,43.753259,-79.329656,Corrosion Service Company Limited,43.752432,-79.334661,Construction & Landscaping
3,M3A,43.753259,-79.329656,Three Valleys Park,43.751195,-79.337356,Park
4,M4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
...,...,...,...,...,...,...,...
3408,M8Z,43.628841,-79.520999,Rocco's Plum Tomato,43.634898,-79.519951,Italian Restaurant
3409,M8Z,43.628841,-79.520999,Kingsway Boxing Club,43.627254,-79.526684,Gym
3410,M8Z,43.628841,-79.520999,Torque Barbell,43.632061,-79.525625,Gym / Fitness Center
3411,M8Z,43.628841,-79.520999,Buon Giorno Cafe,43.622801,-79.519322,Italian Restaurant


<h3>Grouping by venue category and one-hot encoding venue categories</h3>

In [290]:
print("There are {} unique categories for venues".format(len(nearbyToronto["Venue Category"].unique())))

# We get the one-hot encoding for Venue Categories
toronto_onehot = pd.get_dummies(nearbyToronto[["Venue Category"]], prefix="", prefix_sep="")

# We add the Postal Code column at the beginning
toronto_onehot["Postal Code"] = nearbyToronto["Postal Code"]

newColumns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[newColumns]

toronto_onehot.head()

There are 318 unique categories for venues


Unnamed: 0,Postal Code,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<h3>Grouping by postal code</h3>

In [310]:
toronto_grouped = toronto_onehot.groupby("Postal Code").mean().reset_index()

# For the analysis we will get only the top 5
num_top_venues = 5

indicators = ["..","st","nd","rd","dsf"]
addStr = "Most Common Venue"
columnNames = [i+"th "+addStr if int(i) > 3 else i + indicators[int(i)] + " " + addStr for i in list(map(str,range(1,num_top_venues+1)))]

mostCommon = pd.DataFrame(columns = ["Postal Code"] + columnNames)

for post in toronto_grouped["Postal Code"]:
    
    temp = toronto_grouped[toronto_grouped["Postal Code"] == post].T.reset_index().iloc[1:,:]
    
    # Sorting values and changing column names
    temp.columns = ["venue","freq"]
    temp = temp.sort_values(by="freq", ascending = False).head(num_top_venues)[["venue"]].T
    temp.columns = columnNames
    
    mostCommon = mostCommon.append(temp)
    
mostCommon["Postal Code"] = toronto_grouped["Postal Code"].values
ordering = list([mostCommon.columns[-1]]) + list(mostCommon.columns[:-1])
mostCommon = mostCommon[ordering].reset_index(drop=True)
mostCommon

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Fast Food Restaurant,Coffee Shop,Hobby Shop,Bus Station,Spa
1,M1C,Breakfast Spot,Burger Joint,Bar,ATM,Music School
2,M1E,Park,Fast Food Restaurant,Mexican Restaurant,Thrift / Vintage Store,Rental Car Location
3,M1G,Coffee Shop,Park,Business Service,ATM,Music School
4,M1H,Indian Restaurant,Coffee Shop,Fried Chicken Joint,Hakka Restaurant,Thai Restaurant
...,...,...,...,...,...,...
96,M9N,Diner,Pharmacy,Fried Chicken Joint,Park,Modern European Restaurant
97,M9P,Pizza Place,Flea Market,Intersection,Supermarket,Sandwich Place
98,M9R,Sandwich Place,Coffee Shop,Bank,Mobile Phone Shop,Shopping Mall
99,M9V,Grocery Store,Sandwich Place,Pizza Place,Discount Store,Fast Food Restaurant


<h3>Clustering</h3>

In [315]:
kclusters = 5
toronto_cluster = toronto_grouped.drop("Postal Code",1)

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=kclusters, random_state = 0).fit(toronto_cluster)

print("Labels are: ", kmeans.labels_)

Labels are:  [2 0 2 3 2 1 2 2 2 0 2 2 2 2 2 2 2 2 2 3 2 3 2 3 2 2 2 2 2 2 2 2 2 2 0 2 2
 2 2 2 2 2 0 2 2 2 0 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 1 2 2 2 0 4 2 2 1 0 1 2 2 2]


In [334]:
# We add the cluster number to each postal code
#mostCommon.insert(0,"Cluster Labels", kmeans.labels_)     QUITARTE ESTOE COM ENUNADFOSDF

mergedToronto = pd.merge(mostCommon, dfToronto_latlng, on="Postal Code", how="outer")
mergedToronto.dropna(subset=["Cluster Labels"], inplace=True) # We drop rows with NaN values in Cluster

In [337]:
# Finally we create the map
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clustered = folium.Map(location=[latitude,longitude], zoom_start = 10)

# Setting a color scheme
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))

rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []

for lat,lon,postal,cluster in zip(mergedToronto["Latitude"],
                                  mergedToronto["Longitude"],
                                  mergedToronto["Postal Code"],
                                  mergedToronto["Cluster Labels"]):
    
    label = folium.Popup(str(postal) + " Cluster " + str(cluster),parse_html=True)
    folium.CircleMarker([lat,lon],
                       radius=9,
                       popup=label,
                       color=rainbow[int(cluster-1)],
                       fill=True,
                       fill_color=rainbow[int(cluster-1)],
                       fill_opacity=0.5).add_to(map_clustered)

map_clustered

In [338]:
mergedToronto.sort_values(by="Cluster Labels")

Unnamed: 0,Cluster Labels,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Borough,Neighborhood,Latitude,Longitude
72,0.0,M6E,Park,Gym,Women's Store,Bakery,Sporting Goods Shop,York,Caledonia-Fairbanks,43.689026,-79.453512
1,0.0,M1C,Breakfast Spot,Burger Joint,Bar,ATM,Music School,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
34,0.0,M4C,Skating Rink,Park,Athletics & Sports,Beer Store,Pharmacy,East York,Woodbine Heights,43.695344,-79.318389
77,0.0,M6L,Home Service,Business Service,Construction & Landscaping,Bakery,Park,North York,"North Park, Maple Leaf Park, Upwood Park",43.713756,-79.490074
96,0.0,M9N,Diner,Pharmacy,Fried Chicken Joint,Park,Modern European Restaurant,York,Weston,43.706876,-79.518188
62,0.0,M5P,Trail,Park,Jewelry Store,Sushi Restaurant,Gym / Fitness Center,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
42,0.0,M4N,Business Service,Park,Bus Line,Swim School,Electronics Store,Central Toronto,Lawrence Park,43.72802,-79.38879
46,0.0,M4T,Park,Playground,Tennis Court,Gym,Grocery Store,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
91,0.0,M9A,Pharmacy,Grocery Store,Playground,Shopping Mall,Bank,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
9,0.0,M1N,College Stadium,Park,General Entertainment,Diner,Skating Rink,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
