In [1]:
#Import pandas
import pandas as pd

In [2]:
#Wikipedia page to scrape: the list of Canadian postal codes starting with "M" (per the URL title)
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
#We need lxml - process html
!pip install lxml



In [4]:
#Parse the tables found on the page and keep the first one as our dataframe
#(the preview in the next cell shows it has the Postal Code / Borough / Neighbourhood columns)
wiki_df=pd.read_html(url)[0]

In [5]:
#Preview the first rows to confirm the table parsed as expected
wiki_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
#Remove, in place, every row whose borough is the "Not assigned" placeholder
unassigned_rows = wiki_df.index[wiki_df["Borough"] == "Not assigned"]
wiki_df.drop(index=unassigned_rows, inplace=True)

In [7]:
#Preview again: rows with a "Not assigned" borough should be gone
wiki_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
#If a row has a borough but a "Not assigned" neighbourhood, the neighbourhood becomes the borough name.
#A boolean mask with .loc writes straight into wiki_df and pairs each row with its own borough.
#(Series.replace does not take a Series as a row-aligned replacement value, and calling it with
#inplace=True on the column accessor is chained assignment, which may never reach the dataframe.)
missing_neighbourhood = wiki_df["Neighbourhood"] == "Not assigned"
wiki_df.loc[missing_neighbourhood, "Neighbourhood"] = wiki_df.loc[missing_neighbourhood, "Borough"]

In [9]:
#Renumber the rows 0..n-1 (the old labels have gaps after the drop), then preview
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [10]:
#URL of the CSV holding one latitude/longitude pair per postal code
#(columns are listed by geo_df.info() further down)
url_geo="http://cocl.us/Geospatial_data"

In [11]:
#Download the coordinates CSV into the geo dataframe
geo_df=pd.read_csv(url_geo)

In [12]:
#Inspect the geo data: column names, dtypes and non-null counts
geo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Postal Code  103 non-null    object 
 1   Latitude     103 non-null    float64
 2   Longitude    103 non-null    float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [13]:
#Put both tables in postal-code order and renumber each one 0..n-1.
#geo_df is renumbered too: pandas aligns on index labels, so without a fresh index the sort of
#geo_df would have no effect on any later index-aligned column assignment.
wiki_df = wiki_df.sort_values("Postal Code").reset_index(drop=True)
geo_df = geo_df.sort_values("Postal Code").reset_index(drop=True)

In [14]:
#Check the last rows of the sorted postal-code table
wiki_df.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
102,M9W,Etobicoke,"Northwest, West Humber - Clairville"


In [15]:
#Last rows of the geo data, to compare its postal codes with the table above
geo_df.tail()

Unnamed: 0,Postal Code,Latitude,Longitude
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437
102,M9W,43.706748,-79.594054


In [16]:
#Attach the coordinates by matching on the postal code itself instead of relying on both tables
#having the same rows in the same order; a postal code without a match gets NaN coordinates.
#validate="many_to_one" makes pandas raise if a postal code appears more than once in geo_df.
geo_coords = geo_df[["Postal Code", "Latitude", "Longitude"]]
wiki_df = (
    wiki_df
    .drop(columns=["Latitude", "Longitude"], errors="ignore")  #lets the cell be re-run safely
    .merge(geo_coords, on="Postal Code", how="left", validate="many_to_one")
)

In [17]:
#Display the combined dataframe (pandas truncates the middle rows in the rendered output)
wiki_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [18]:
# Base map for a first visual check, centred on the coordinates of row 0
import folium
map_center = [wiki_df.at[0, "Latitude"], wiki_df.at[0, "Longitude"]]
map_toronto = folium.Map(location=map_center, zoom_start=9)

In [19]:
#Render the base map (markers are added in the next cell)
map_toronto

In [20]:
#One blue circle per row; clicking a circle pops up "neighbourhood, borough"
marker_columns = ["Latitude", "Longitude", "Borough", "Neighbourhood"]
for lat, lng, borough, neighborhood in wiki_df[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup(f"{neighborhood}, {borough}", parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

In [21]:
#Render the map again, now with the markers on it
map_toronto

In [22]:
#Foursquare API credentials and the API version date.
#The id and secret are read from the environment so they are never stored in the notebook.
#Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
#NOTE(review): the key pair that used to be hardcoded here should be treated as leaked and rotated.
import os
import warnings

CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "")
VERSION = "20200807"

#Warn instead of raising so the rest of the notebook still runs without credentials
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        "Foursquare credentials not found: set FOURSQUARE_CLIENT_ID and "
        "FOURSQUARE_CLIENT_SECRET in the environment."
    )

In [23]:
#Spot-check a few neighbourhoods, then the coordinates and name stored in row 0
sample_rows = [0, 3, 7]
print(wiki_df["Neighbourhood"].loc[sample_rows])
print()
for field in ("Latitude", "Longitude", "Neighbourhood"):
    print(wiki_df.at[0, field])

0                     Malvern, Rouge
3                             Woburn
7    Golden Mile, Clairlea, Oakridge
Name: Neighbourhood, dtype: object

43.806686299999996
-79.19435340000001
Malvern, Rouge


In [24]:
##Clusters
#Import KMeans
from sklearn.cluster import KMeans
#Cluster on the coordinates only; naming the two columns keeps the feature set explicit
#even if more columns are later added to wiki_df
wiki_df_cluster = wiki_df[["Latitude", "Longitude"]].copy()
#KMeans with 4 clusters. n_init is pinned because its default differs between scikit-learn
#releases; together with random_state this keeps the labels reproducible across versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0).fit(wiki_df_cluster)

In [25]:
#Inspect the cluster label assigned to each row (one integer per row, in row order)
kmeans.labels_

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 0, 1, 1, 1, 3, 3, 3, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 0, 3, 0,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [26]:
#Put the cluster label of every row in a new first column
wiki_df_cluster.insert(loc=0, column="Cluster Labels", value=kmeans.labels_)


In [27]:
#Preview the clustering dataframe: labels followed by the coordinates
wiki_df_cluster.head(10)

Unnamed: 0,Cluster Labels,Latitude,Longitude
0,2,43.806686,-79.194353
1,2,43.784535,-79.160497
2,2,43.763573,-79.188711
3,2,43.770992,-79.216917
4,2,43.773136,-79.239476
5,2,43.744734,-79.239476
6,2,43.727929,-79.262029
7,2,43.711112,-79.284577
8,2,43.716316,-79.239476
9,2,43.692657,-79.264848


In [28]:
#Add the neighbourhood names to the clusters data (rows are matched on their index labels).
#A single assign is enough: a placeholder column would be overwritten immediately anyway.
wiki_df_cluster = wiki_df_cluster.assign(Neighbourhood=wiki_df["Neighbourhood"])

In [29]:
#Preview the clusters data, now including the neighbourhood names
wiki_df_cluster.head(10)

Unnamed: 0,Cluster Labels,Latitude,Longitude,Neighbourhood
0,2,43.806686,-79.194353,"Malvern, Rouge"
1,2,43.784535,-79.160497,"Rouge Hill, Port Union, Highland Creek"
2,2,43.763573,-79.188711,"Guildwood, Morningside, West Hill"
3,2,43.770992,-79.216917,Woburn
4,2,43.773136,-79.239476,Cedarbrae
5,2,43.744734,-79.239476,Scarborough Village
6,2,43.727929,-79.262029,"Kennedy Park, Ionview, East Birchmount Park"
7,2,43.711112,-79.284577,"Golden Mile, Clairlea, Oakridge"
8,2,43.716316,-79.239476,"Cliffside, Cliffcrest, Scarborough Village West"
9,2,43.692657,-79.264848,"Birch Cliff, Cliffside West"


In [30]:
#New map for clusters: the same points as before, coloured by their k-means cluster
#Importing libs for color/map/clusters
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

map_toronto_clusters = folium.Map(
    location=[wiki_df.Latitude[0], wiki_df.Longitude[0]],
    zoom_start=9,
)

#Color scheme for the clusters: one evenly spaced rainbow colour per cluster label.
#Labels run 0..k-1, so the palette is sized from the data and indexed directly by the label
#(no "-1" offset that would send label 0 to the last palette entry).
n_clusters = int(wiki_df_cluster["Cluster Labels"].max()) + 1
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, n_clusters))]

#New markers: the popup shows the neighbourhood name and its cluster number
for lat, lon, poi, cluster in zip(
    wiki_df_cluster["Latitude"],
    wiki_df_cluster["Longitude"],
    wiki_df_cluster["Neighbourhood"],
    wiki_df_cluster["Cluster Labels"],
):
    label = folium.Popup(f"{poi} Cluster {cluster}", parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
    ).add_to(map_toronto_clusters)

In [31]:
#Render the cluster-coloured map
map_toronto_clusters