# Part I: Web Scraping

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_postal_codes_of_Canada:_M'
)

In [3]:
# Download the raw HTML bytes of the postal-code page.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() makes an HTTP error fail here instead of letting an
# error page be parsed as if it were the real table.
response = requests.get(url, timeout=30)
response.raise_for_status()
pg = response.content

In [4]:
# Parse the downloaded HTML. The parser is named explicitly: when it is
# omitted, bs4 picks whichever parser happens to be installed and emits a
# warning, so the parse tree can differ from one machine to another.
soup = BeautifulSoup(pg, 'html.parser')

In [5]:
# Header cells (<th>) of the first row of the first table on the page.
header_cells = soup.table.find_all("tr")[0].find_all("th")

# Number of columns in the table.
n = len(header_cells)

# Column names: the first child node of each header cell, without trailing newlines.
ColList = [cell.contents[0].rstrip('\n') for cell in header_cells]

In [6]:
# Start from an empty frame whose columns are the scraped header names.
df = pd.DataFrame(columns=ColList)

In [7]:
# Look the table rows up once; the original re-ran find_all("tr") for every
# single cell, which re-walked the whole table each time.
table_rows = soup.table.find_all("tr")
k = len(table_rows)

records = []
for row in table_rows[1:]:  # row 0 holds the column headers
    cells = row.find_all("td")
    # First child node of each of the first n data cells, minus trailing newlines.
    Ob = [cells[j].contents[0].rstrip('\n') for j in range(n)]

    # Rows whose second column (borough) is unassigned are dropped entirely.
    if Ob[1] == 'Not assigned':
        continue
    # An unassigned third column (neighbourhood) falls back to the borough name.
    if Ob[2] == 'Not assigned':
        Ob[2] = Ob[1]

    records.append({ColList[0]: Ob[0], ColList[1]: Ob[1], ColList[2]: Ob[2]})

# Build the frame in one step: DataFrame.append was removed in pandas 2.0, and
# growing a frame one row at a time copies it on every iteration.
df = pd.DataFrame(records, columns=ColList)

In [8]:
# Display the scraped table (long frames are truncated in the rendered output).
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [9]:
# (number of rows, number of columns) of the scraped table.
df.shape

(103, 3)

# Part II: Including Geospatial Coordinates

In [10]:
# Load the per-postal-code coordinates from a CSV file in the working directory.
# NOTE(review): the name `gc` is also the name of a standard-library module; it is
# kept because a later cell reads this variable.
gc = pd.read_csv('Geospatial_Coordinates.csv')
gc

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [11]:
# Columns contributed by the coordinates table (everything except the join key).
coordinate_columns = [col for col in gc.columns if col != 'Postal Code']

# Left-join the coordinates onto the scraped table by postal code.
# Any coordinate columns left over from a previous run of this cell are dropped
# first; without that, re-running the cell fails because the joined columns
# already exist in `df`. On a first run the drop is a no-op.
df = (
    df.drop(columns=coordinate_columns, errors='ignore')
      .join(gc.set_index('Postal Code'), on='Postal Code')
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Part III: Clustering

Summary of observations: Two main separated regions were both classified into "Cluster 0" (the red markers).

In [12]:
from sklearn.cluster import KMeans

In [13]:
# Number of clusters requested from k-means.
kclusters = 5

# One indicator column per borough, named after the borough itself (no prefix).
borough_dummies = pd.get_dummies(df[['Borough']], prefix="", prefix_sep="")

# Attach the neighbourhood name so the indicators can be aggregated per neighbourhood.
df_onehot = borough_dummies.assign(Neighbourhood=df['Neighbourhood'])

# Mean of each borough indicator within every neighbourhood.
df_grouped = df_onehot.groupby('Neighbourhood').mean().reset_index()

In [14]:
# Feature matrix for k-means: the grouped frame without its name column.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas 2.0 no longer accepts.
# A later cell inserts a 'Cluster Labels' column into df_grouped; dropping it
# here when present keeps a re-run of this cell from feeding old labels back
# into the clustering.
df_grouped_clustering = (
    df_grouped
    .drop(columns=['Neighbourhood'])
    .drop(columns=['Cluster Labels'], errors='ignore')
)

In [15]:
# Fit k-means on the per-neighbourhood borough indicators.
# n_init is pinned to 10 (the long-standing default) because newer scikit-learn
# releases changed the default to 'auto'; pinning it keeps the labels
# reproducible across versions and avoids the transition warning.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_grouped_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([1, 0, 3, 3, 3, 4, 1, 0, 0, 4])

In [16]:
# Store each neighbourhood's cluster label as the first column of df_grouped.
# DataFrame.insert raises ValueError when the column already exists, so a
# re-run of this cell overwrites the existing column instead.
if 'Cluster Labels' in df_grouped.columns:
    df_grouped['Cluster Labels'] = kmeans.labels_
else:
    df_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

# Left-join the labels (and the grouped indicator columns) onto the
# postal-code table by neighbourhood name. join() returns a new frame, so `df`
# itself is left untouched.
df_copy = df.join(df_grouped.set_index("Neighbourhood"), on='Neighbourhood')

In [17]:
import folium
# Coordinates used as the centre of the folium map.
latitude, longitude = 43.651070, -79.347015

In [18]:
# Base map centred on the configured latitude/longitude.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per cluster, evenly spaced along matplotlib's rainbow colormap.
# (The previous x/ys arithmetic was only ever used for its length, kclusters.)
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# Add one circle marker per row of df_copy, coloured by its cluster label.
for lat, lon, poi, cluster in zip(df_copy['Latitude'], df_copy['Longitude'], df_copy['Neighbourhood'], df_copy['Cluster Labels']):
    # k-means labels start at 0, so `cluster - 1` sends label 0 to the last
    # palette entry (index -1) and label c to entry c - 1. Every cluster still
    # gets its own colour; the mapping is kept as-is so the rendered colours
    # stay the same as before.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters