In [257]:
# Packages required for parsing xml and base manipulations
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Packages required for mapping
import folium as fol
from geopy.geocoders import Nominatim

# Uses BeautifulSoup to scrape the url variable and separate the object 'table'
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an immediate, explicit
# failure instead of letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.text

# The page is an HTML document, so it is parsed with an HTML parser
# (the 'xml' feature is meant for XML documents).
soup = BeautifulSoup(s, 'html.parser')

# 'table' is the first <table> element of the page; fail here with a clear
# message rather than later with an AttributeError on None.
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found at {}".format(url))

# Column layout shared by the parsed rows and the dataframe 'data'
base_columns = ['PostalCode', 'Borough', 'Neighborhood']

# Parses the object 'table' into a list of rows, keeping only the <tr> elements
# that contain exactly one <td> cell per expected column
rows = []
for tr in table.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    if len(cells) == len(base_columns):
        rows.append(cells)

# Builds the dataframe in one go instead of appending to it row by row
data = pd.DataFrame(rows, columns=base_columns)

# Drops any rows where the Borough is 'Not assigned'
# (reset_index gives a fresh, independent frame that is safe to modify below)
data = data[data['Borough'] != 'Not assigned'].reset_index(drop=True)

# Replaces the Neighborhood name with the Borough when Neighborhood is 'Not assigned'.
# This has to happen on 'data' and before the grouping below: the previous loop ran
# over 'df', which was still empty at this point, so nothing was ever replaced.
unnamed = data['Neighborhood'] == 'Not assigned'
data.loc[unnamed, 'Neighborhood'] = data.loc[unnamed, 'Borough']

# Collapses Neighborhoods per PostalCode down to the single record, joining the
# names of each (PostalCode, Borough) group into one 'Neighborhoods' string
df = (
    data.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(',  '.join)
    .reset_index()
    .rename(columns={'Neighborhood': 'Neighborhoods'})
)

# Loads the per-postal-code coordinates into 'geo' and renames its key column
# so that it matches the 'PostalCode' column used by 'df' and 'data'
geo = pd.read_csv('Geospatial_Coordinates.csv').rename(columns={'Postal Code': 'PostalCode'})

# Attaches the columns of 'geo' to both the per-neighborhood frame 'data' and the
# collapsed frame 'df' (default inner join on 'PostalCode')
data = data.merge(geo, on='PostalCode')
df = df.merge(geo, on='PostalCode')

# Geocodes the place the maps are centered on and keeps its coordinates
address = "Toronto, Canada"
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

# Returns the latitude and longitude of the base location
#print("The geographical coordinates of ",address," are: (",latitude,", ",longitude,")")

# Imports the required packages to cluster the neighborhoods
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Builds 'trs' (borough names) and 'trs_df' (one row per postal code with its
# coordinates). Both are used by this cell and the following ones but are not
# created in any cell shown in this notebook, so a fresh kernel failed here with
# a NameError.
# NOTE(review): "boroughs related to Toronto" is taken to mean boroughs whose name
# contains 'Toronto' -- confirm this matches the selection originally used.
trs = sorted(b for b in df['Borough'].unique() if 'Toronto' in b)
trs_df = df.loc[df['Borough'].isin(trs), ['PostalCode', 'Latitude', 'Longitude']].copy()

# Selects the features by column name rather than by position, and as floats, so
# that nan_to_num really operates on numbers and the selection does not depend on
# which other columns 'trs_df' happens to carry
X = trs_df[['Latitude', 'Longitude']].values.astype(float)
X = np.nan_to_num(X)

# NOTE(review): the standardised copy is computed here, but the model below is
# fitted on the raw coordinates in X -- confirm which one is intended.
clus_dataset = StandardScaler().fit_transform(X)

# Clusters the Postal Codes into a number set by 'clus_num' and stores values in the 'labels object'.
# A fixed random_state makes the labels reproducible from one run to the next.
clus_num = 4
k_means = KMeans(init="k-means++", n_clusters=clus_num, n_init=10, random_state=0)
k_means.fit(X)
labels = k_means.labels_

# Adds the KMeans output of 'labels' to the dataframe 'trs_df' as 'ClusterNumber'
trs_df["ClusterNumber"] = labels

# Returns the center of each cluster
#trs_df.groupby("ClusterNumber").mean()

# Merges the K-Means clusters from the dataframe 'trs_df' to the dataframe 'data'.
# Only the key and the label are merged, so no duplicated Latitude/Longitude
# columns have to be dropped and renamed afterwards. The inner join also limits
# 'data' to the postal codes present in 'trs_df'.
data = pd.merge(data, trs_df[['PostalCode', 'ClusterNumber']], on='PostalCode')

# Reinitializes a folium map object called 'tor_map' centered around the location 'address'
tor_map = fol.Map(location=[latitude, longitude], zoom_start=12)

# Sets the foundation for plotting clusters: the distinct cluster numbers in
# order of first appearance, and the palette of ring colors
clusters = data['ClusterNumber'].unique().tolist()
cls = ['blue','green','red','yellow', 'purple', 'orange', 'black', 'pink', 'gray', 'beige', 'white']

# Plots each Cluster Number from the object 'clusters' to the folium object 'tor_map'
for y, i in enumerate(clusters):
    temp = data[data['ClusterNumber'] == i]

    # One ring color per cluster; the modulo wraps around the palette so that
    # more clusters than colors no longer raises an IndexError
    ring_color = cls[y % len(cls)]

    for lat, lng, cluster_num, postal_code, borough, neighborhood in zip(temp['Latitude'], temp['Longitude'], temp['ClusterNumber'], temp['PostalCode'], temp['Borough'], temp['Neighborhood']):
        # Popup text shown when a marker is clicked
        label = '{}, {}, {}, {}'.format(cluster_num, postal_code, borough, neighborhood)
        label = fol.Popup(label, parse_html=True)
        fol.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color=ring_color,
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(tor_map)

tor_map

### The cell above parses the data and loads it into the necessary dataframes:
<ul>
    <li>data - contains the parsed output from the base url variable</li>
    <li>df - the collapsed view containing postal codes with a list of neighborhoods</li>
    <li>geo - the file containing the longitude and latitude for postal codes</li>
    <li>trs_df - the dataframe containing just boroughs related to Toronto</li>
</ul>

### It then runs trs_df through the K-Means clustering model and stores the resulting cluster labels in a column called ClusterNumber. The model above uses 4 clusters, in an attempt to replicate the existing borough values.

### The next cell plots the postal codes on a folium map, but uses the true Borough values to color each ring, so that the result can be compared with the clusters produced by the model run above.

In [270]:
# Starts over with an empty folium map 'tor_map' centered on the location 'address'
tor_map = fol.Map(location=[latitude,longitude], zoom_start=12)

# Marker settings that are the same for every postal code
marker_style = dict(
    radius=5,
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Draws the postal codes borough by borough; 'y' walks through the palette 'cls'
# so that each borough in 'trs' gets its own ring color
y=0

for i in trs:
    temp = data[data['Borough'] == i]
    columns_to_plot = zip(temp['Latitude'], temp['Longitude'], temp['PostalCode'], temp['Borough'], temp['Neighborhood'])

    for lat, lng, postal_code, borough, neighborhood in columns_to_plot:
        # Popup text shown when a marker is clicked
        text = '{}, {}, {}'.format(postal_code, borough, neighborhood)
        label = fol.Popup(text, parse_html=True)
        marker = fol.CircleMarker([lat, lng], popup=label, color=cls[y], **marker_style)
        marker.add_to(tor_map)

    y += 1

tor_map

### The next cell allows you to explore how changing the number of clusters impacts the map layer.

### I feel a good balance is reached when the variable 'clus_num' is set to 6. This shows distinct groups of boroughs, and geographically speaking, there may be distinct cultures between them (e.g. near the heart of downtown, right off the highway, a little removed up north, etc.)

In [271]:
# Clusters the Postal Codes into a number set by 'clus_num' and stores values in the 'labels object'.
# 'X' is the coordinate matrix built in the first cell. A fixed random_state
# makes the labels reproducible from one run to the next.
clus_num = 6
k_means = KMeans(init="k-means++", n_clusters=clus_num, n_init=10, random_state=0)
k_means.fit(X)
labels = k_means.labels_

# Adds the KMeans output of 'labels' to the dataframe 'trs_df' as 'ClusterNumber'
trs_df["ClusterNumber"] = labels

# Returns the center of each cluster
#trs_df.groupby("ClusterNumber").mean()

# Merges the K-Means clusters from the dataframe 'trs_df' to the dataframe 'data'.
# Any 'ClusterNumber' left in 'data' by a previous run is discarded first and only
# the key and the new label are merged, so the cell gives the same columns whether
# or not 'data' was clustered before and needs no _x/_y suffix clean-up.
data = pd.merge(
    data.drop(columns=['ClusterNumber'], errors='ignore'),
    trs_df[['PostalCode', 'ClusterNumber']],
    on='PostalCode',
)

# Reinitializes a folium map object called 'tor_map' centered around the location 'address'
tor_map = fol.Map(location=[latitude, longitude], zoom_start=12)

# Plots each Cluster Number from the object 'clusters' to the folium object 'tor_map'
clusters = data['ClusterNumber'].unique().tolist()
for y, i in enumerate(clusters):
    temp = data[data['ClusterNumber'] == i]

    # One ring color per cluster, taken from the palette 'cls' defined in the first
    # cell; the modulo wraps around the palette so that choosing more clusters
    # than there are colors no longer raises an IndexError
    ring_color = cls[y % len(cls)]

    for lat, lng, cluster_num, postal_code, borough, neighborhood in zip(temp['Latitude'], temp['Longitude'], temp['ClusterNumber'], temp['PostalCode'], temp['Borough'], temp['Neighborhood']):
        # Popup text shown when a marker is clicked
        label = '{}, {}, {}, {}'.format(cluster_num, postal_code, borough, neighborhood)
        label = fol.Popup(label, parse_html=True)
        fol.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color=ring_color,
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(tor_map)

tor_map