In [None]:
#!conda install -c conda-forge geocoder
#!conda install -c conda-forge folium=0.5.0 --yes

In [10]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import geocoder
import folium
import numpy as np

In [2]:
# Scrape the list of Canada postal codes (Toronto, prefix "M") using BeautifulSoup.

# Use a timeout and raise on HTTP errors so a hung connection or an error page
# fails loudly here instead of being parsed as if it were the real article.
response = requests.get(
    'http://zims-en.kiwix.campusafrica.gos.orange.com/wikipedia_en_all_nopic/A/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# The document is HTML, so parse it with an HTML parser (the 'xml' parser
# expects well-formed XML and additionally requires lxml to be installed).
can_html = BeautifulSoup(source, 'html.parser')
can_html = can_html.find('table')
if can_html is None:
    raise ValueError('No <table> element found in the downloaded page')

# Three columns of the table: PostalCode, Borough, and Neighborhood
col_names = ['Postalcode', 'Borough', 'Neighborhood']

# Collect every table row that has exactly one cell per expected column
# (header rows use <th>, so they yield no <td> cells and are skipped), then
# build the DataFrame once instead of growing it row by row.
rows = []
for tr_cell in can_html.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(col_names):
        rows.append(row_data)

can_df = pd.DataFrame(rows, columns=col_names)

can_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Cleansing the data from not assigned cells

# Drop NA rows and rows whose `Borough` is `Not assigned`.
# The trailing .copy() makes `df` an independent frame, so the assignments
# below modify `df` itself and never touch `can_df`.
df = can_df.dropna()
df = df[df['Borough'] != 'Not assigned'].copy()

# If a row has a borough but a `Not assigned` neighborhood,
# then the neighborhood will be the same as the borough.
# A single .loc assignment is used instead of chained indexing
# (df[col][mask] = ...), which may write to a temporary and be lost.
not_assigned_ids = df['Neighborhood'] == 'Not assigned'
df.loc[not_assigned_ids, 'Neighborhood'] = df.loc[not_assigned_ids, 'Borough']

# Replace ' /' by ',' (literal match; no regex semantics are intended)
df['Neighborhood'] = df['Neighborhood'].str.replace(' /', ',', regex=False)
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [4]:
# Merge cells having the same postal code
temp = df.groupby('Postalcode')['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
temp = temp.reset_index(drop=False)
temp.rename(columns={'Neighborhood':'Neighborhood_joined'}, inplace=True)
df_merge = pd.merge(df, temp, on='Postalcode')
df_merge.drop(['Neighborhood'], axis=1, inplace=True)
df_merge.drop_duplicates(inplace=True)
df_merge.rename(columns={'Neighborhood_joined':'Neighborhood'}, inplace=True)
df_merge.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Queen's Park,Queen's Park


In [5]:
# Display the (rows, columns) shape of the merged frame as a quick size check.
df_merge.shape

(103, 3)

In [6]:
# Visualization: plot every postal-code area whose borough name contains
# "Toronto" as a circle marker on a folium map.

# Load the postal code -> latitude/longitude table and align its key column
# name with `df_merge` so the two frames can be joined.
geo_df = pd.read_csv('https://cocl.us/Geospatial_data').rename(
    columns={'Postal Code': 'Postalcode'}
)
geo_merge = pd.merge(geo_df, df_merge, on='Postalcode')

# Keep only boroughs containing "Toronto". na=False treats a missing borough
# as "no match" instead of producing an invalid boolean mask. reset_index
# returns a fresh frame, so nothing is mutated in place on a filtered slice.
toronto_df = geo_merge[
    geo_merge['Borough'].str.contains("Toronto", na=False)
].reset_index(drop=True)

# Create Toronto map
map_toronto = folium.Map(location=[43.65, -79.38], zoom_start=10)

# Add one marker per postal-code area, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Borough'],
    toronto_df['Neighborhood'],
):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text inside the popup.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

In [7]:
#Using KMeans clustering

from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [8]:
# Number of clusters to fit.
k = 5

# Feature matrix: everything except the identifying text columns.
# - `columns=` is used because passing the axis positionally (`drop(cols, 1)`)
#   is rejected by current pandas.
# - 'Cluster Labels' is also excluded (errors='ignore' makes that a no-op on
#   the first run) so re-running this cell never clusters on its own output.
toronto_clustering = toronto_df.drop(
    columns=['Postalcode', 'Borough', 'Neighborhood', 'Cluster Labels'],
    errors='ignore',
)
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_clustering)

# Put the labels in the first column. Any label column left over from a
# previous run is dropped first, because DataFrame.insert raises ValueError
# when the column already exists.
toronto_df = toronto_df.drop(columns='Cluster Labels', errors='ignore')
toronto_df.insert(0, 'Cluster Labels', kmeans.labels_)

In [11]:
# Create the map on which the clustered postal-code areas are drawn.
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# Colour scheme: k evenly spaced colours from the rainbow colormap, as hex
# strings, so that rainbow[i] is the colour of cluster label i.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per postal-code area, coloured by its cluster label.
for lat, lon, cluster in zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Cluster Labels'],
):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels are 0-based (0 .. k-1), so they index `rainbow` directly.
    # Indexing with cluster-1 would send label 0 to rainbow[-1] and only work
    # through negative-index wrap-around.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters