In [118]:
#!conda install -y html5lib

In [119]:
import lxml
import html5lib
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scrape the Data Table
This section scrapes the data from Wikipedia.
Beautiful Soup is used to extract the table structure, which is then fed to Pandas to create the dataframe

In [120]:
# Download the Wikipedia page that lists the Canadian "M" postal codes.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout stops the cell from hanging forever on a stalled connection.
request = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as data.
request.raise_for_status()
soup = BeautifulSoup(request.content, 'html.parser')
tables = soup.find_all('table')
if not tables:
    raise ValueError("No <table> elements found at " + url)
# pd.read_html returns a *list* of DataFrames; later cells use df[0].
df = pd.read_html(str(tables[0]), header=0)


In [121]:
# Preview the first five rows of the scraped table (df is a list of frames).
df[0].head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Clean the data
- Ignore rows whose Borough is "Not assigned"
- This is done by creating a boolean selector and creating a new dataframe using the inverse

In [122]:
# Boolean mask: True for rows whose Borough is the "Not assigned" placeholder.
na_rows = df[0].Borough == "Not assigned"
# Keep every other row. .copy() makes f0 an independent frame, so later
# assignments into it are unambiguous and do not trigger
# SettingWithCopyWarning.
f0 = df[0][~na_rows].copy()


- If Neighbourhood is not assigned then take the borough


In [123]:
# Rows whose Neighbourhood is still the "Not assigned" placeholder.
nb_rows = f0.Neighbourhood == "Not assigned"
# Series.where keeps a value where the condition is True and takes the Borough
# otherwise. Building a new frame with assign() avoids writing into a slice,
# so the global 'mode.chained_assignment' option never has to be switched off
# (and is not reset to a hard-coded value afterwards).
f0 = f0.assign(Neighbourhood=f0.Neighbourhood.where(~nb_rows, f0.Borough))

- Combine rows that share a postcode into one row, with the neighbourhoods comma-separated

First we use the `groupby` function to group rows with the same postcode.
We create a blank dataframe, with the same columns, ready to receive the grouped data

In [124]:
# Empty frame that shares f0's column layout.
ndf = pd.DataFrame(columns=f0.columns)
# One group per distinct postcode.
grouped = f0.groupby(by='Postcode')


We loop over each group, extracting the postcode, the Borough, and joining the Neighbourhood names.
Then we append to our new dataframe.


In [125]:
# Collapse each postcode group into a single record. The records are gathered
# in a plain list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0, copied the whole frame on every iteration, and made
# this cell add duplicate rows to ndf each time it was re-run.
records = [
    {
        'Postcode': postcode,
        # Uses the Borough of the group's first row.
        'Borough': group.Borough.iloc[0],
        'Neighbourhood': ",".join(group['Neighbourhood'].values.tolist()),
    }
    for postcode, group in grouped
]
ndf = pd.DataFrame(records, columns=f0.columns)


Finally we print the shape of the new dataframe

In [126]:
# Report the (rows, columns) size of the grouped frame.
n_rows, n_cols = ndf.shape
print((n_rows, n_cols))


(103, 3)


# Part Two - Merge the Long Lat data
Read the longitude and latitude for each postcode held in the CSV as a new data frame

In [127]:
# Load the per-postcode latitude/longitude table from the local CSV file.
ll_df = pd.read_csv(filepath_or_buffer="./Geospatial_Coordinates.csv")
# Plain-text preview of the first five rows.
print(ll_df.head(n=5))


  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


To combine the tables, we need to do a *join*.

Because the column names are not quite the same, we have to specify the join columns separately for left and right

This creates a new dataframe

In [128]:
# Join on the postcode; the key column has a different name in each frame.
# how='inner' is the default and is only spelled out here: postcodes missing
# from either frame are dropped from the result.
j = pd.merge(
    ndf,
    ll_df,
    how='inner',
    left_on='Postcode',
    right_on='Postal Code',
)

In [129]:
# 'Postal Code' repeats 'Postcode' after the join, so remove it.
final = j.drop(columns=['Postal Code'])
# Plain-text preview of the first ten rows.
print(final.head(n=10))

  Postcode      Borough                                  Neighbourhood  \
0      M1B  Scarborough                                  Rouge,Malvern   
1      M1C  Scarborough           Highland Creek,Rouge Hill,Port Union   
2      M1E  Scarborough                Guildwood,Morningside,West Hill   
3      M1G  Scarborough                                         Woburn   
4      M1H  Scarborough                                      Cedarbrae   
5      M1J  Scarborough                            Scarborough Village   
6      M1K  Scarborough      East Birchmount Park,Ionview,Kennedy Park   
7      M1L  Scarborough                  Clairlea,Golden Mile,Oakridge   
8      M1M  Scarborough  Cliffcrest,Cliffside,Scarborough Village West   
9      M1N  Scarborough                     Birch Cliff,Cliffside West   

    Latitude  Longitude  
0  43.806686 -79.194353  
1  43.784535 -79.160497  
2  43.763573 -79.188711  
3  43.770992 -79.216917  
4  43.773136 -79.239476  
5  43.744734 -79.239476  
6  

# Clustering
Use KMeans to group the table entries according to the longitude and latitude and plot using folium

### Import Libraries
First we import the libraries we will need

In [130]:
from sklearn.cluster import KMeans
import folium  # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors

### Clustering
Take a copy of the dataframe and drop columns except for long/lat, then run clustering

In [131]:
# Cluster by Long / Lat
kclusters = 5
# Select the coordinate columns by name rather than dropping the others: a
# later cell adds a 'Cluster' column to `final`, and re-running this cell
# after that would otherwise feed the old labels back in as a feature.
k = final[['Latitude', 'Longitude']]
# n_init=10 pins the number of restarts; scikit-learn changed the default
# from 10 to 'auto' in 1.4. random_state=0 keeps the labels repeatable.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(k)


### Merge
Merge the clustering results into the final dataset by creating a new column

In [132]:
# Add the cluster as a column
# DataFrame.insert raises ValueError when the column already exists, so the
# cell used to fail on a second run; overwrite the labels in that case.
if 'Cluster' in final.columns:
    final['Cluster'] = kmeans.labels_
else:
    final.insert(0, 'Cluster', kmeans.labels_)

Now create a Folium map using the centroid (mean) of the data set as the centre point:

In [133]:
# numeric_only=True restricts the mean to the numeric columns. `final` also
# holds text columns (Postcode, Borough, Neighbourhood), which pandas >= 2.0
# refuses to average and raises TypeError for.
centroid = final.mean(axis=0, numeric_only=True)
# Centre the map on the mean latitude/longitude of all postcodes.
map_clusters = folium.Map(location=[centroid.Latitude, centroid.Longitude], zoom_start=11)


Select some colours for the markers:

In [134]:


# Manual Colour Selection
# Five hand-picked positions on the rainbow colormap, converted to hex strings.
sample_points = [0.1, 0.2, 0.7, 0.8, 0.9]
colors_array = cm.rainbow(sample_points)
rainbow = list(map(colors.rgb2hex, colors_array))

Add markers to the map:

In [135]:
# One circle marker per postcode row, coloured by its cluster label.
for lat, lon, poi, cluster in zip(final['Latitude'], final['Longitude'],
                                  final['Neighbourhood'], final['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels start at 0, so index the palette directly. The previous
    # `cluster - 1` gave cluster 0 the last colour through negative indexing.
    marker_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7).add_to(map_clusters)
    

Show the map:

In [136]:
# Leaving the map as the cell's last expression makes the notebook render it.
map_clusters