# Scraping Wikipedia for postal codes of Canada

## Here starts part one.

Import necessary modules for web scraping and dataframe generation:

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Download the Wikipedia page at the given URL with `requests` and parse its HTML with BeautifulSoup:

In [19]:
# Download the Wikipedia page that lists the postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(url, timeout=30)  # bounded wait so a stalled connection cannot hang the notebook
page.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
# Parse the raw HTML so the postal code table can be located in the next cell.
soup = BeautifulSoup(page.content, 'html.parser')

Find table with postal code data:

In [20]:
# First <table> element whose class attribute is 'wikitable sortable'.
table = soup.find('table', {'class': 'wikitable sortable'})
# The table rows live inside its <tbody> element.
table_body = table.find('tbody')
# Every <tr> of the body; consumed by the parsing loop in the next cell.
rows = table_body.find_all('tr')

For each row in the table, strip the trailing "\n" from every cell. Skip rows whose Borough is 'Not assigned'; if a row's Neighbourhood is 'Not assigned', use its Borough value instead. Build a dataframe from the remaining rows.

In [98]:
# Turn the scraped table rows into [postal code, borough, neighbourhood] records.
not_assigned = 'Not assigned'
records = []
for tr in rows:
    # Text of every <td> in this row, with surrounding whitespace
    # (including the trailing '\n') removed.
    cells = [td.get_text().strip() for td in tr.find_all('td')]
    if len(cells) < 3:
        # Rows without three data cells (e.g. a header row that only has <th>
        # cells) carry no record; skip them explicitly instead of relying on a
        # swallowed IndexError.
        continue
    if cells[1] == not_assigned:
        continue  # boroughs that are not assigned are left out entirely
    if cells[2] == not_assigned:
        cells[2] = cells[1]  # fall back to the borough name for the neighbourhood
    records.append(cells)

# Build the frame once from all records, ordered by postal code with a fresh 0..n-1 index.
postal = (
    pd.DataFrame(records, columns=["Postal Code", "Borough", "Neighbourhood"])
    .sort_values('Postal Code')
    .reset_index(drop=True)
)
postal

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


Get geospatial data for each postal code area from csv file:

## Here starts part two.

Load csv file with geospatial coordinates:

In [178]:
# Read the coordinates table from the CSV file (path is relative to the working directory).
geocoords = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

Copy the latitude and longitude data into the postal code dataframe:

In [177]:
# Look the coordinates up by postal code instead of by row position, so each row
# gets its own coordinates even if the two tables differ in order or length.
# Postal codes missing from the CSV end up as NaN rather than shifting other rows.
# NOTE(review): assumes the CSV has a 'Postal Code' column holding the same codes
# as postal['Postal Code'] — confirm against the file.
coords_by_code = geocoords.set_index('Postal Code')
postal['Latitude'] = postal['Postal Code'].map(coords_by_code['Latitude'])
postal['Longitude'] = postal['Postal Code'].map(coords_by_code['Longitude'])

Filter dataframe to find data for Toronto only:

In [179]:
# Keep every row whose borough name contains 'Toronto'. The mask works for any
# number of matching boroughs, not only for exactly four.
is_toronto = postal['Borough'].str.contains('Toronto', regex=False, na=False)
df_tor = (
    postal[is_toronto]
    .sort_values('Borough')
    .reset_index(drop=True)  # fresh 0..n-1 index without keeping the old one as a column
)
df_tor

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
1,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
3,M4P,Central Toronto,Davisville North,43.712751,-79.390197
4,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
5,M4S,Central Toronto,Davisville,43.704324,-79.38879
6,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
7,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
8,M5N,Central Toronto,Roselawn,43.711695,-79.416936
9,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752


## Here starts part three.

Import modules for fetching geo data, k-means clustering and map plotting: 

In [171]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import numpy as np
import matplotlib.colors as colors

!pip install folium
import folium # map rendering library



Cluster the data with KMeans from sklearn, using 4 clusters for the fit since there are four different boroughs in Toronto. Use the folium package to show the result.

In [180]:
# Base map centred on the given Toronto coordinates.
toronto_map = folium.Map(location=[43.67, -79.4], zoom_start=12)

# Feature matrix for clustering: one (latitude, longitude) row per postal code.
X = df_tor['Latitude']
Y = df_tor['Longitude']
Z = np.stack((X, Y), axis=1)

# One fill colour per cluster. Named `cluster_colors` so it does not rebind
# `colors`, which is the alias of the imported matplotlib.colors module.
cluster_colors = ['red', 'green', 'blue', 'yellow']
# random_state fixes the initialisation; n_init is given explicitly so the fit
# does not depend on the default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=len(cluster_colors), n_init=10, random_state=42).fit(Z)
df_tor['Cluster'] = kmeans.labels_

# Draw one circle marker per postal code: the popup shows the borough, the fill
# colour shows the cluster the point was assigned to.
for latitude, longitude, borough, cluster in zip(df_tor['Latitude'], df_tor['Longitude'],
                                                 df_tor['Borough'], df_tor['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)

toronto_map