In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import geocoder
import folium
import numpy as np

In [2]:
# Scrape the list of Canada postal codes using BeautifulSoup

# Download the page; fail fast on network/HTTP errors instead of silently
# parsing an error page, and never hang forever on an unresponsive mirror.
response = requests.get(
    'http://zims-en.kiwix.campusafrica.gos.orange.com/wikipedia_en_all_nopic/A/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# The page is HTML, so parse it with an HTML parser ('xml' selects the XML
# parser, which is not meant for HTML documents).
can_html = BeautifulSoup(source, 'html.parser')
can_html = can_html.find('table')
if can_html is None:
    raise ValueError('No <table> element found in the downloaded page')

# Three columns of the table: PostalCode, Borough, and Neighborhood
col_names = ['Postalcode', 'Borough', 'Neighborhood']

# Collect the stripped text of every table row that has exactly one <td> cell
# per expected column; rows with a different number of <td> cells are skipped.
rows = []
for tr_cell in can_html.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(col_names):
        rows.append(row_data)

# Build the frame once from the collected rows instead of growing it
# row by row with .loc, which re-allocates on every insert.
can_df = pd.DataFrame(rows, columns=col_names)

In [4]:
# Cleansing the data from not assigned cells

# Work on an independent copy so the raw scraped table in can_df is never
# touched by the cleaning steps below (plain assignment would only alias it).
df = can_df.copy()

# Clean NA cells and cells whose `Borough` is `Not assigned`.
# The explicit .copy() makes df a standalone frame, so the assignments below
# modify df itself rather than a temporary slice.
df = df.dropna()
df = df[df['Borough'] != 'Not assigned'].copy()

# If a cell has a borough but a Not assigned neighborhood,
# then the neighborhood will be the same as the borough.
# Use a single .loc assignment: chained indexing (df[col][mask] = ...) raises
# SettingWithCopyWarning and does nothing under pandas Copy-on-Write.
not_assigned_ids = df['Neighborhood'] == 'Not assigned'
df.loc[not_assigned_ids, 'Neighborhood'] = df.loc[not_assigned_ids, 'Borough']

# Replace ' /' by ',' (literal match; regex=False keeps the behavior the same
# regardless of the pandas version's default for `regex`)
df['Neighborhood'] = df['Neighborhood'].str.replace(' /', ',', regex=False)

In [5]:
# Collapse rows that share a postal code into a single row whose
# Neighborhood is the comma-separated list of all its neighborhoods.

# Step 1: one joined neighborhood string per postal code.
temp = (
    df.groupby('Postalcode')['Neighborhood']
    .apply(', '.join)
    .rename('Neighborhood_joined')
    .reset_index(drop=False)
)

# Step 2: attach the joined string to every original row, discard the
# per-row neighborhood, and keep one row per distinct combination.
df_merge = (
    df.merge(temp, on='Postalcode')
    .drop(columns=['Neighborhood'])
    .drop_duplicates()
    .rename(columns={'Neighborhood_joined': 'Neighborhood'})
)
df_merge.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,"Rouge, Malvern"
9,M3B,North York,Don Mills North
10,M4B,East York,"Woodbine Gardens, Parkview Hill"
12,M5B,Downtown Toronto,"Ryerson, Garden District"


In [6]:
# Sanity check: (rows, columns) of the merged table
df_merge.shape

(103, 3)

In [7]:
# Visualization

# Load the coordinates table and align its key column name with df_merge.
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')
geo_df = geo_df.rename(columns={'Postal Code': 'Postalcode'})

# Attach borough/neighborhood to the coordinates (inner join on postal code).
geo_merge = pd.merge(geo_df, df_merge, on='Postalcode')

# Keep only the boroughs whose name contains "Toronto" and renumber the rows.
# reset_index is chained (not inplace) so it never operates on a filtered slice.
toronto_df = geo_merge[geo_merge['Borough'].str.contains("Toronto")].reset_index(drop=True)

# Create Toronto map
map_toronto = folium.Map(location=[43.65, -79.38], zoom_start=10)

# Add one circle marker per postal code, with a "neighborhood, borough" popup
for lat, lng, borough, neighborhood in zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Borough'],
    toronto_df['Neighborhood'],
):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is a Popup option; it is set here and not repeated on the marker.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

toronto_df.head()

Unnamed: 0,Postalcode,Latitude,Longitude,Borough,Neighborhood
0,M4E,43.676357,-79.293031,East Toronto,The Beaches
1,M4K,43.679557,-79.352188,East Toronto,"The Danforth West, Riverdale"
2,M4L,43.668999,-79.315572,East Toronto,"The Beaches West, India Bazaar"
3,M4M,43.659526,-79.340923,East Toronto,Studio District
4,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park


In [8]:
# Show the folium map as the cell output.
# Note: the map will not be displayed on GitHub.
map_toronto