# This notebook is created for the Coursera Capstone Project

This code scrapes the Wikipedia page listing the postal codes of Canada. BeautifulSoup was not needed because the table is read directly with `pandas.read_html`.

After scraping the page and constructing the dataframe, the data was filtered so that only rows with an assigned borough are processed.

Rows with the same postal code were combined, with their neighborhoods joined into a comma-separated value in the column Neighborhood.

For rows with an empty Neighborhood, the value of Borough was used in the column Neighborhood instead.

In [3]:
import pandas as pd
import numpy as np
from pypostalcode import PostalCodeDatabase
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [6]:
# EXERCISE PART 1: Creating the dataframe and transforming the data
# -----------------------------------------------------------------

# read_html returns one DataFrame per <table> found on the page.
d = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

# NOTE(review): assumes the postal-code table is the first table on the page
# and has exactly three columns -- confirm if the page layout changes.
df = d[0]
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Drop rows where Borough is "Not assigned"
df = df.replace('Not assigned', np.nan)
df = df.dropna(subset=['Borough'])

# Group by PostalCode and join the distinct values of every other column.
# dict.fromkeys de-duplicates while keeping first-seen order; a set would
# produce an arbitrary order that can differ from one run to the next.
df = df.groupby('PostalCode', as_index=False).agg(
    lambda x: ', '.join(dict.fromkeys(x.dropna()))
)

# Where the neighborhood is empty, use borough instead
def fx(x):
    """Return the neighborhood of a row, falling back to its borough.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series) with the keys
        'Neighborhood' and 'Borough'.

    Returns
    -------
    ``x['Neighborhood']`` when it holds a non-empty value, otherwise
    ``x['Borough']``.
    """
    neighborhood = x['Neighborhood']
    # NaN is truthy, so a plain truthiness test would hand a missing value
    # straight back; check for missing (None/NaN/NA) before checking for empty.
    if pd.isna(neighborhood) or not neighborhood:
        return x['Borough']
    return neighborhood
# Fill the Neighborhood column row by row using the fallback rule in fx.
df['Neighborhood'] = df.apply(fx, axis=1)

# Report the first rows and the frame dimensions for Part 1.
print("EXERCISE PART 1:", df.head(), df.shape, sep="\n")

# EXERCISE PART 2: Adding latitude & longitude to the dataframe
# -------------------------------------------------------------

# Function to search for latitude based on postal code
def searchlatitude(x, pcdb=None):
    """Return the latitude stored for postal code ``x``.

    Parameters
    ----------
    x : key to look up in the postal code database.
    pcdb : optional object supporting ``pcdb[x]`` whose result exposes a
        ``latitude`` attribute. When omitted, a new ``PostalCodeDatabase``
        is created for this call; pass one in to reuse it across lookups.

    Returns
    -------
    The ``latitude`` attribute of the matching entry, or the string
    "Not found" when the lookup fails.
    """
    if pcdb is None:
        pcdb = PostalCodeDatabase()
    try:
        return pcdb[x].latitude
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt/SystemExit
        # must still propagate so a long run can be interrupted.
        return "Not found"

# Function to search for longitude based on postal code
def searchlongitude(x, pcdb=None):
    """Return the longitude stored for postal code ``x``.

    Parameters
    ----------
    x : key to look up in the postal code database.
    pcdb : optional object supporting ``pcdb[x]`` whose result exposes a
        ``longitude`` attribute. When omitted, a new ``PostalCodeDatabase``
        is created for this call; pass one in to reuse it across lookups.

    Returns
    -------
    The ``longitude`` attribute of the matching entry, or the string
    "Not found" when the lookup fails.
    """
    if pcdb is None:
        pcdb = PostalCodeDatabase()
    try:
        return pcdb[x].longitude
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt/SystemExit
        # must still propagate so a long run can be interrupted.
        return "Not found"

# Add columns Latitude and Longitude
# The lookup helpers return the string 'Not found' on failure. Coercing with
# to_numeric turns that sentinel (and anything else non-numeric) into NaN and
# yields numeric columns, without running a string replace over every column
# of the frame.
df['Latitude'] = pd.to_numeric(df['PostalCode'].map(searchlatitude), errors='coerce')
df['Longitude'] = pd.to_numeric(df['PostalCode'].map(searchlongitude), errors='coerce')

# Drop the rows for which the postal code was not found
df = df.dropna(subset=['Latitude', 'Longitude'])

print("EXERCISE PART 2:")
print(df.head())
print(df.shape)

# EXERCISE PART 3: Exploring & clustering the neighborhoods of Toronto
# --------------------------------------------------------------------

# Create a boolean mask to filter rows where Borough contains "Toronto" and create new dataframe based on mask.
# na=False makes a missing Borough count as "no match" instead of yielding NaN in the mask.
boroughtoronto = df['Borough'].str.contains("Toronto", na=False)
neighborhoods = df[boroughtoronto]

# Get location of Toronto
address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude

# Create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'],
                                           neighborhoods['Borough'], neighborhoods['Neighborhood']):
    # parse_html=True is set on the Popup so the label text is treated as
    # text rather than markup; it is not a CircleMarker option.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

# Show map (in Jupyter Notebook)
map_toronto


EXERCISE PART 1:
  PostalCode      Borough                            Neighborhood
0        M1B  Scarborough                          Malvern, Rouge
1        M1C  Scarborough  Rouge Hill, Highland Creek, Port Union
2        M1E  Scarborough       Morningside, West Hill, Guildwood
3        M1G  Scarborough                                  Woburn
4        M1H  Scarborough                               Cedarbrae
(103, 3)
EXERCISE PART 2:
  PostalCode      Borough                            Neighborhood  Latitude  \
0        M1B  Scarborough                          Malvern, Rouge   43.7976   
1        M1C  Scarborough  Rouge Hill, Highland Creek, Port Union   43.7882   
2        M1E  Scarborough       Morningside, West Hill, Guildwood   43.7385   
3        M1G  Scarborough                                  Woburn   43.7563   
4        M1H  Scarborough                               Cedarbrae   43.7563   

   Longitude  
0   -79.2270  
1   -79.1911  
2   -79.2021  
3   -79.2224  
4   -79.241