In [1]:
# Notebook-wide imports: numeric/data libraries, geocoding, plotting helpers,
# clustering and map rendering.
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib colour-map and colour-conversion modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means implementation used for the clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Simple confirmation that every import above succeeded.
print('Libraries imported.')

Libraries imported.


In [2]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
from urllib.request import urlopen
optionsUrl = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Read the whole body inside a context manager so the HTTP connection is closed
# as soon as the download finishes instead of being left open for the kernel's lifetime.
# optionsPage holds the raw HTML bytes, which BeautifulSoup accepts directly.
with urlopen(optionsUrl) as response:
    optionsPage = response.read()

In [3]:
# Parse the downloaded Wikipedia HTML with Beautiful Soup so tags can be searched.
from bs4 import BeautifulSoup
# Name the parser explicitly: without it Beautiful Soup picks whichever parser happens
# to be installed (and warns about guessing), so the parse tree could differ between
# environments. 'html.parser' ships with Python, so no extra dependency is needed.
soup = BeautifulSoup(optionsPage, 'html.parser')

In [4]:
# Grab the first <table> whose class attribute is "wikitable sortable"
# (find() returns a single element, not every table on the page).
My_table = soup.find('table', class_='wikitable sortable')

In [10]:
# Walk every <tr> of the selected table and collect the stripped text of its <td> cells,
# one list per row. Rows that contain no text at all, or no <td> cells (presumably the
# <th>-only header row), are skipped.

table_rows = My_table.find_all('tr')
res = []

for tr in table_rows:
    # The blank-row check depends only on the row, so evaluate it once per row
    # rather than once per cell inside the comprehension.
    if not tr.text.strip():
        continue
    tds = tr.find_all('td')
    row = [td.text.strip() for td in tds]
    if row:
        res.append(row)

# Turn the scraped rows into a pandas DataFrame; each kept row is expected to hold
# exactly three cells matching the column names below.
df = pd.DataFrame(res, columns=["Postalcode", "Borough", "Neighborhood"])
df.head(20)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [11]:
# (rows, columns) of the scraped table.
df.shape

(289, 3)

In [12]:
# Discard every row whose borough is "Not assigned", then renumber the rows from 0.

df.drop(index=df.index[df['Borough'].eq("Not assigned")], inplace=True)
df.reset_index(inplace=True, drop=True)
df.head(15)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [14]:
# Where a row has a borough but its neighborhood is "Not assigned",
# reuse the borough name as the neighborhood.

df['Neighborhood'] = df['Neighborhood'].mask(df['Neighborhood'] == "Not assigned", df['Borough'])
df.reset_index(inplace=True, drop=True)
df.head(20)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [16]:
# A postal code can cover several neighborhoods. Collapse those rows into a single row
# per (Postalcode, Borough) pair, with the neighborhood names joined by ", ".

df['Neighborhood'] = df['Neighborhood'].astype(str)
toronto_df = (
    df.groupby(['Postalcode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
toronto_df.head(20)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [18]:
# .shape is an attribute (not a method); as the cell's last expression it displays
# the (rows, columns) of the grouped dataframe.
toronto_df.shape

(103, 3)