In [81]:
# import required libraries

import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes
import requests # 'requests' offers the most friendly API for opening files, including JSON support

In [82]:
# import the BeautifulSoup library so we can parse HTML and XML documents

from bs4 import BeautifulSoup

# specify which URL/web page we are going to be scraping

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# open the url and put the html into the soup variable

response = requests.get(url)
html = response.text
soup = BeautifulSoup(html,'html.parser')

In [83]:
all_table=soup.find('table', class_='wikitable sortable')

A=[]
B=[]
C=[]

for row in all_table.findAll('tr'):
    
    cells=row.findAll('td')
    
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

**The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood**

In [84]:
df = pd.DataFrame(A,columns=['PostalCode'])
df['Borough']=B
df['Neighbourhood']=C

# had to clean the data by removing \n that has appeared at the end of each text string in each column ... don't know why?!

df['PostalCode'] = df['PostalCode'].str.rstrip('\n')
df['Borough'] = df['Borough'].str.rstrip('\n')
df['Neighbourhood'] = df['Neighbourhood'].str.rstrip('\n')
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


**Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.**

In [85]:
df['Borough'].replace('Not assigned', np.nan, inplace=True)
df.dropna(subset=['Borough'], inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


**Combine rows so that each row has a unique postal code.  Seperate multiple neighbourhoods with a comma**

In [86]:
df.groupby('PostalCode').agg({'Borough':'first','Neighbourhood': ', '.join}).reset_index()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


**If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.**

In [87]:
# print(df.loc[df['Neighbourhood'] == 'Not assigned']) ... this shows that there are no 'Not assigned' values in Neighborhood

# But in case there were, this function replaces the neighbourhood not assigned value with the bourough value

df['Neighbourhood']=df['Borough'].where(df['Neighbourhood'].eq('Not assigned'),df['Neighbourhood'])
df.head()


Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [88]:
# Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.

# In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

df.shape

(103, 3)