In [1]:
!pip install beautifulsoup4



### Read the Toronto neighborhood table from Wikipedia using BeautifulSoup

In [2]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of letting an error page be parsed as data.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the HTML document
soup = BeautifulSoup(page.content, 'html.parser')

# Collect every <table> element and let pandas convert them to DataFrames;
# the first resulting frame is kept (assumed to be the postal-code table).
# The markup is wrapped in StringIO because passing a literal HTML string
# to pd.read_html is deprecated in recent pandas releases.
table = soup.find_all('table')
df_nbrhood = pd.read_html(StringIO(str(table)))[0]


In [3]:
# Relabel the three columns positionally with the names used by the later cells
df_nbrhood.columns = pd.Index(['PostalCode', 'Borough', 'Neighborhood'])

# Report the (rows, columns) dimensions of the loaded table
print(df_nbrhood.shape)

(180, 3)


### Data wrangling

#### Exclude rows where Borough='Not assigned'

In [4]:
# Keep only the rows whose Borough is something other than the placeholder 'Not assigned'
df_nbrhood = df_nbrhood.loc[df_nbrhood['Borough'].ne('Not assigned')]

#### Check if 'PostalCode' values are unique 

In [5]:
# (rows, columns) remaining after the 'Not assigned' boroughs were excluded
df_nbrhood.shape

(103, 3)

In [6]:
# Number of distinct PostalCode values; dropna=False keeps a missing value
# (if any) in the count, matching len(...unique()).
df_nbrhood['PostalCode'].nunique(dropna=False)

103

The result above shows that the PostalCode values are unique: the number of distinct postal codes (103) equals the total number of rows (103).

#### Check whether any 'Neighborhood' value is 'Not assigned'

In [7]:
# Show any rows whose Neighborhood is still the placeholder 'Not assigned'
df_nbrhood.loc[df_nbrhood['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood


The empty result above shows that no row has Neighborhood = 'Not assigned'.

In [8]:
# Renumber the rows 0..n-1 in place; drop=True discards the old index
# labels instead of keeping them as a new column.
df_nbrhood.reset_index(inplace=True, drop=True)

### The cleaned data

In [9]:
# Preview the first five rows of the cleaned table
df_nbrhood.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [10]:
# Final (rows, columns) dimensions of the cleaned table
df_nbrhood.shape

(103, 3)