##### Extracting and cleaning the data

<p> Installing and importing necessary libraries. </p>

In [1]:
!pip install BeautifulSoup4
import requests
from bs4 import BeautifulSoup  # To work with a HTML page
import pandas as pd
import numpy as np



Extracting content from a given url and stroring it using BeutifulfulSoup

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL) 
# print(r.content) 
soup = BeautifulSoup(r.content, 'html5lib') 
# print(soup.prettify()) 

Dictionary to store the content of a page so that it can be directly converted to Dataframe for an easy manipulation.

In [3]:
from collections import defaultdict
dic = defaultdict(list)

Extracting data from a table tag with help of BeautifulSoup and storing it in a dictionary

In [4]:
table = soup.table
rows = table.find_all('tr')
for r in rows:
    cols = r.find_all('td')
#     row = [ dict[]=i.text[:-1] for i in cols]
    for i in range(len(cols)):
        if i == 0:
            dic['Postal Code'].append(cols[i].text[:-1])
        if i == 1:
            dic['Borough'].append(cols[i].text[:-1])
        if i == 2:
            dic['Neighborhood'].append(cols[i].text[:-1])
# dic          

Converting Dictionary to Dataframe.

In [5]:
data = pd.DataFrame.from_dict(dic)
data

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Extracting Boroughs which are 'Not assigned'

In [6]:
to_remove = data[data['Borough'] == 'Not assigned'].index

Dropping the Boroughs which are 'Not assigned'

In [7]:
data.drop(to_remove, inplace = True)
data

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Resetting index.

In [8]:
data.reset_index(inplace = True)
data.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [None]:
Dropping the unnecessary index column.

In [9]:
data.drop('index', axis = 1)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Shape of the final DataFrame.

In [10]:
data.shape

(103, 4)