# Notebook for Scraping the Wikipedia Page with Postal Codes

<p>Importing the necessary libraries. BeautifulSoup is used to scrape the Wikipedia page.</p>

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

#### Getting the web page using the requests module

In [2]:
# Download the article HTML. requests applies no timeout by default, so a stalled
# connection would block this cell forever; bound the wait explicitly.
page=requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",timeout=30)
# Stop here on an HTTP error status (4xx/5xx) instead of parsing an error page in later cells.
page.raise_for_status()

#### Using BeautifulSoup to parse the page we requested

In [3]:
# Build a parse tree from the response body using the 'html.parser' backend.
soup=BeautifulSoup(page.content,'html.parser')

#### Finding all the rows in the page using the find_all method

In [4]:
# Every <tr> element in the parsed document, in document order. This is not restricted
# to a single table, so later cells must select the rows they need.
trows=soup.find_all('tr')

In [5]:
# One accumulator list per table column; each starts out empty and is filled row by row.
postal_code,borough,Neighbourhood=[],[],[]

<p>Looping over each row, reading each cell, and adding it to the corresponding list. The rows are limited to 289 since that is where the postal code data table ends.</p>

In [6]:
# Exclusive upper bound of the <tr> slice that holds the postal-code table.
# NOTE(review): this value is tied to the page layout at scrape time - re-check it
# if the article is restructured.
LAST_TABLE_ROW=289

# Rebind the accumulators so that re-running this cell does not append duplicate rows.
postal_code,borough,Neighbourhood=[],[],[]

# Row 0 is skipped by the slice; the remaining rows are expected to carry three <td> cells.
for tr in trows[1:LAST_TABLE_ROW]:
    td=tr.find_all('td')
    if len(td)<3:
        # Row without the three expected data cells: skip it rather than
        # raising IndexError below.
        continue
    # get_text() concatenates all nested text of a cell, whereas .string yields None
    # as soon as the cell has more than one child node (e.g. a link plus a newline).
    # strip() removes the surrounding whitespace, including the trailing newline.
    code,boro,hood=(cell.get_text().strip() for cell in td[:3])
    postal_code.append(code)
    borough.append(boro)
    Neighbourhood.append(hood)

#### Converting the data scraped into a DataFrame

In [9]:
# Pair up the three parallel lists into (postal code, borough, neighbourhood) tuples;
# zip stops at the shortest list.
scraped_rows=list(zip(postal_code,borough,Neighbourhood))
column_names=['PostalCode','Borough','Neighbourhood']
df_pc=pd.DataFrame(scraped_rows,columns=column_names)

In [10]:
# Preview the first rows of the scraped table.
df_pc.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Removing the rows whose Borough is "Not assigned"

In [11]:
# Mark unassigned boroughs as missing. The result is assigned back to the column:
# calling replace(..., inplace=True) on df_pc['Borough'] is chained assignment on a
# selected column, which under pandas Copy-on-Write does not update df_pc.
df_pc['Borough']=df_pc['Borough'].replace('Not assigned',np.nan)
# Drop every row that contains a missing value; the original index labels are kept.
df_pc=df_pc.dropna(axis=0)
# Show a sample instead of dumping the whole frame.
df_pc.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### Replacing Not assigned Neighbourhoods with the Borough

In [12]:
# Rows whose neighbourhood is unassigned take their borough name instead.
# Assigning the Borough Series (aligned on the index) works for zero, one or many
# matching rows; the previous .item() call raised ValueError unless exactly one matched.
unassigned=df_pc['Neighbourhood']=='Not assigned'
df_pc.loc[unassigned,'Neighbourhood']=df_pc.loc[unassigned,'Borough']

In [13]:
# Spot-check the row labelled 8 (postal code M7A), whose Neighbourhood was
# 'Not assigned' before the replacement above.
df_pc.loc[8]

PostalCode                M7A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 8, dtype: object

#### Combining the Neighbourhoods that are from the same Postal Code

In [14]:
# Collapse to one row per (PostalCode, Borough) pair, joining the neighbourhood
# names of each group into a single comma-separated string.
group_keys=['PostalCode','Borough']
pc_clean=(
    df_pc
    .groupby(group_keys)['Neighbourhood']
    .apply(','.join)
    .reset_index()
)

In [15]:
# Preview the first rows of the combined table.
pc_clean.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [16]:
# (rows, columns) of the combined table.
pc_clean.shape

(103, 3)