## 1. Scraping using pandas

In [52]:
import pandas as pd

# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html parses every <table> on the page into a list of DataFrames;
# keep the first one, with its first row used as the column header.
page_tables = pd.read_html(io=url, header=0)
df = page_tables[0]

In [53]:
# Dimensions of the table returned by read_html, as (rows, columns).
df.shape

(287, 3)

In [54]:
# Preview the first rows to check the parsed header and values.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 2. Scraping using BeautifulSoup

In [68]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Bound the wait and surface HTTP errors (4xx/5xx) instead of silently
# parsing whatever error page the server sent back.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# The page is HTML, so parse it with an HTML parser. The 'xml' parser is meant
# for XML documents and additionally requires lxml to be installed;
# 'html.parser' ships with the standard library.
soup = BeautifulSoup(text, 'html.parser')

table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Without this check the next line fails with an opaque AttributeError.
    raise ValueError(
        "No table with class 'wikitable sortable' found at " + url
    )
rows = table.find_all('tr')

# Collect the stripped text of each row's <td> cells. Rows without any <td>
# (e.g. a header row built from <th> cells) yield an empty list and are skipped.
data = []
for row in rows:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:
        data.append(cells)

df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])

In [69]:
# Dimensions of the frame built from the BeautifulSoup rows, as (rows, columns).
df.shape

(287, 3)

In [70]:
# Preview the first rows of the frame built from the BeautifulSoup rows.
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 3. Prepare the DataFrame

### Filter out rows whose Borough is "Not assigned"

In [71]:
# Keep only the rows whose Borough has actually been assigned.
has_borough = df['Borough'].ne("Not assigned")
df = df.loc[has_borough]

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [72]:
# Dimensions after dropping the rows with a "Not assigned" Borough.
df.shape

(210, 3)

### Combine the neighbourhoods that share a PostalCode into a single comma-separated row

In [73]:
def _join_unique(values):
    """Return the distinct values, sorted, as one ", "-separated string."""
    return ", ".join(sorted(set(values)))


# Collapse to one row per PostalCode; each remaining column becomes the
# sorted, de-duplicated values of its group joined with ", ".
df = df.groupby("PostalCode", as_index=False).agg(_join_unique)

In [74]:
# Preview the grouped frame: one row per PostalCode.
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [75]:
# Dimensions after grouping; the row count is the number of distinct PostalCodes.
df.shape

(103, 3)

### Change "Not assigned" neighborhood values to be the same as Borough

In [81]:
# Rows with a "Not assigned" Borough were removed earlier, so every Borough
# left in the frame is a real name. Wherever the Neighbourhood is still
# "Not assigned", reuse that row's Borough as its Neighbourhood.

unassigned = df['Neighbourhood'].eq("Not assigned")
# The right-hand Series is aligned on the index, so each selected row
# receives the Borough from the same row.
df.loc[unassigned, 'Neighbourhood'] = df['Borough']

In [82]:
# Preview the frame after filling in the "Not assigned" neighbourhoods.
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [80]:
# Shape check: the replacement above only rewrites values, so the dimensions stay the same.
df.shape

(103, 3)