### Import the libraries we need

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Get the website

In [2]:
# Download the Wikipedia page listing Toronto ("M") postal codes.
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook immediately on an HTTP error
# instead of letting later cells parse an error page.
source = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
source.raise_for_status()

In [3]:
# Show the HTTP status code of the response fetched above.
source.status_code

200

### Scrape the table from the website and put it into a dataframe

In [4]:
# Parse the raw response bytes with the lxml parser, then display the
# type of the resulting object.
soup = BeautifulSoup(markup=source.content, features='lxml')
type(soup)

bs4.BeautifulSoup

In [5]:
# Keep only the first <table> element of the page; limit=1 lets the search
# stop as soon as that first match is found.
table = soup.find_all('table', limit=1)[0]

In [6]:
# read_html returns a list of DataFrames, one per <table> found in the markup.
# The markup is wrapped in StringIO because passing a literal HTML string
# directly is deprecated in recent pandas releases; file-like input is accepted.
df = pd.read_html(StringIO(str(table)))

In [7]:
# `df` is the list returned by read_html; take its first element as the
# working DataFrame.
dataframe = df[0]

In [8]:
# Inspect the current column labels (integer positions at this point, since
# the header text was read as an ordinary data row).
dataframe.columns.values 

array([0, 1, 2])

In [9]:
# The real header text sits in the first data row, so promote that row to
# column labels. Converting it to a plain list keeps the row's own label (0)
# from being carried over as the name of the columns index, which would show
# up as a stray "0" in the rendered table header.
dataframe.columns = dataframe.iloc[0].tolist()
dataframe.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [10]:
# Remove the row labelled 0 (the duplicated header text), keeping the
# remaining rows and their original labels.
dataframe = dataframe.drop(0)


### Only process the cells that have an assigned borough. Ignore cells with a borough that is 'Not assigned'.

In [11]:
# Keep only the rows whose Borough is something other than the
# 'Not assigned' placeholder.
dataframe = dataframe.loc[dataframe['Borough'].ne('Not assigned')]

In [12]:
# Report (rows, columns) after filtering, then preview the first rows.
print(dataframe.shape)
dataframe.head()

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [13]:
# Renumber the rows from 0 after the filtering above; drop=True discards the
# old labels instead of keeping them as a column.
dataframe = dataframe.reset_index(drop=True)
# The shape is unchanged by the renumbering; print it and preview the result.
print(dataframe.shape)
dataframe.head()

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Combine the rows that have the same postal code, and put the neighbourhoods in the same cell.

In [14]:
# For every (Postcode, Borough) pair, concatenate its neighbourhood names
# into one comma-separated string. The result is a Series named
# 'Neighbourhood' indexed by (Postcode, Borough).
a = (
    dataframe
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
)
a.head()


Postcode  Borough    
M1B       Scarborough                            Rouge, Malvern
M1C       Scarborough    Highland Creek, Rouge Hill, Port Union
M1E       Scarborough         Guildwood, Morningside, West Hill
M1G       Scarborough                                    Woburn
M1H       Scarborough                                 Cedarbrae
Name: Neighbourhood, dtype: object

In [18]:
# Turn the grouped Series back into a flat table: the (Postcode, Borough)
# index levels become ordinary columns next to 'Neighbourhood'.
df = a.reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### If a cell has a borough but a 'Not assigned' neighbourhood, then the neighbourhood will be the same as the borough

In [51]:
# Row labels of the entries whose Neighbourhood is exactly 'Not assigned'.
a = df.index[df['Neighbourhood'].eq('Not assigned')]
a

Int64Index([85], dtype='int64')

In [55]:
# For the rows selected in `a`, overwrite Neighbourhood with that row's Borough.
# NOTE(review): `a` was built with an exact match on 'Not assigned', and this
# runs after neighbourhoods were joined per postal code, so a joined string
# that merely contains 'Not assigned' would not be replaced here.
df.loc[a,'Neighbourhood'] = df.loc[a,'Borough']

### Print the number of rows of the dataframe.

In [60]:
# (rows, columns) of the final table; the first value is the row count.
df.shape

(103, 3)