### Import libraries

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Load data from wiki url
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text
data = BeautifulSoup(source, 'lxml')

In [5]:
# Create DataFrame
cols = ['Postalcode', 'Borough', 'Neighbourhood']
toronto_df = pd.DataFrame(columns = cols)

In [6]:
# For loop to find content in the columns
content = data.find('div', class_='mw-parser-output')
table = content.table.tbody

postalcode = 0
borough = 0
neighbourhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postalcode = td.text
            i += 1
        elif i == 1:
            borough = td.text
            i += 1
        elif i == 2:
            neighbourhood = td.text.strip('\n').replace(']', '')
    toronto_df = toronto_df.append({'Postalcode': postalcode, 'Borough': borough, 'Neighbourhood': neighbourhood}, ignore_index=True)

toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,0,0,0
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [7]:
# DataFrame Cleaning
toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df = toronto_df[toronto_df.Borough != 0]

toronto_df.reset_index(drop=True, inplace=True)

# If Neighbourhood content is 'Not assigned' means it is same as the content in Borough
i = 0
for i in range(0, toronto_df.shape[0]):
    if toronto_df.iloc[i][2] == 'Not assigned':
        toronto_df.iloc[i][2] = toronto_df.iloc[i][1]
        i += 1

df = toronto_df.groupby(['Postalcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# Drop None value and replace 'Not assigned' with 'NaN'
df = df.dropna()
df = df[(df.Postalcode != 'Not assigned') & (df.Borough != 'Not assigned') & (df.Neighbourhood != 'Not assigned')]

df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
def neighbourhood_list(groupedDataframe):
    return ', '.join(sorted(groupedDataframe['Neighbourhood'].tolist()))

grouped_df = df.groupby(['Postalcode', 'Borough'])
new_df = grouped_df.apply(neighbourhood_list).reset_index(name='Neighbourhood')

new_df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
type(new_df)

pandas.core.frame.DataFrame