In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import numpy as np

### Scrape the Toronto neighborhood data from Wikipedia into a pandas DataFrame

In [2]:
# Column labels shared by every stage of the postal-code table.
column_name = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Start from an empty frame that already carries the three expected columns.
neighborhoods = pd.DataFrame(columns=column_name)

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url_str = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the page inside a `with` block so the HTTP response is always closed,
# even if parsing fails. BeautifulSoup is handed the open response object,
# so `soup` is built before the block exits and the connection is released.
with urlopen(url_str) as url:
    soup = BeautifulSoup(url, 'html.parser')

In [4]:
# Locate the postal-code table; `find` returns None when nothing matches, so
# fail with a clear message instead of an AttributeError further down.
table = soup.find('table', attrs = {'class': 'wikitable sortable'})
if table is None:
    raise ValueError("Could not find a 'wikitable sortable' table in the page")

# Use the <tbody> when present, otherwise search the table element itself.
table_body = table.find('tbody')
if table_body is None:
    table_body = table
rows = table_body.find_all('tr')

# Collect one record per data row and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; building from a list also makes this cell safe to re-run
# without duplicating rows.
records = []
for row in rows:
    cols = [c.text.strip() for c in row.find_all('td')]

    # Rows without <td> cells (presumably header rows) and rows with fewer
    # than three cells cannot fill all three columns, so skip them.
    if len(cols) >= 3:
        records.append({'PostalCode': cols[0], 'Borough': cols[1], 'Neighborhood': cols[2]})

neighborhoods = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])


In [5]:
# Preview the first rows of the freshly scraped table.
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Drop the rows whose borough is "Not assigned"

In [6]:
# Keep only the rows whose borough is something other than "Not assigned".
has_borough = neighborhoods['Borough'].ne('Not assigned')
neighborhoods = neighborhoods[has_borough]

In [7]:
# Preview the table again to check the result of the borough filter.
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Merge the "Neighborhood" values of rows that share the same "PostalCode" and "Borough"

In [8]:
# Collapse rows that share a postal code and borough into a single row whose
# neighborhoods are joined into one comma-separated string.
by_code_and_borough = neighborhoods.groupby(['PostalCode', 'Borough'])
merged_names = by_code_and_borough['Neighborhood'].agg(','.join)

# Turn the group keys back into ordinary columns.
neighborhoods = merged_names.reset_index()

### If a row has a borough but a "Not assigned" neighborhood, use the borough name as the neighborhood

In [9]:
# Flag the rows whose neighborhood is still the "Not assigned" placeholder.
unassigned = neighborhoods['Neighborhood'] == 'Not assigned'

# For flagged rows take the borough name; leave every other row unchanged.
neighborhoods['Neighborhood'] = np.where(
    unassigned,
    neighborhoods['Borough'],
    neighborhoods['Neighborhood'],
)

In [11]:
# Show the (rows, columns) dimensions of the cleaned table.
neighborhoods.shape

(103, 3)