# Data obtaining

In [1]:
import numpy as np 
import pandas as pd 
import requests
from bs4 import BeautifulSoup

In [2]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(response, 'lxml')

# Data transforming

In [3]:
def parse_html_table(table):
        n_columns = 0
        n_rows=0
        column_names = []
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows+=1
                if n_columns == 0:
                    n_columns = len(td_tags)
                        
            th_tags = row.find_all('th') 
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())
    
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")
    
        columns = column_names if len(column_names) > 0 else range(0,n_columns)
        df = pd.DataFrame(columns = columns,
                              index= range(0,n_rows))
        row_marker = 0
        for row in table.find_all('tr'):
            column_marker = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[row_marker,column_marker] = column.get_text()
                column_marker += 1
            if len(columns) > 0:
                row_marker += 1
                    
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass
            
        return df

In [4]:
for table in soup.find_all('table', class_="wikitable sortable"):
    df = parse_html_table(table)                      

In [5]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


# Data cleaning

In [6]:
df = df[df.Borough != 'Not assigned']
df = df.replace('\n',' ', regex=True)
df= df[df['Neighbourhood\n'] != 'Not assigned']
df = df.groupby(['Postcode','Borough'])['Neighbourhood\n'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df = df.sample(frac=1).reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9L,North York,Humber Summit
1,M5J,Downtown Toronto,"Harbourfront East , Toronto Islands , Union St..."
2,M9M,North York,"Emery , Humberlea"
3,M3H,North York,"Bathurst Manor , Downsview North , Wilson Heig..."
4,M3K,North York,"CFB Toronto , Downsview East"
5,M2R,North York,Willowdale West
6,M5N,Central Toronto,Roselawn
7,M5A,Downtown Toronto,"Harbourfront , Regent Park"
8,M5S,Downtown Toronto,"Harbord , University of Toronto"
9,M9A,Etobicoke,Islington Avenue


In [7]:
df.shape

(103, 3)