In [10]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 4000)

## Scrape table from Webpage

In [5]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipage= requests.get(url).text

# Parse html/xml codes from webpage.
soup = BeautifulSoup(wikipage,'xml')

# get the table in wikipage
table = soup.find('table')

listPostcode = []
listBorough = []
listNeighbourhood = []

for row in table.find_all('tr'):      
    cells = row.find_all('td')
    if cells:   # if cells has elements
        Postcode_var = cells[0].find(text = True)
        Borough_var = cells[1].find(text = True)
        Neighbourhood_var = cells[2].find(text = True).strip()    # .strip() to remove new line character '\n' at end of text.
    else:
        continue    # skip to next row if no elements
    

    # skip to next row if Borough = 'Not assigned'    
    if Borough_var == 'Not assigned': 
        continue
    
    # if Neighbourhood = 'Not assigned', then Neighborhood will be the same as Borough   
    if Neighbourhood_var == 'Not assigned': 
        Neighbourhood_var = Borough_var

    
    listPostcode.append(Postcode_var)
    listBorough.append(Borough_var)
    listNeighbourhood.append(Neighbourhood_var)

## Combine Neighbourhoods that share same Postcode

In [7]:
listUniqPostcode = set(listPostcode)
print(f'Number of unique Postcode: {len(listUniqPostcode)}')
print(f'Number of all Postcode: {len(listPostcode)}')
listNewPostcode = []
listNewBorough = []
listNewNeighbourhood = []


for postcode in listUniqPostcode:
    p_var = ''; b_var = ''; n_var = ''; 
    for idx, item in enumerate(listPostcode):
        if item == postcode:
            p_var = item;
            b_var = listBorough[idx]
            if n_var == '':    # if Neighbourhood hasn't got value due to new postcode
                n_var = listNeighbourhood[idx]
            else:     # if Neighbourhood already has value due to same postcode
                n_var = n_var + ', ' + listNeighbourhood[idx]
                
    listNewPostcode.append(p_var)
    listNewBorough.append(b_var)
    listNewNeighbourhood.append(n_var)

Number of unique Postcode: 103
Number of all Postcode: 210


## Create dataframe

In [12]:
dict = {'Postcode':listNewPostcode, 'Borough':listNewBorough, 'Neighbourhood':listNewNeighbourhood}
df = pd.DataFrame.from_dict(dict)
df.to_csv('toronto_part1.csv')
print(df.head(10))
print()
print(df.shape)

  Postcode           Borough                                      Neighbourhood
0      M9M        North York                                   Emery, Humberlea
1      M4M      East Toronto                                    Studio District
2      M5B  Downtown Toronto                           Ryerson, Garden District
3      M1J       Scarborough                                Scarborough Village
4      M6S      West Toronto                                 Runnymede, Swansea
5      M4N   Central Toronto                                      Lawrence Park
6      M4Y  Downtown Toronto                               Church and Wellesley
7      M1H       Scarborough                                          Cedarbrae
8      M2J        North York                       Fairview, Henry Farm, Oriole
9      M9R         Etobicoke  Kingsview Village, Martin Grove Gardens, Richv...

(103, 3)
