# 1. Data Collection

In [204]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [205]:
# Download the Wikipedia list of Canadian "M" postal codes and parse the HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [206]:
# Locate the <div> with class "mw-parser-output"; the table is searched for inside it.
panel = soup.find('div', attrs={'class': 'mw-parser-output'})

# 2. Data Formation

## 2.1 Data Preprocessing

In [207]:
# Parallel result lists: one entry per postal code that has an assigned borough.
PostalCodes_list = []
Boroughs_list = []
Neigborhoods_list = []
# Scan every <p> element of the first table in the panel. The postal code is
# read from the element's <b> tag and the place names from its <span> tag,
# whose text is expected to look like "Borough(Hood A / Hood B)".
for Block in panel.find('table').find_all('p'):
    PostalCode = Block.b.text
    Cities = Block.span.text
    # Everything before the first "(" is the borough.
    parts = Cities.split("(")
    Borough = parts[0].strip()
    # Postal codes without an assigned borough are left out of the results.
    if Borough == 'Not assigned':
        continue
    # Neighborhoods are the "/"-separated names inside the first parentheses.
    Neigborhoods = []
    if len(parts) > 1:
        Neigborhoods = [name.strip() for name in parts[1].split(")")[0].split('/')]
        Neigborhoods = [name for name in Neigborhoods if name]
    # A borough with no neighborhoods listed previously raised an IndexError
    # that a blanket `except: pass` swallowed, silently dropping the postal
    # code. Fall back to the borough name so the row is kept.
    Neigborhood = ", ".join(Neigborhoods) if Neigborhoods else Borough
    # Append the postal code, borough and neighborhoods to their lists.
    PostalCodes_list.append(PostalCode)
    Boroughs_list.append(Borough)
    Neigborhoods_list.append(Neigborhood)

## 2.2 Dataframe Creation

In [208]:
# Assemble the three parallel lists into one DataFrame, one row per postal code.
# The mapping is passed inline rather than bound to a variable named `dict`,
# which would shadow the builtin for every later cell in the kernel.
df = pd.DataFrame({
    'PostalCode': PostalCodes_list,
    'Borough': Boroughs_list,
    'Neigborhoods': Neigborhoods_list,
})

# 3. Locations

## 3.1 Download locations

In [209]:
!wget -O Geospatial_Coordinates.csv http://cocl.us/Geospatial_data

--2020-03-18 15:52:51--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 158.85.108.83, 158.85.108.86, 169.48.113.194
Connecting to cocl.us (cocl.us)|158.85.108.83|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2020-03-18 15:52:51--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|158.85.108.83|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-18 15:52:52--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.29.197
Connecting to ibm.box.com (ibm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-18 15:52:52--  https://ibm.box.com/public

In [210]:
# Load the downloaded coordinates file into a DataFrame.
coordinates_csv = 'Geospatial_Coordinates.csv'
df_locations = pd.read_csv(coordinates_csv)

In [211]:
# Rename the key column to 'PostalCode', the name used by the scraped table,
# so both frames can be aligned on it. Rebinding the result instead of using
# inplace=True avoids mutating a frame that other cells also reference.
df_locations = df_locations.rename(columns={'Postal Code': 'PostalCode'})
df_locations.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## 3.2 Combining dataframes

In [212]:
# Preview the first five rows of the scraped table before joining.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neigborhoods
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M9A,Etobicoke,Islington Avenue


In [214]:
# Index both frames by postal code, then left-join the coordinates onto the
# scraped rows so every scraped postal code is kept.
boroughs_by_code = df.set_index('PostalCode')
locations_by_code = df_locations.set_index('PostalCode')
df_merge = boroughs_by_code.join(locations_by_code, how='left')
df_merge.head()

Unnamed: 0_level_0,Borough,Neigborhoods,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
