In [179]:
import numpy as np 

import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)

import json

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from bs4 import BeautifulSoup # library to parse HTML and XML documents

print('Libraries imported!')

Libraries imported!


In [149]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_response = requests.get(wiki_url).text

## Load and Explore

In [158]:
soup = BeautifulSoup(wiki_response, 'lxml')
# To keep the code compact we comment out the soup.prettify() result
#print(soup.prettify())

In [177]:
table = soup.find_all('table')[0] 
neighborhoods = pd.read_html(str(table))[0]
neighborhoods.head()


Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [178]:
print(neighborhoods[1].value_counts())

Not assigned        77
Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                9 
East Toronto        7 
East York           6 
Queen's Park        1 
Borough             1 
Mississauga         1 
Name: 1, dtype: int64


## Remove rows where Borough and Neighborhood columns have 'Not assigned' values

In [167]:
neighborhoods = neighborhoods[neighborhoods[1]!='Not assigned']

# Reset the index
neighborhoods.reset_index(drop=True, inplace=True)
neighborhoods.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park


## Combine rows where more than one neighborhood exists in one postal code area

In [170]:
# Group dataframe and combine rows where more than one neighborhood exists into one comma separated row.
neighborhoods = neighborhoods.groupby([0, 1])[2].apply(lambda x: ','.join(x)).reset_index()
neighborhoods.head(12)

Unnamed: 0,0,1,2
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## Rows where Neighborhood column has 'Not assigned' value and replace it with corresponding Borough value.¶

In [171]:
borough_data = neighborhoods[1]
neighborhoods_data = neighborhoods[2]

# Loop through the data
for ind, (borough_address, neighborhoods_address) in enumerate(zip(borough_data, neighborhoods_data)):
    if neighborhoods_address.strip() == "Not assigned":
        neighborhoods_data[ind] = borough_data[ind]

# Check that borough named 'Queen's Park' has the same named neighbourhood
print(pd.DataFrame(neighborhoods.iloc[83:88,]))

      0             1                                                  2
83  M6R  West Toronto  Parkdale,Roncesvalles                            
84  M6S  West Toronto  Runnymede,Swansea                                
85  M7A  Queen's Park  Queen's Park                                     
86  M7R  Mississauga   Canada Post Gateway Processing Centre            
87  M7Y  East Toronto  Business Reply Mail Processing Centre 969 Eastern


In [172]:
print(neighborhoods.shape)

(104, 3)
