# Installing and importing the necessary libraries

In [1]:
! pip install pandas
import pandas as pd
! pip install lxml
! pip install html5lib
! pip install beautifulsoup4
print("Libraries installed")

Libraries installed


# Obtaining the data from Wikipedia

In [2]:
# read_html is imported from the top-level pandas namespace (the documented public
# entry point) rather than from the pandas.io.html implementation module.
from pandas import read_html

# Wikipedia page listing the Canadian postal codes that start with "M".
page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Parse every <table class="wikitable"> on the page; the result is a list of DataFrames.
wikitables = read_html(page, attrs={"class": "wikitable"})
print("Exracted {num} wikitables".format(num=len(wikitables)))
wikitables[0].shape

Exracted 1 wikitables


(288, 3)

# Converting the Wikitable into a Pandas DataFrame

In [3]:
# Take the first extracted table and wrap it in a DataFrame for the steps below.
wiki = wikitables[0]
df = pd.DataFrame(data=wiki)
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Ignoring the cells with a borough that is "Not assigned"

In [4]:
# Keep only the rows whose borough is something other than "Not assigned".
has_borough = df['Borough'].ne('Not assigned')
new_df = df.loc[has_borough]
new_df.shape

(211, 3)

# Combining rows with the same Postcode, separating the Neighbourhoods with a comma

In [13]:
# One row per (Postcode, Borough) pair, in first-seen order (sort=False), with the
# neighbourhood names of that pair joined into a single comma-separated string.
by_postcode = new_df.groupby(['Postcode', 'Borough'], sort=False)
joined_names = by_postcode['Neighbourhood'].apply(lambda names: ', '.join(names))
grouped = joined_names.reset_index()
grouped.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


# Replacing all "Not assigned" neighbourhoods with the name of the Borough

In [14]:
# Where the neighbourhood is the "Not assigned" placeholder, use the borough name instead.
unassigned = grouped['Neighbourhood'].eq('Not assigned')
grouped.loc[unassigned, 'Neighbourhood'] = grouped.loc[unassigned, 'Borough']
grouped.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


# What is the shape of the dataframe?

In [7]:
# Dimensions of the grouped frame as a (rows, columns) tuple.
grouped.shape

(103, 3)

# Importing the Geospatial_data

In [8]:
import os
from pathlib import Path

import pandas as pd

# Location of the coordinates CSV. Defaults to the current user's Downloads folder,
# so no personal user name is hardcoded; set the GEO_DATA_CSV environment variable
# to read the file from somewhere else.
geo_csv_path = Path(
    os.environ.get(
        "GEO_DATA_CSV",
        Path.home() / "Downloads" / "Geospatial_Coordinates.csv",
    )
)
geo_data = pd.read_csv(geo_csv_path)
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Renaming the column on which the merge will be performed

In [9]:
# Give the key column the same name as in the Wikipedia frame ("Postcode").
postcode_column = {"Postal Code": "Postcode"}
renamed_geo_data = geo_data.rename(postcode_column, axis="columns")
renamed_geo_data.head(n=5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Merging the Geospatial_data with the data from Wikipedia based on the Postal Code

In [12]:
# Left join: keep every row of `grouped` and attach the coordinates with the same Postcode.
result = grouped.merge(renamed_geo_data, how='left', on='Postcode')
result.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


# Explore boroughs that contain the word Toronto 