# Part 1

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Retrieve the webpage for editing.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

Make of 2-dimensional list of the table (rows, columns) discluding the header column.

In [3]:
trs = soup.find('table', class_='wikitable').select('tr')
table = [tr.get_text().split('\n')[1:4] for tr in trs][1:]

Convert the table to a dataframe and add column names.

In [4]:
df = pd.DataFrame(table, columns=['PostalCode', 'Borough',
                  'Neighborhood']).replace('\n', '', regex=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


Ignore cells with a borough that is **Not assigned**.

In [5]:
df = df[df.Borough != 'Not assigned']

If a cell has a borough but a **Not assigned** neighborhood, then the neighborhood will be the same as the borough.

In [6]:
df.loc[df.Neighborhood == 'Not assigned', 'Neighborhood'] = df.Borough

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


More than one neighborhood can exist in one postal code area. These two rows will be combined into one row with the neighborhoods separated with a comma. Also keep the original sort order.

In [7]:
df = df.groupby('PostalCode', as_index=False,
                sort=False).agg({'Borough': 'first',
                                'Neighborhood': ', '.join})
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


Print the number of rows of the dataframe.

In [8]:
df.shape

(103, 3)

# Part 2

Read the data file provided to a data frame, then merge this with original using the postal code as reference. Then remove the identical column from the coordinates file.

In [9]:
coords = pd.read_csv('https://cocl.us/Geospatial_data')
df = df.join(coords.set_index('Postal Code'), on='PostalCode', sort=False)
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


# Part 3

Import neccessary libraries.

In [10]:
!pip install geopy folium
from geopy.geocoders import Nominatim
import folium



We are required to explore and cluster the neighborhoods in Toronto. Thus we work with only boroughs that contain the word Toronto.

In [11]:
toronto = df[df.Borough.str.contains('Toronto')]
toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
31,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
36,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752


Generate a map to visualize the neighborhoods and how they cluster together.

In [12]:
address = 'Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

t_map = folium.Map(location = [latitude, longitude], zoom_start = 12)
for lat, lng, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(t_map)

display(t_map)