In [1]:
import pandas as pd

Once rows whose borough is "Not assigned" are removed, the dataframe has 211 rows and 3 columns.

In [2]:
# Read the scraped Wikipedia table and keep only the rows that have a real
# borough (the placeholder value "Not assigned" is filtered out).
df = (
    pd.read_csv('wiki_scrape.csv')
    .loc[lambda frame: frame['Borough'].ne("Not assigned")]
)

# Show a preview followed by the (rows, columns) shape.
print(df.head(), df.shape, sep='\n')

  PostalCode           Borough      Neighborhood
2        M3A        North York         Parkwoods
3        M4A        North York  Victoria Village
4        M5A  Downtown Toronto      Harbourfront
5        M5A  Downtown Toronto       Regent Park
6        M6A        North York  Lawrence Heights
(211, 3)


1. The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
2. Only process the cells that have an assigned borough. Ignore cells whose borough is "Not assigned".
3. More than one neighborhood can exist in one postal code area. 
4. If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.

In [3]:
# Build one row per (PostalCode, Borough) with its neighbourhoods joined by commas.
#
# The "Not assigned" neighbourhood placeholder is replaced by the borough name
# *before* aggregating. Doing it afterwards only catches groups whose joined
# string is exactly "Not assigned"; a postal code that mixes a real
# neighbourhood with a placeholder row would otherwise keep the literal text
# "Not assigned" inside its comma-separated list.
df2 = (
    df.assign(
        Neighborhood=lambda frame: frame['Neighborhood'].where(
            frame['Neighborhood'].ne('Not assigned'), frame['Borough']
        )
    )
    .groupby(by=['PostalCode', 'Borough'])
    .agg(','.join)
    # Turn the group keys back into ordinary columns (fresh 0..n-1 index).
    .reset_index()
)

After the data cleaning/aggregating, we now have 103 rows in the dataset

In [4]:
# Display the (rows, columns) shape of the aggregated frame df2.
df2.shape

(103, 3)

Using the link to a CSV file with the given coordinates for each neighborhood, we obtain this dataframe:

In [5]:
# Load the latitude/longitude lookup table from the local CSV file.
Geospatial_Coordinates_df = pd.read_csv(
    filepath_or_buffer='Geospatial_Coordinates.csv',
)

# Preview the first five rows.
Geospatial_Coordinates_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


With the geospatial coordinates and the wiki dataset, we then merge the two on their common postal codes.

In [6]:
# Attach coordinates by matching postal codes explicitly.
#
# DataFrame.join aligns rows on the index (row position here), so it is only
# correct when both frames happen to be sorted identically and have the same
# length. A key-based left merge keeps every row of df2, preserves its order,
# and pairs each postal code with its own coordinates regardless of row order.
df2 = df2.merge(
    Geospatial_Coordinates_df,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [7]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result must be assigned back for the redundant "Postal Code" column to go away.
# errors="ignore" keeps the cell safe to re-run after the column is already gone.
df2 = df2.drop(columns="Postal Code", errors="ignore")

In [8]:
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

In [10]:
from geopy.geocoders import Nominatim

# Free-form place name sent to the Nominatim geocoding service.
address = 'TORONTO,CANADA'

geolocator = Nominatim(user_agent = "python3")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message here instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# Map centre used by the folium cell further down.
latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [11]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of df2. Rows without coordinates are skipped:
# folium raises on NaN locations, which would abort the whole cell.
marker_rows = (
    df2.dropna(subset=['Latitude', 'Longitude'])
    [['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
    .itertuples(index=False, name=None)
)
for lat, lng, borough, neighborhood in marker_rows:
    # parse_html=True makes the popup treat the label as text, not markup.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is not
    # passed to the marker itself.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell: renders the interactive map.
map_toronto