<h1>Segmenting and Clustering Neighborhoods in Toronto</h1>

In [1]:
#import required libraries
import time

import numpy as np
import pandas as pd

In [2]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Parse every HTML table on the Wikipedia postal-code page into a list of DataFrames
postal_codes_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df_master = pd.read_html(postal_codes_url)

In [4]:
# Display the first table parsed from the page (postal code, borough, neighborhood)
df_master[0]

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


In [5]:
# Keep only the three columns of interest, then drop every row that has a
# missing value in any of them.
required_columns = ['Postal code', 'Borough', 'Neighborhood']
postal_table = pd.DataFrame(df_master[0], columns=required_columns)
df_table = postal_table.dropna(axis=0)
df_table.shape

(103, 3)

In [6]:
# Preview the first rows of the table after dropping rows with missing values
df_table.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [7]:
# Discard rows whose borough is the 'Not assigned' placeholder
has_borough = df_table['Borough'].ne('Not assigned')
df = df_table.loc[has_borough]
df.shape

(103, 3)

In [10]:
#copy Borough data when neighborhood data is missing
# Rows yielded by iterrows() may be copies, so assigning to them is not
# guaranteed to change df. Build the corrected column in one vectorized step
# and rebind df to the result instead.
missing_neighborhood = df['Neighborhood'] == 'Not assigned'
df = df.assign(
    Neighborhood=df['Neighborhood'].mask(missing_neighborhood, df['Borough'])
)

In [16]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-joined list of the group's neighborhoods.
neighborhoods_by_code = df.groupby(['Postal code', 'Borough'])['Neighborhood']
df_merge = neighborhoods_by_code.apply(','.join).reset_index()

In [17]:
# Check the row/column count of the merged frame
df_merge.shape

(103, 3)

__Analyze Data__

In [14]:
# Install geopy from conda-forge, then import its Nominatim geocoder class
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.2 MB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.21.0-py_0

The following packages will be UPDATED:

  openssl                                 1.1.1f-h516909a_0 --> 1.1.1g-h51

In [25]:
#method to get Lat and Lng of Postal Code
def getLatLng(postal_code, max_attempts=5, delay=1.0):
    """Geocode a Toronto postal code with Nominatim.

    The lookup address is built as '<postal_code>, Toronto, Ontario'. The
    geocoder is queried up to ``max_attempts`` times, pausing ``delay``
    seconds between attempts, so a postal code that never resolves can no
    longer hang the notebook in an endless request loop.

    Parameters
    ----------
    postal_code : str
        Postal code to look up.
    max_attempts : int, default 5
        Maximum number of geocoding requests to make.
    delay : float, default 1.0
        Seconds to wait between consecutive requests.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the first match, or ``(None, None)``
        when no attempt returned a result.
    """
    address = '{}, Toronto, Ontario'.format(postal_code)

    # One geocoder instance is enough; it does not depend on the attempt.
    geolocator = Nominatim(user_agent="ny_explorer")

    for attempt in range(max_attempts):
        lat_lng_coords = geolocator.geocode(address)
        if lat_lng_coords is not None:
            return lat_lng_coords.latitude, lat_lng_coords.longitude
        # Pause before retrying, but not after the final attempt.
        if attempt < max_attempts - 1:
            time.sleep(delay)

    return None, None

In [None]:
# Geocode each postal code in df_merge, then split the resulting
# (latitude, longitude) pairs into two parallel lists.
coordinate_pairs = [getLatLng(postal_code) for postal_code in df_merge['Postal code']]
lat_list = [pair[0] for pair in coordinate_pairs]
lng_list = [pair[1] for pair in coordinate_pairs]

print("No. of latitudes:", len(lat_list), "No. of longitude:", len(lng_list))

__Note: A postal code alone does not form a complete address, so the geocoder cannot reliably return its latitude and longitude.
The coordinates are therefore read from the shared CSV file instead.__

In [29]:
# Load the shared coordinates CSV, indexed by its 'Postal Code' column
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
df_ll = pd.read_csv(geospatial_csv_url, index_col='Postal Code')

In [30]:
# Check the row/column count of the coordinates table
df_ll.shape

(103, 2)

In [32]:
# Add lat, lng data to the dataframe
# Assigning to the rows yielded by iterrows() is not guaranteed to write back
# to df_merge, so look up all postal codes in df_ll at once and assign whole
# columns. This also keeps the coordinates numeric instead of seeding the
# columns with empty strings. A postal code absent from df_ll raises KeyError.
postal_codes = df_merge['Postal code'].tolist()
coords = df_ll.loc[postal_codes, ['Latitude', 'Longitude']]

# coords is indexed by postal code while df_merge has a positional index, so
# assign the raw values to keep the rows aligned by position.
df_merge['Latitude'] = coords['Latitude'].to_numpy()
df_merge['Longitude'] = coords['Longitude'].to_numpy()

In [33]:
# Preview the merged frame, now including the Latitude and Longitude columns
df_merge.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.8067,-79.1944
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7845,-79.1605
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395


__Exploring__

In [52]:
# Keep only the rows whose borough name contains the word 'Toronto'
borough_mentions_toronto = df_merge['Borough'].str.contains('Toronto')
df_t = df_merge[borough_mentions_toronto]
df_t.shape

(39, 5)

In [47]:
# Geocode the city name; the resulting coordinates are used to centre the map
address = 'Toronto'
geolocator = Nominatim(user_agent="ny_explorer")

location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

In [50]:
# folium must be imported before it is used; this cell is where it is first needed
import folium

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of df_t, with a "neighborhood, borough" popup
for lat, lng, borough, neighborhood in zip(df_t['Latitude'], df_t['Longitude'], df_t['Borough'], df_t['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium treat the label as text to be escaped
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# last expression of the cell: renders the map
map_toronto