# Explore and cluster the neighborhoods in Toronto:

In [2]:
# import required libraries 
import pandas
import numpy as np
import requests
from bs4 import BeautifulSoup

# import data 
website_text = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_text,'xml')

table = soup.find('table',{'class':'wikitable sortable'})
table_rows = table.find_all('tr')

data = []
for row in table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])

df = pandas.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])

# to filter out bad rows
df = df[~df['PostalCode'].isnull()]  

df.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"


In [3]:
# replace "Not assigned" to NaN
df.replace("Not assigned", np.nan, inplace = True)

# drop whole row with NaN
df.dropna(subset=["Borough", "Neighbourhood"], axis=0, inplace=True)

# reset index
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
# combined similar PostalCode 
df= df.groupby('PostalCode').agg(lambda x: ','.join(x))

#combined similar Borough
df.loc[df['Neighbourhood']=="Not assigned",'Neighbourhood']=df.loc[df['Neighbourhood']=="Not assigned",'Borough']

# remove duplicate Borough
df['Borough']= df['Borough'].str.replace('nan|[{}\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(",{2,}",",")

# reset index
df = df.reset_index()

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
df.shape

(103, 3)

In [6]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 8.7MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [8]:
import geocoder
def get_latilong(postal_code):
    lati_long_coords = None
    while(lati_long_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lati_long_coords = g.latlng
    return lati_long_coords
    
get_latilong('M4G')

[43.70909000000006, -79.36409999999995]

In [12]:
# Retrieving Postal Code Co-ordinates
postal_codes = df['PostalCode']    
coords = [ get_latilong(postal_code) for postal_code in postal_codes.tolist() ]

In [20]:
# Adding Columns Latitude & Longitude
import pandas as pd
df1 = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df1['Latitude']
df['Longitude'] = df1['Longitude']

In [22]:
df[df.PostalCode == 'M5G']

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
57,M5G,DowntownToronto,Central Bay Street,43.65609,-79.38493


In [23]:
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1752
3,M1G,Scarborough,Woburn,43.7682,-79.21761
4,M1H,Scarborough,Cedarbrae,43.76969,-79.23944
5,M1J,Scarborough,Scarborough Village,43.74309,-79.23526
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.72861,-79.26367
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.71406,-79.28412
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.7236,-79.23496
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69539,-79.26194


In [29]:
import pandas as pd
df1 = pd.read_csv('Geospatial_Coordinates.csv')
df2 = 
df1.head()

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neighbourhood
0,0,M1B,Scarborough,"Malvern, Rouge"
1,1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,3,M1G,Scarborough,Woburn
4,4,M1H,Scarborough,Cedarbrae


In [27]:
# Get the latitude and the longitude coordinates of each neighborhood
df1 = df.to_csv('Geospatial_Coordinates.csv')
df1

In [28]:
import csv

with open('Geospatial_Coordinates.csv') as csvfile:
     reader = csv.DictReader(csvfile)
     for row in reader:
         print(row['PostalCode'],row['Borough'], row['Neighbourhood'])

M1B Scarborough Malvern, Rouge
M1C Scarborough Rouge Hill, Port Union, Highland Creek
M1E Scarborough Guildwood, Morningside, West Hill
M1G Scarborough Woburn
M1H Scarborough Cedarbrae
M1J Scarborough Scarborough Village
M1K Scarborough Kennedy Park, Ionview, East Birchmount Park
M1L Scarborough Golden Mile, Clairlea, Oakridge
M1M Scarborough Cliffside, Cliffcrest, Scarborough Village West
M1N Scarborough Birch Cliff, Cliffside West
M1P Scarborough Dorset Park, Wexford Heights, Scarborough Town Centre
M1R Scarborough Wexford, Maryvale
M1S Scarborough Agincourt
M1T Scarborough Clarks Corners, Tam O'Shanter, Sullivan
M1V Scarborough Milliken, Agincourt North, Steeles East, L'Amoreaux East
M1W Scarborough Steeles West, L'Amoreaux West
M1X Scarborough Upper Rouge
M2H NorthYork Hillcrest Village
M2J NorthYork Fairview, Henry Farm, Oriole
M2K NorthYork Bayview Village
M2L NorthYork York Mills, Silver Hills
M2M NorthYork Willowdale, Newtonbrook
M2N NorthYork Willowdale, Willowdale East
M2P No

In [52]:
from  geopy.geocoders import Nominatim
geolocator = Nominatim()
location = geolocator.geocode("M1G, Scarborough, Woburn")
print(location.address)

print((location.latitude, location.longitude))

print(location.raw)

  from ipykernel import kernelapp as app


Woburn, Scarborough—Guildwood, Scarborough, Toronto, Golden Horseshoe, Ontario, M1H 2A2, Canada
(43.7598243, -79.2252908)
{'place_id': 4761941, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'node', 'osm_id': 558715617, 'boundingbox': ['43.7498243', '43.7698243', '-79.2352908', '-79.2152908'], 'lat': '43.7598243', 'lon': '-79.2252908', 'display_name': 'Woburn, Scarborough—Guildwood, Scarborough, Toronto, Golden Horseshoe, Ontario, M1H 2A2, Canada', 'class': 'place', 'type': 'neighbourhood', 'importance': 0.5588036674790013}
