# Neighborhoods of Toronto

In [1]:
from io import StringIO
from urllib.request import urlopen

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display_html
from sklearn.cluster import KMeans

## Get the Postal Code Data
1. Read the webpage 
2. Use Beautiful Soup to parse the HTML
3. Pull the table data
4. Display the data to be sure we have what we need

In [2]:

# Download the Wikipedia page listing Toronto ("M") postal codes and keep the
# raw HTML of its first <table> element.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails immediately on an HTTP error instead of parsing
# an error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

soup=BeautifulSoup(source,'lxml')
if soup.table is None:
    # Without this check str(None) == 'None' would be handed to pandas later
    # and fail there with a much less helpful message.
    raise ValueError('No <table> element found at {}'.format(url))

# `tab` is the table's HTML as a string; it is parsed into a DataFrame below.
tab = str(soup.table)
display_html(tab,raw=True)

Postal Code,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


## Create a Pandas dataframe

In [3]:
# Parse the scraped table HTML into DataFrames (read_html returns a list,
# one frame per <table> found) and keep the first one.
# The string is wrapped in StringIO because passing literal HTML directly to
# read_html is deprecated since pandas 2.1; a file-like object works on every
# pandas version.
dfs = pd.read_html(StringIO(tab))
df = dfs[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Remove items that do not have a Borough

In [4]:
# Discard postal codes whose borough is the placeholder 'Not assigned'.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Aggregate like Postal Codes

In [5]:
# Collapse to one row per (Postal Code, Borough) pair, joining the remaining
# text values with ', '. sort=False keeps the groups in first-appearance order.
dfg = (
    df.groupby(['Postal Code', 'Borough'], sort=False)
      .agg(', '.join)
      .reset_index()
)
dfg

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### If there is no assigned neighbourhood, copy the borough to its place.
### Set Pandas so that all rows are shown to confirm everything is working as it should be.


In [6]:
# Where the neighbourhood is the placeholder 'Not assigned', fall back to the
# borough name; every other row keeps its own neighbourhood value.
neighbourhood_missing = dfg['Neighbourhood'] == 'Not assigned'
dfg['Neighbourhood'] = np.where(
    neighbourhood_missing,
    dfg['Borough'],
    dfg['Neighbourhood'],
)

# Raise the display limit so the whole frame can be shown without truncation.
pd.set_option('display.max_rows', len(dfg) + 1)
dfg.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Sanity check: show (rows, columns) to confirm the record count is unchanged.
dfg.shape

(103, 3)

In [18]:
# Disabled geocoder experiment: every line below is commented out, so this
# cell has no effect when run.
# NOTE(review): if this is ever re-enabled, the while-loop has no retry limit
# and will spin forever whenever g.latlng keeps coming back as None.
#!pip install geocoder
#import geocoder # import geocoder

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
 # g = geocoder.google('1600 Amphitheatre Pkwy, Mountain View, CA')
 # print(g)
 # lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

### The geocoder approach above was tried but did not work, even with the fixed example address from the package documentation.
### Instead, use the CSV file linked in the submission instructions, as shown below.

In [19]:
# Load the CSV of postal-code coordinates
# (columns: Postal Code, Latitude, Longitude).
urlNH = 'https://cocl.us/Geospatial_data'
dfLatLon = pd.read_csv(filepath_or_buffer=urlNH)

dfLatLon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Attach latitude/longitude to each postal code. This is an inner join on the
# shared 'Postal Code' column (inner is the default join type).
dfMerge = dfg.merge(dfLatLon, on='Postal Code')
dfMerge.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Let's only look at boroughs whose name contains 'Toronto'

In [21]:
# Keep only the rows whose borough name contains the literal text 'Toronto'
# (regex=False makes it a plain substring match).
is_toronto_borough = dfMerge['Borough'].str.contains('Toronto', regex=False)
dfT = dfMerge[is_toronto_borough]
dfT

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [22]:

import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

## Why Google Toronto's latitude and longitude when we can look them up in code?

In [23]:
# Look up Toronto's coordinates with Nominatim; they are used as the centre
# point of the maps drawn below.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="CA_Explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute access below.
    raise ValueError('Nominatim returned no result for {!r}'.format(address))

tlatitude = location.latitude
tlongitude = location.longitude
# The message previously said "Manhattan" (left over from a template) and
# misspelled "geographical".
print('The geographical coordinates of Toronto are {}, {}.'.format(tlatitude, tlongitude))

The geograpical coordinate of Manhattan are 43.6534817, -79.3839347.


## Use a for loop and Folium to put our neighbourhoods on the map

In [24]:
# create map of Toronto using the geocoded latitude and longitude values
map_toronto = folium.Map(location=[tlatitude, tlongitude], zoom_start=10)

# add one circle marker per row of dfT, with the neighbourhood name(s) as popup
for lat, lng, label in zip(dfT['Latitude'], dfT['Longitude'], dfT['Neighbourhood']):
    # parse_html belongs to folium.Popup only; it is not a CircleMarker
    # option, so it is no longer passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

## Identify clusters using k-means: drop the non-numeric columns, then add a cluster column

In [25]:
kclusters = 5

# Build the feature matrix by removing the text columns.
# - `columns=` is used because the positional axis argument (`drop(labels, 1)`)
#   was removed in pandas 2.0 and raises TypeError there.
# - 'Cluster Labels' is also removed when present, so re-running this cell
#   does not feed the previous run's labels back in as a feature.
toronto_grouped_clustering = (
    dfT.drop(columns=['Neighbourhood', 'Postal Code', 'Borough'])
       .drop(columns='Cluster Labels', errors='ignore')
)

# run k-means clustering
# n_init is pinned to 10 (the long-standing default) so the result does not
# change with the installed scikit-learn version, whose default later changed.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# Store the label of each row as the first column of dfT.
# Any existing label column is dropped first because DataFrame.insert raises
# ValueError when the column already exists, which broke re-running the cell.
dfT = dfT.drop(columns='Cluster Labels', errors='ignore')
dfT.insert(0, 'Cluster Labels', kmeans.labels_)

## Use Folium to map Toronto with the lat and long used earlier

In [17]:
map_clusters = folium.Map(location=[tlatitude, tlongitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, evenly spaced
# over matplotlib's rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(dfT['Latitude'], dfT['Longitude'], dfT['Neighbourhood'], dfT['Cluster Labels']):
    # Labels run 0..kclusters-1, so they index the palette directly. The old
    # `cluster-1` only worked because index -1 wraps round to the last colour.
    cluster_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True),
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters