***Part 1 Begin*** <br>
Import Pandas

In [1]:
import pandas as pd

Read the Table, keep only the first table (As there are many on the page)

In [2]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

df = df[0]

Remove rows where the Borough is not assigned and then reset index

In [3]:
df = df[df.Borough != 'Not assigned']
df.reset_index(inplace=True, drop=True)

Display dataframe

In [4]:
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Show shape of dataframe

In [5]:
df.shape

(103, 3)

***Here ends part 1***

***Part 2 Begin*** <br>

Read in the CSV in Pandas

In [6]:
df_latlon = pd.read_csv('https://cocl.us/Geospatial_data')


Merge the two dfs by Postal Code

In [7]:
df=df.merge(df_latlon, left_on='Postal Code', right_on='Postal Code', how='inner')
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


***Here ends part 2***

***Part 3 Begin***
<br> Filter only Toronto Boroughs

In [94]:
toronto_data = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True) 
toronto_data

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


Define user_agent using Nominatim, install Folium then plot a Folium map displaying all Neighborhoods

In [35]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [None]:
!conda install -c conda-forge folium=0.5.0 --yes 


In [119]:
# create map of Manhattan using latitude and longitude values
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

Cluster the Data using K-means and 4 clusters since there are 4 Boroughs in Toronto

In [121]:
toronto_grouped = toronto_data.groupby('Neighborhood').mean().reset_index()


from sklearn.cluster import KMeans
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1, 1, 3, 1, 1, 1, 3, 1, 2], dtype=int32)

Add Clusters to the Toronto Dataframe

In [122]:
toronto_data.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_data

ValueError: cannot insert Cluster Labels, already exists

Visualize the data on a folium Map

In [123]:
import numpy as np 
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'], toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Clustering the neighborhoods without any numerical data makes no sense. Perhaps it is best to cluster the neighborhoods around the Boroughs

Create Clusters from Boroughs and insert Cluster labels 1-4 as there are 4 Boroughs

In [128]:
toronto_data = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True) 
 #Redefining toronto_data
clusters = toronto_data.groupby('Borough').mean().reset_index()
cluster_list = [1,2,3,4]
clusters.insert(0, 'Cluster Labels', cluster_list)

clusters



Unnamed: 0,Cluster Labels,Borough,Latitude,Longitude
0,1,Central Toronto,43.70198,-79.398954
1,2,Downtown Toronto,43.654597,-79.383972
2,3,East Toronto,43.669436,-79.324654
3,4,West Toronto,43.652653,-79.44929


Display Clusters onto map

In [129]:

# create map of Manhattan using latitude and longitude values
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(clusters['Latitude'], clusters['Longitude'], clusters['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map



Set Cluster labels on toronto data dataframe

In [144]:
toronto_data['Cluster Labels'] = toronto_data['Borough'].map(clusters.set_index('Borough')['Cluster Labels'])


In [145]:
toronto_data

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,3
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,2
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,2
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,2
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,4


In [146]:
import numpy as np 
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

clustervalue = toronto_data['Cluster Labels'].values



# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'], toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Neighborhoods clustered into Boroughs with corresponding color
<br>
***Part 3 end*** <br>
