In [1]:
##Import Libraries and Start Reading Data 
import pandas as pd 
import numpy as np 
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


In [2]:
from bs4 import BeautifulSoup
import requests

### Import Wikipedia library and pull in data table

In [3]:
import wikipedia as wp
 
#Get the html source
html = wp.page("List_of_postal_codes_of_Canada:_M").html().encode("UTF-8") #set source of html
df = pd.read_html(html,header=0)[0] #read in html to dataframe
df.head() #let's see it


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Clean up dataframe

In [4]:

df = df[~df.Borough.str.contains("Not assigned")]
df = df.groupby(df['Postcode'],as_index=False).aggregate(lambda x: ', '.join(set(x.dropna()))) #group by borough and add columns if multiple neighbourhoods
df['Neighbourhood'].loc[df['Neighbourhood']=='Not assigned'] = df.loc[df['Neighbourhood']=='Not assigned','Borough'].iloc[0] #if neighbourhood not assigned set = to borough
df.columns = ['PostalCode','Borough','Neighborhood']
print (df)

    PostalCode           Borough  \
0          M1B       Scarborough   
1          M1C       Scarborough   
2          M1E       Scarborough   
3          M1G       Scarborough   
4          M1H       Scarborough   
5          M1J       Scarborough   
6          M1K       Scarborough   
7          M1L       Scarborough   
8          M1M       Scarborough   
9          M1N       Scarborough   
10         M1P       Scarborough   
11         M1R       Scarborough   
12         M1S       Scarborough   
13         M1T       Scarborough   
14         M1V       Scarborough   
15         M1W       Scarborough   
16         M1X       Scarborough   
17         M2H        North York   
18         M2J        North York   
19         M2K        North York   
20         M2L        North York   
21         M2M        North York   
22         M2N        North York   
23         M2P        North York   
24         M2R        North York   
25         M3A        North York   
26         M3B        North 

In [5]:
df.shape

(103, 3)

### Read in location via csv locally

In [6]:
path = "C:/Users/alex.hodes/Desktop/Coursera_Capstone";
location = pd.read_csv(path + "/" + 'toronto_coord.csv',encoding="utf-8")
location.head()
location.columns = ['PostalCode','Latitude','Longitude']

In [7]:
df = pd.merge(df,location,on='PostalCode', how='left')
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Oakridge, Golden Mile, Clairlea",43.711112,-79.284577
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.692657,-79.264848


In [8]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library


In [9]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    #label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [10]:
df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [11]:
#Choose an amount of clusters for analysis - use cluster count to see if neighborhoods are clustered like borough
kclusters = 11
tor = df
#Cluster only based on latitude and longitude so drop these columns
toronto_cluster = tor.drop(['PostalCode','Borough','Neighborhood'], 1)

#Run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_cluster)

#Set Cluster labels as a column for the dataframe 
tor['Cluster'] = kmeans.labels_

In [12]:
#Creat map of Toronto
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor['Latitude'], tor['Longitude'], tor['Neighborhood'], tor['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Compare the clusters vs how the boroughs are divided https://en.wikipedia.org/wiki/Amalgamation_of_Toronto#/media/File:Toronto_map.png