# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto #
## Part One:  Creating the data table ##

In [5]:
import numpy as np # library for vectorized computation

import lxml
import pandas as pd
from bs4 import BeautifulSoup
import requests
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 



In [6]:
### laod postal code table from Wikipedia
myURL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

req = requests.get(myURL)
soup = BeautifulSoup(req.content, 'html.parser')
myTable = soup.find_all('table')[0]
  

new_table = [] 
    
row_marker = 0
for row in myTable.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) > 1:
           new_table.append((cols[0].text.strip(),cols[1].text.strip(),cols[2].text.strip()))
           

df = pd.DataFrame(new_table)
df.columns = ['Postal Code', 'Borough', 'Neighborhood']
df.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Remove entries with no unassigned boroughs

df = df[~df.Borough.str.contains("Not assigned")]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Part Two:  Geocoding ##

In [10]:
# Get lat/long lookup by postal code
url="http://cocl.us/Geospatial_data"
c=pd.read_csv(url)
postal_codes = pd.merge(df,c,on='Postal Code')

postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [11]:
postal_codes.shape

(103, 5)

## Part Three:  Cluster by Lat / Long##


In [12]:
### Get lat/long of Toronto
address = 'Toronto, CA'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [14]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    ------------------------------------------------------------
                       

In [15]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, code, neighborhood in zip(postal_codes['Latitude'], postal_codes['Longitude'], postal_codes['Postal Code'], postal_codes['Neighborhood']):
    label = '{}, {}'.format(code,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='white',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [16]:
# get lat long points
x1 = postal_codes['Longitude']
x2 = postal_codes['Latitude']

# define random centres
dx1 = x1.max() - x1.min()
dx2 = x2.max() - x2.min()
#print(x1.min(),x1.max(),dx1, x2.min(),x2.max(), dx2)

import random
C1 = [x1.min() + dx1 * random.random(), x2.min() + dx2 * random.random()]
C2 = [x1.min() + dx1 * random.random(), x2.min() + dx2 * random.random()]
C3 = [x1.min() + dx1 * random.random(), x2.min() + dx2 * random.random()]

centers = [C1,C2,C3]
print(centers)

[[-79.39030900845809, 43.76932288045774], [-79.56756794382268, 43.70310203187367], [-79.24594978825783, 43.6977505629062]]


In [17]:
# update centers funcion
def update_centers(x1, x2, class_of_points):
    center1 = [np.mean(np.array(x1)[~class_of_points]), np.mean(np.array(x2)[~class_of_points])]
    center2 = [np.mean(np.array(x1)[class_of_points]), np.mean(np.array(x2)[class_of_points])]
    return [center1, center2]



In [20]:
# assign to colors function

colors_map = np.array(['blue', 'red', 'green'])
def assign_members(x1, x2, centers):
    compare_to_first_center = np.sqrt(np.square(np.array(x1) - centers[0][0]) + np.square(np.array(x2) - centers[0][1]))
    compare_to_second_center = np.sqrt(np.square(np.array(x1) - centers[1][0]) + np.square(np.array(x2) - centers[1][1]))

    class_of_points = compare_to_first_center > compare_to_second_center
    colors = colors_map[class_of_points + 1 - 1]
    return colors, class_of_points

In [21]:
for i in range(5):
    colors, class_of_points = assign_members(x1, x2, centers)
    centers = update_centers(x1, x2, class_of_points)
    print(i, centers)


0 [[-79.35624521282051, 43.71230553717948], [-79.52478493199999, 43.680590587999994]]
1 [[-79.34763186388889, 43.71205906527778], [-79.51216953870967, 43.68730141612904]]
2 [[-79.34512919285714, 43.712529674285705], [-79.50750625454545, 43.687803618181825]]
3 [[-79.34009514242423, 43.713380163636366], [-79.49893163513514, 43.68895961621622]]
4 [[-79.34009514242423, 43.713380163636366], [-79.49893163513514, 43.68895961621622]]


In [22]:
### join color assignments to original table
postal_codes['color'] = colors
postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,color
0,M3A,North York,Parkwoods,43.753259,-79.329656,blue
1,M4A,North York,Victoria Village,43.725882,-79.315572,blue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,blue
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,red
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,blue


In [24]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, code, neighborhood, color in zip(postal_codes['Latitude'], postal_codes['Longitude'], postal_codes['Postal Code'], postal_codes['Neighborhood'], postal_codes['color']):
    label = '{}, {}'.format(code,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto