## Segmenting and Clustering Neighborhoods in Toronto - Part 3

### Importing the relevant libraries, particularly beautifulsoup and lxml

In [316]:
import numpy as np    # library to handle data in a vectorized manner
import pandas as pd   # library for data analsysis
import matplotlib.pyplot as plt 
import requests
!easy_install beautifulsoup4    # installing beautifulsoup
!easy_install lxml             # installing parser

Searching for beautifulsoup4
Best match: beautifulsoup4 4.9.3
Processing beautifulsoup4-4.9.3-py3.6.egg
beautifulsoup4 4.9.3 is already the active version in easy-install.pth

Using /home/jupyterlab/conda/envs/python/lib/python3.6/site-packages/beautifulsoup4-4.9.3-py3.6.egg
Processing dependencies for beautifulsoup4
Finished processing dependencies for beautifulsoup4
Searching for lxml
Best match: lxml 4.6.1
Processing lxml-4.6.1-py3.6-linux-x86_64.egg
lxml 4.6.1 is already the active version in easy-install.pth

Using /home/jupyterlab/conda/envs/python/lib/python3.6/site-packages/lxml-4.6.1-py3.6-linux-x86_64.egg
Processing dependencies for lxml
Finished processing dependencies for lxml


### Loading and organizing the table of data from Wikipedia

In [317]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)   # a timeout keeps the cell from hanging on a stalled connection
response.raise_for_status()                     # fail loudly on an HTTP error instead of parsing an error page
website_url = response.text                     # original name kept; it holds the page HTML, not a URL

from bs4 import BeautifulSoup
# Name the parser explicitly (lxml is installed in the first cell) so the parse
# does not depend on whichever parser BeautifulSoup happens to pick.
soup = BeautifulSoup(website_url, 'lxml')

tdot_loc = soup.find('table', {'class': 'wikitable sortable'})
if tdot_loc is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the Wikipedia page")
links = tdot_loc.find_all('td')

# The <td> cells are read as one flat sequence in which every three consecutive
# cells make up a row: postal code, borough, neighbourhood.  Only complete rows
# are kept, and each cell is reduced to its text so that nested markup inside a
# cell cannot break the string clean-up performed in the next cell.
n_rows = len(links) // 3
cell_text = [cell.get_text() for cell in links[:3 * n_rows]]

# One single-column frame (column label 0) per field, as the next cell expects.
roughpst = pd.DataFrame(cell_text[0::3])
bor = pd.DataFrame(cell_text[1::3])
neigh = pd.DataFrame(cell_text[2::3])

### Removing \n in each string of the table

In [318]:
def _strip_newlines(column):
    """Return the values of ``column`` as plain strings with every newline removed."""
    return [str(value).replace('\n', '') for value in column]


# Clean every scraped row.  Iterating over the columns themselves (rather than
# over a hard-coded row count) keeps the cell working when the table grows or
# shrinks.
post_1 = _strip_newlines(roughpst[0])
post_2 = _strip_newlines(bor[0])
post_3 = _strip_newlines(neigh[0])

data1 = {'Postal Code': post_1, 'Borough': post_2, 'Neighbourhood': post_3}

tdot_table = pd.DataFrame(data1)

tdot_table

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Removing rows whose postal codes are not assigned to a borough

In [319]:
# Keep only the rows whose borough has been assigned.  The notebook assumes
# that when the borough is not assigned the neighbourhood is not assigned
# either, so filtering on the borough alone is enough.
assigned = tdot_table['Borough'] != 'Not assigned'

# A boolean mask replaces the row-by-row copy; the index is reset so the
# filtered table is numbered 0..n-1 like a freshly built frame.
real_tdot_table = (
    tdot_table.loc[assigned, ['Postal Code', 'Borough', 'Neighbourhood']]
    .reset_index(drop=True)
)

### Table of Toronto Postal Codes (with unassigned postal codes removed)

In [320]:
# Display the table built in the previous cell (rows with an unassigned borough removed).
real_tdot_table

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Loading and displaying the geographical coordinates of each postal code

In [321]:
# Read the coordinate CSV from the URL into a DataFrame; the output below shows
# the columns Postal Code, Latitude and Longitude.
geospatial_coor = pd.read_csv("http://cocl.us/Geospatial_data")
# Display the loaded coordinates.
geospatial_coor

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Linking latitude and longitude to Toronto table

In [322]:
# Attach latitude/longitude to each Toronto postal code with a join on
# 'Postal Code'.  This replaces a hand-rolled nested search that hard-coded
# the size of the coordinate table and never terminated when a postal code
# had no coordinates.

# Keep the first coordinate entry per postal code so the join cannot
# duplicate rows of the Toronto table.
coords = (
    geospatial_coor[['Postal Code', 'Latitude', 'Longitude']]
    .drop_duplicates(subset='Postal Code', keep='first')
)

# An inner join keeps the row order of the left (Toronto) table and yields a
# fresh 0..n-1 index.
p2_tdot_table = real_tdot_table[['Postal Code', 'Borough', 'Neighbourhood']].merge(
    coords, on='Postal Code', how='inner'
)

# Report postal codes that were dropped because no coordinates exist for them.
unmatched = sorted(set(real_tdot_table['Postal Code']) - set(p2_tdot_table['Postal Code']))
if unmatched:
    print('No coordinates found for postal codes:', unmatched)

### Toronto Table with Latitude and Longitude

In [323]:
# Display the Toronto table with the Latitude and Longitude columns attached.
p2_tdot_table

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Map of Greater Toronto Area

In [324]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium   # map rendering; the notebook used it without importing it

# creating a map of Toronto and surrounding area, centred on the first row of the table
# NOTE(review): the original centred on `tdot_clus`, which no visible cell defines;
# presumably it held the coordinates of p2_tdot_table -- confirm.
map_tdot = folium.Map(
    location=[p2_tdot_table['Latitude'].iloc[0], p2_tdot_table['Longitude'].iloc[0]],
    zoom_start=11,
)

# add one marker per postal code
for lat, lng, borough, neighborhood in zip(p2_tdot_table['Latitude'], p2_tdot_table['Longitude'], p2_tdot_table['Borough'], p2_tdot_table['Neighbourhood']):
    # Label each marker with its own neighbourhood and borough (previously the
    # whole DataFrame and the first borough were formatted into every popup).
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], radius=5, popup=label, color='blue',fill=True, fill_color='#3186cc', fill_opacity=0.7,parse_html=False).add_to(map_tdot)

map_tdot

### Map of Greater Toronto Area with k-Means Clustering applied to the map

In [325]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Imported here as well so the cell does not depend on names leaked from other cells.
import folium
from sklearn.cluster import KMeans

# run k-means clustering on the coordinates
kclusters = 5
# NOTE(review): `tdot_clus` is not defined in any visible cell; presumably it was
# the latitude/longitude columns of p2_tdot_table -- confirm before relying on it.
tdot_clus = p2_tdot_table[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tdot_clus)

# add clustering labels to a copy, so p2_tdot_table is left untouched and the
# cell can be rerun without failing on an already-existing column
tdot_merge = p2_tdot_table.copy()
tdot_merge.insert(0, 'Cluster Labels', kmeans.labels_)

# create map centred on the first row
map_tdot_clusters = folium.Map(
    location=[tdot_clus['Latitude'].iloc[0], tdot_clus['Longitude'].iloc[0]],
    zoom_start=11,
)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(tdot_merge['Latitude'], tdot_merge['Longitude'], tdot_merge['Neighbourhood'], tdot_merge['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # `cluster - 1` is kept from the original: label 0 wraps around to the last
    # colour, so every cluster still gets its own distinct colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_tdot_clusters)

map_tdot_clusters