# Study of Toronto Boroughs

## Set dependencies

In [None]:
# IBM Data Science
# https://github.com/learningsam20/IBMDataScience
# https://www.coursera.org/learn/applied-data-science-capstone/peer/I1bDq/segmenting-and-clustering-neighborhoods-in-toronto
#
from bs4 import BeautifulSoup
import geocoder
import pandas as pd
import urllib3
import requests
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [None]:
! pip install beautifulsoup4
! pip install geocoder
! pip install pandas
! pip install urllib3
! pip install requests
! pip install geopy
! pip install folium
! pip install sklearn
! pip install numpy
! pip install matplotlib

# Generate dataframe for processing

The notebook follows this general procedure:
* Scrape the webpage from Wiki
* Transform the unstructured data into dataframe with relevant attributes
* Clean the data and merge the neighborhoods that share a postal code
* Get the geo-coordinates for the boroughs
* Create clusters based on the borough geo-coordinates
* Visualize the clusters on the geographical map

## Scrape data from wiki page

In [None]:
# Pandas display settings: never truncate columns or rows when showing a frame.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Download the Wikipedia list of Canadian postal codes (the "M" page) and decode
# the raw response bytes to text so it can be handed to the HTML parser.
wikiurl = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
http = urllib3.PoolManager()
response = http.request('GET', wikiurl)
html = response.data.decode('utf-8')

## Parse DOM to populate the dataframe

In [None]:
# Parse the downloaded page. Naming the parser explicitly avoids bs4's
# "no parser was explicitly specified" warning and keeps the parse result the
# same regardless of which optional parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")

# Locate the first table with class "wikitable" and collect all of its rows.
wikitable = soup.find("table", attrs={"class": "wikitable"})
if wikitable is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("No table with class 'wikitable' was found in the page")
datatable = wikitable.find_all("tr")

# Build one record per usable table row.
alldata = []
for r in datatable:
    d = r.find_all("td")
    # Rows without <td> cells (e.g. the header) or with fewer than the three
    # expected cells cannot be mapped to PostalCode/Borough/Neighborhood.
    if len(d) < 3:
        continue
    postal_code, borough, neighborhood = (cell.text.strip() for cell in d[:3])
    # Only include rows whose Borough is assigned.
    if borough == "Not assigned":
        continue
    # Copy Borough to Neighborhood if the latter is not assigned or empty.
    if neighborhood in ("Not assigned", ""):
        neighborhood = borough
    alldata.append({
        "PostalCode": postal_code,
        "Borough": borough,
        "Neighborhood": neighborhood,
    })

# Passing `columns` fixes the column order and keeps the three columns present
# even when no rows were parsed.
df = pd.DataFrame(alldata, columns=["PostalCode", "Borough", "Neighborhood"])
print("dataframe loaded with " + str(len(df)) + " records")

## Merge multiple neighborhoods that share the same postal code and borough

In [None]:
# Collapse rows that share a PostalCode and Borough into a single row whose
# Neighborhood is the comma-separated list of the group's neighborhoods.
# Selecting the column and aggregating with ", ".join always yields exactly the
# three columns PostalCode, Borough, Neighborhood; the previous
# groupby(..., as_index=False).apply(...).reset_index() produced a different
# number of columns depending on the pandas version, which broke the fixed
# three-name column assignment.
dfg = (
    df.groupby(["PostalCode", "Borough"])["Neighborhood"]
    .agg(", ".join)
    .reset_index()
)
print(dfg.head())
print(dfg.shape)

## Get the geo coordinates for various postal codes

In [None]:
# load geocoder
geocsvurl="https://cocl.us/Geospatial_data"
dfc=pd.read_csv(geocsvurl)
print(dfc.head())

## Merge geo coordinates with original dataframe

In [None]:
# append latitude, longitude to the existing dataframe
dfm=pd.merge(dfg,dfc,left_on="PostalCode",right_on="Postal Code",how="left").drop("Postal Code",axis=1)
print(dfm.head())

## Subset the dataframe to boroughs whose name contains "Toronto"

In [None]:
# Get central address
address = 'Toronto, Canada'

# include only those boroughs that contain word Torronto
dft=dfm[dfm["Borough"].str.contains("Toronto")]

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

## Plot the geography map to visualize the boroughs

In [None]:
# Create the base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per postal-code row, labelled "<neighborhoods>, <borough>".
for lat, lng, borough, neighborhood in zip(dft['Latitude'], dft['Longitude'], dft['Borough'], dft['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes the popup treat the label as text rather than markup.
    label = folium.Popup(label, parse_html=True)
    # `parse_html` is a Popup option, not a CircleMarker one, so it is no longer
    # passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto

## Cluster the neighborhoods

In [None]:
# Number of clusters: one per distinct borough, but never fewer than 5.
kclusters = max(5, dft["Borough"].nunique())

# Feature matrix for clustering: everything except the descriptive text columns.
# "Cluster" is dropped as well (ignored if absent) so that re-running this cell
# does not feed the previous labels back in as a feature. `columns=` replaces
# the positional axis argument, which pandas 2.0 no longer accepts.
toronto_grouped_clustering = dft.drop(
    columns=["Neighborhood", "Borough", "PostalCode", "Cluster"],
    errors="ignore",
)

# Run k-means clustering. n_init is pinned to 10 (the historical default) so
# the result does not depend on the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# Save the cluster label generated for each row. assign() returns a new frame,
# which avoids writing into what may be a slice of another dataframe.
dft = dft.assign(Cluster=kmeans.labels_)

## Plot the established clusters on map and visualize

In [None]:
# Create the base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Colour scheme: one evenly spaced colour from the rainbow colormap per cluster,
# converted to hex strings that folium accepts.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one marker per row, filled with the colour of the row's cluster.
for lat, lon, poi, cluster in zip(dft['Latitude'], dft['Longitude'], dft['Neighborhood'], dft['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        fill=True,
        # Cluster labels are 0-based, so they index the colour list directly
        # (the previous `cluster-1` sent cluster 0 to the last colour).
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the map inline.
map_clusters