# Segmenting and Clustering Neighborhoods in Toronto

## Part 1: Transforming data from Wikipedia to a pandas dataframe

In [2]:
#Install and import the necessary libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import json
from pandas.io.json import json_normalize
!conda install -c anaconda requests --yes
import requests
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
!conda install -c conda-forge beautifulsoup4
from bs4 import BeautifulSoup
!conda install -c conda-forge folium=0.5.0 --yes
print('Libraries loaded!')

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - requests


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |             main           3 KB  anaconda
    _py-xgboost-mutex-2.0      |            cpu_0           9 KB  anaconda
    _pytorch_select-0.1        |            cpu_0           2 KB  anaconda
    absl-py-0.10.0             |           py37_0         169 KB  anaconda
    aiohttp-3.6.3       

ModuleNotFoundError: No module named 'folium'

In [4]:
# Obtaining Wikipedia data using a GET request and BeautifulSoup4.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
wiki_response = requests.get(WIKI_URL, timeout=30)
# Fail loudly on HTTP errors instead of silently parsing an error page.
wiki_response.raise_for_status()
wiki_data = wiki_response.text
wiki_soup = BeautifulSoup(wiki_data, 'lxml')

In [5]:
# Column headers for the postal-code table
New_columns = [
    'Postal Code',
    'Borough',
    'Neighborhood',
]

# Start from an empty dataframe that carries only those headers
df = pd.DataFrame(columns=New_columns)
df

Unnamed: 0,Postal Code,Borough,Neighborhood


In [6]:
# Locating the postal-code table in the parsed page.
# Matching on the "wikitable" class (rather than taking whatever <table> comes
# first) keeps the lookup stable if other tables are added above it.
wiki_table = wiki_soup.find('table', class_='wikitable')
if wiki_table is None:
    raise ValueError('No table with class "wikitable" found in the downloaded page')
wiki_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue, Humber Valley Village
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3B
</td>
<td

In [7]:
# Looping through the table and collecting every three-cell data row.
# Rows without exactly three <td> cells (e.g. the <th> header row) are skipped.
table_rows = []
for tr in wiki_table.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    if len(cells) == 3:
        table_rows.append(cells)

# Build the dataframe in one step instead of appending to it row by row:
# re-running this cell now rebuilds df rather than duplicating its rows.
df = pd.DataFrame(table_rows, columns=New_columns)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
# Keep only the rows whose borough is something other than 'Not assigned'
has_borough = df['Borough'] != 'Not assigned'
df_new = df[has_borough]
df_new.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# Collapse to one row per postal code: keep the first borough seen and
# join the neighborhood names with ', '
aggregations = {'Borough': 'first', 'Neighborhood': ', '.join}
grouped_by_code = df_new.groupby('Postal Code')
df_new = grouped_by_code.agg(aggregations).reset_index()
df_new

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [10]:
# Show the dataframe size as (number of rows, number of columns)
n_rows, n_cols = df_new.shape
(n_rows, n_cols)

(103, 3)

## Part 2: Getting the coordinates of each Neighborhood

In [11]:
# Load the coordinates lookup (Postal Code, Latitude, Longitude) from the hosted CSV
geospatial_url = 'http://cocl.us/Geospatial_data'
df_coor = pd.read_csv(geospatial_url)
df_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Combining the neighborhood dataset with the coordinates dataset.
# df_new is merged into itself, so coordinate columns left over from an
# earlier run are dropped first; without this a re-run would produce
# Latitude_x/Latitude_y and Longitude_x/Longitude_y columns.
df_new = (
    df_new
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    # validate raises if a postal code is duplicated on either side, instead
    # of silently multiplying rows.
    .merge(df_coor, on='Postal Code', validate='one_to_one')
)
df_new.head(103)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


## Part 3: Clustering the neighborhoods in Toronto

In [13]:
# Restrict the dataset to the 'Downtown Toronto' borough and renumber the rows
is_downtown = df_new['Borough'] == 'Downtown Toronto'
df_to = df_new[is_downtown].reset_index(drop=True)
df_to.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752


In [3]:
!pip install folium

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 5.0 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [14]:
# Obtaining the geographical coordinates of Toronto
import folium
address = 'Toronto'
geolocator = Nominatim(user_agent="foursquare_agent")
# An explicit timeout avoids spurious failures from geopy's short default.
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when nothing matches; report that clearly instead of
# failing later with an AttributeError on None.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print("The latitude and longitude coordinates for Toronto is:", (latitude,longitude))

The latitude and longitude coordinates for Toronto is: (43.6534817, -79.3839347)


In [15]:
# Creating a map of Toronto centered on the geocoded coordinates and marking
# each Downtown Toronto neighborhood on it
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Adding one circle marker per row of df_to
for lat, lng, borough, neighborhood in zip(df_to['Latitude'], df_to['Longitude'], df_to['Borough'], df_to['Neighborhood']):
    # parse_html belongs on the Popup (it escapes the label text); it is not a
    # CircleMarker option, so it is no longer passed there.
    label = folium.Popup('{},{}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=label,
        color='teal',
        fill=True,
        fill_color='#3186cf',
        fill_opacity=0.9).add_to(map_Toronto)
map_Toronto

In [16]:
# Defining Foursquare credentials and version.
# Secrets are read from environment variables so they are never stored in the
# notebook: set FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET and
# FOURSQUARE_ACCESS_TOKEN before starting the kernel.
# NOTE(review): keys that were previously hard-coded in this cell should be
# treated as compromised and rotated.
import os
import warnings

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '')
VERSION = '20180604'
LIMIT = 30

# Warn (rather than fail) so cells that do not call Foursquare still run.
_missing_credentials = [
    env_name
    for env_name, value in (
        ('FOURSQUARE_CLIENT_ID', CLIENT_ID),
        ('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET),
        ('FOURSQUARE_ACCESS_TOKEN', ACCESS_TOKEN),
    )
    if not value
]
if _missing_credentials:
    warnings.warn('Foursquare credentials not set: ' + ', '.join(_missing_credentials))

In [17]:
# Installing scikit-learn and importing its KMeans estimator for clustering.
# The conda command is re-run every time this cell executes; the import on the
# following line needs scikit-learn to be importable in the kernel's
# environment whether or not that install step succeeds.
!conda install -c conda-forge scikit-learn --yes
from sklearn.cluster import KMeans

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: \ 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
                                                                                                                     \failed

UnsatisfiableError: The following specifications were found
to be incompatible with the existing python installation in your environment:

Specifications:

  - cffi -> python[version='2.7.*|3.5.*|3.6.*|3.6.12|3.6.12|>=3.6,<3.7.0a0|>=3.7,<3.8.0a0|>=3.9,<3.10.0a0|>=3.8,<3.9.0a0|3.7.9|3.6.9|3.6.9|3.6.9|>=2.7,<2.8.0a0|3.6.9|>=3.5,<3.6.0a0|3.4.*',build='2_73_p

In [21]:
# Clustering the Downtown Toronto neighborhoods into 5 clusters using K-means
df_to = df_new[df_new['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
k = 5
# Remove the label columns so that only the remaining columns are used as
# features. drop(columns=...) replaces the positional axis argument
# (`drop(labels, 1)`), which is deprecated and rejected by newer pandas.
df_to = df_to.drop(columns=['Postal Code', 'Borough', 'Neighborhood'])
# random_state is fixed so the cluster assignment is reproducible.
KMeans_to = KMeans(n_clusters=k, random_state=0).fit(df_to)
KMeans_to.labels_

array([2, 2, 2, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 3, 4, 0, 0, 1, 3],
      dtype=int32)

In [22]:
# Merging the KMeans labels into the Downtown Toronto dataset for visualization.
# df_to is rebuilt from df_new first, so the insert below always starts from a
# frame that has no 'Cluster number' column yet.
df_to = df_new[df_new['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
# The fitted model is named KMeans_to; the previous reference to KMeans_DT was
# an undefined name and raised NameError on a fresh kernel.
df_to.insert(0, 'Cluster number', KMeans_to.labels_)
df_to

Unnamed: 0,Cluster number,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,2,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,2,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
5,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,0,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752


In [23]:
# Visualizing the resulting clusters
import matplotlib.cm as cm
import matplotlib.colors as colors
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# Setting the color scheme: one evenly spaced rainbow color per cluster,
# converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Adding one marker per neighborhood, colored by its cluster
for lat, lon, poi, cluster in zip(df_to['Latitude'], df_to['Longitude'], df_to['Neighborhood'], df_to['Cluster number']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly; the
    # previous `cluster-1` colored cluster 0 via negative-index wraparound.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.9).add_to(map_clusters)
map_clusters