# Segmenting and Clustering Neighborhoods in Toronto
### Import Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import urllib
import urllib.request
import os
import pandas as pd
import csv
import numpy as np

### Create function

In [2]:
def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage,"html.parser")
    return soupdata

### Request Data

In [3]:
#source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = make_soup('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

### Find Data
Find rows and columns, Create header, Create csv file

In [4]:
postcode=postcodesaved=""
table = soup.find_all('table')[0] # Grab the first table

for record in table.findAll('tr'):
    postcode=""
    for data in record.findAll('td'):
        postcode=postcode+","+data.text
    postcodesaved = postcodesaved + "\n" + postcode[1:]

header="Postcode,Borough,Neighbourhood"   
file=open(os.path.expanduser('PostCodes.csv'),"wb")
file.write(bytes(header,encoding='ascii',errors='ignore'))
file.write(bytes(postcodesaved,encoding='ascii',errors='ignore'))

9027

### Read csv file show Dataframe

In [5]:
df=pd.read_csv('PostCodes.csv')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Ignore cells with a borough that is Not assigned

In [6]:
df2=df[df.Borough != 'Not assigned']
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Change Neighbourhood 'Not assigned' by Borough name

In [7]:
df3=df2.replace('Not assigned', df2.Borough)
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Confirming there is no more 'Not assigned'

In [8]:
df3.loc[df3['Neighbourhood']== 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


###  Combine two rows into one row with the neighborhoods separated with a comma

In [9]:
df5=df2.groupby(['Postcode','Borough'], as_index=False, sort=False).agg(','.join)
df5.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


### Do a check to verify count of Neighbourhoods per Postcode
Selecting a random Postcodes from below and compare the quantities from above

In [10]:
check=df2.groupby('Postcode').nunique()
check.head()

Unnamed: 0_level_0,Postcode,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M1B,1,1,2
M1C,1,1,3
M1E,1,1,3
M1G,1,1,1
M1H,1,1,1


In [11]:
df5.shape

(103, 3)

### Load Geographical Coordinates file
Generate Dataframe

In [12]:
dfgeo=pd.read_csv('Geo_Coor.csv')
dfgeo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge Dataframes

In [13]:
dfout=pd.merge(dfgeo,df5)
dfout.head(11)

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge,Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
5,M1J,43.744734,-79.239476,Scarborough,Scarborough Village
6,M1K,43.727929,-79.262029,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,43.711112,-79.284577,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,43.716316,-79.239476,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,43.692657,-79.264848,Scarborough,"Birch Cliff,Cliffside West"


### Import libraries for mapping

In [16]:
import json 
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

### Latitude and Longitude of Toronto

In [18]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Generate Map of Toronto

In [42]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(dfout['Latitude'], dfout['Longitude'], dfout['Borough'], dfout['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto