# Segmenting and Clustering Neighborhoods in Toronto

## Scrape Neighbourhood Data From Wikipedia

In [397]:
import matplotlib.cm as cm # colour maps for the cluster map
import matplotlib.colors as colors # colour conversion helpers (rgb2hex)
import numpy as np # library for numerical arrays
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup as bs # library to parse HTML documents
from sklearn.cluster import KMeans # k-means clustering

# Download the Wikipedia page of "M" postal codes and parse it into a soup.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
soup = bs(data, "html5lib")

In [398]:
# Build one record per assigned postal code from the first table on the page.
# Each <td> is read as: postal code from its <p> tag (first three characters)
# and a "Borough(Neighbourhood / Neighbourhood ...)" string from its <span> tag.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

table_contents = []
for row in table.find_all('td'):  # find_all is the current name for findAll
    span_text = row.span.text
    if span_text == 'Not assigned':
        continue  # nothing to record for this postal code

    pieces = span_text.split('(')
    # A cell without "(" names a borough only; keep the row with an empty
    # neighbourhood instead of failing with IndexError.
    raw_neighbourhood = pieces[1] if len(pieces) > 1 else ''
    table_contents.append({
        'Postal Code': row.p.text[:3],
        'Borough': pieces[0],
        'Neighbourhood': (raw_neighbourhood.strip(')')
                          .replace(' /', ',')
                          .replace(')', ' ')
                          .strip(' ')),
    })

# Naming the columns keeps the frame well-formed even when nothing was scraped.
df = pd.DataFrame(table_contents, columns=['Postal Code', 'Borough', 'Neighbourhood'])
# A few cells run the borough together with mailing details; map them to short names.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

In [399]:
# Display the scraped frame (Postal Code, Borough, Neighbourhood).
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [400]:
# (rows, columns) of the scraped frame.
df.shape

(103, 3)

## Combine Address Details into One Column for Geolocator

In [401]:
# Join the descriptive columns into one comma-separated query string per row.
# The columns are named explicitly rather than sliced with df.columns[0:], so
# the cell is idempotent: re-running it does not fold an existing 'Address'
# column (or any column added later) back into the address text.
address_columns = ['Postal Code', 'Borough', 'Neighbourhood']
df['Address'] = df[address_columns].apply(
    lambda row: ','.join(row.dropna().astype(str)), axis=1
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Address
0,M3A,North York,Parkwoods,"M3A,North York,Parkwoods"
1,M4A,North York,Victoria Village,"M4A,North York,Victoria Village"
2,M5A,Downtown Toronto,"Regent Park, Harbourfront","M5A,Downtown Toronto,Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights","M6A,North York,Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government,"M7A,Queen's Park,Ontario Provincial Government"
...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North","M8X,Etobicoke,The Kingsway, Montgomery Road, O..."
99,M4Y,Downtown Toronto,Church and Wellesley,"M4Y,Downtown Toronto,Church and Wellesley"
100,M7Y,East Toronto Business,Enclave of M4L,"M7Y,East Toronto Business,Enclave of M4L"
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...","M8Y,Etobicoke,Old Mill South, King's Mill Park..."


In [402]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

locator = Nominatim(user_agent="myGeocoder")
# Wrap the geocoder so consecutive calls are at least one second apart.
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
df['location'] = df['Address'].apply(geocode)
# Rows with no geocoding result keep None as their point.
df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)

# Expand each point into three numeric columns. Missing points are replaced by
# a NaN triple and the column names are passed explicitly, so the expansion
# always yields three columns even when the first (or every) lookup failed.
coordinate_columns = ['latitude', 'longitude', 'altitude']
missing_point = (float('nan'),) * 3
df[coordinate_columns] = pd.DataFrame(
    [point if isinstance(point, tuple) else missing_point for point in df['point']],
    index=df.index,
    columns=coordinate_columns,
)

In [403]:
# Display the frame with the geocoder columns (location, point, latitude, longitude, altitude).
df 

Unnamed: 0,Postal Code,Borough,Neighbourhood,Address,location,point,latitude,longitude,altitude
0,M3A,North York,Parkwoods,"M3A,North York,Parkwoods","(Parkwoods Village Drive, Don Valley East, Nor...","(43.7578464, -79.3159749, 0.0)",43.757846,-79.315975,0.0
1,M4A,North York,Victoria Village,"M4A,North York,Victoria Village",,,,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront","M5A,Downtown Toronto,Regent Park, Harbourfront",,,,,
3,M6A,North York,"Lawrence Manor, Lawrence Heights","M6A,North York,Lawrence Manor, Lawrence Heights",,,,,
4,M7A,Queen's Park,Ontario Provincial Government,"M7A,Queen's Park,Ontario Provincial Government",,,,,
...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North","M8X,Etobicoke,The Kingsway, Montgomery Road, O...",,,,,
99,M4Y,Downtown Toronto,Church and Wellesley,"M4Y,Downtown Toronto,Church and Wellesley",,,,,
100,M7Y,East Toronto Business,Enclave of M4L,"M7Y,East Toronto Business,Enclave of M4L",,,,,
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...","M8Y,Etobicoke,Old Mill South, King's Mill Park...",,,,,


## Sometimes the geocoder does not fill in the location values because the addresses are ambiguous

### In that case download the following file and run this code

In [404]:
# Fallback table of coordinates per postal code. pandas reads the CSV straight
# from the URL, so the cell runs without a manual download or a local path
# (the previous empty path could not be opened).
GEO_CSV_URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
latlng = pd.read_csv(GEO_CSV_URL)
latlng

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [405]:
# Attach the fallback Latitude/Longitude to each row by postal code.
# Default inner join: a row is kept only when its code appears in both frames.
df = df.merge(latlng, on='Postal Code')
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Address,location,point,latitude,longitude,altitude,Latitude,Longitude
0,M3A,North York,Parkwoods,"M3A,North York,Parkwoods","(Parkwoods Village Drive, Don Valley East, Nor...","(43.7578464, -79.3159749, 0.0)",43.757846,-79.315975,0.0,43.753259,-79.329656
1,M4A,North York,Victoria Village,"M4A,North York,Victoria Village",,,,,,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront","M5A,Downtown Toronto,Regent Park, Harbourfront",,,,,,43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights","M6A,North York,Lawrence Manor, Lawrence Heights",,,,,,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,"M7A,Queen's Park,Ontario Provincial Government",,,,,,43.662301,-79.389494
...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North","M8X,Etobicoke,The Kingsway, Montgomery Road, O...",,,,,,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,"M4Y,Downtown Toronto,Church and Wellesley",,,,,,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,"M7Y,East Toronto Business,Enclave of M4L",,,,,,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...","M8Y,Etobicoke,Old Mill South, King's Mill Park...",,,,,,43.636258,-79.498509


In [406]:
# Discard the geocoder working columns, leaving the merged Latitude/Longitude.
geocoder_columns = ['Address', 'location', 'altitude', 'point', 'latitude', 'longitude']
df = df.drop(columns=geocoder_columns)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Find Geo Coordinates for Toronto

In [407]:
from geopy.geocoders import Nominatim

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="myGeocoder")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the attribute lookups below.
if location is None:
    raise ValueError('Geocoder returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronoto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronoto are 43.6534817, -79.3839347.


## Add Neighbourhoods onto Toronto Map

In [408]:
import folium
# Base map centred on fixed Toronto coordinates.
toronto_centre = [43.6534817, -79.3839347]
map1 = folium.Map(location=toronto_centre, tiles='cartodbpositron', zoom_start=11)

# One default-styled circle marker per row of the frame.
for lat, lng in zip(df['Latitude'], df['Longitude']):
    folium.CircleMarker(location=[lat, lng]).add_to(map1)

map1

## KMeans Clustering Based on Local Neighbourhood Location

### Remove Unwanted Columns

In [409]:
# Keep only the coordinate columns for clustering. The axis is selected by
# keyword because the positional form drop(labels, 1) is rejected by
# pandas 2.0 and later.
toronto_clustering = df.drop(columns=['Postal Code', 'Borough', 'Neighbourhood'])
toronto_clustering

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.654260,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494
...,...,...
98,43.653654,-79.506944
99,43.665860,-79.383160
100,43.662744,-79.321558
101,43.636258,-79.498509


## Perform Clustering on the Geo Coordinates

In [410]:
k = 10
# Fit on the coordinate columns only, so re-running the cell never feeds a
# previously added 'Cluster Labels' column back into the model.
coordinates = toronto_clustering[['Latitude', 'Longitude']]
# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(coordinates)

# insert() raises when the column already exists, so overwrite it on a re-run.
if 'Cluster Labels' in toronto_clustering.columns:
    toronto_clustering['Cluster Labels'] = kmeans.labels_
else:
    toronto_clustering.insert(0, 'Cluster Labels', kmeans.labels_)

In [411]:
# Display the coordinates together with their assigned cluster label.
toronto_clustering

Unnamed: 0,Cluster Labels,Latitude,Longitude
0,1,43.753259,-79.329656
1,1,43.725882,-79.315572
2,5,43.654260,-79.360636
3,0,43.718518,-79.464763
4,5,43.662301,-79.389494
...,...,...,...
98,2,43.653654,-79.506944
99,5,43.665860,-79.383160
100,1,43.662744,-79.321558
101,2,43.636258,-79.498509


## Map Local Neighbourhoods

In [412]:
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# set color scheme for the clusters: k evenly spaced rainbow colours as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(toronto_clustering['Latitude'],
                             toronto_clustering['Longitude'],
                             toronto_clustering['Cluster Labels']):
    # KMeans labels run from 0 to k-1, so they index the palette directly;
    # the former `cluster-1` made cluster 0 wrap around to the last colour.
    marker_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(' Cluster ' + str(cluster), parse_html=True),
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters