# Segmenting and Clustering Neighborhoods

## Import Data

In [69]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests

In [65]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [63]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [70]:
from bs4 import BeautifulSoup

In [71]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [14]:
pip install geopy

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/53/fc/3d1b47e8e82ea12c25203929efb1b964918a77067a874b2c7631e2ec35ec/geopy-1.21.0-py2.py3-none-any.whl (104kB)
[K     |████████████████████████████████| 112kB 8.7MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.21.0
Note: you may need to restart the kernel to use updated packages.


In [72]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

# conda install -c conda-forge geopy 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# import k-means from clustering stage
from sklearn.cluster import KMeans

# conda install -c conda-forge folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [73]:
# gather HTML data via request

data = requests.get('https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050').text
#data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

# parse the data from html into a Beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

In [28]:
# To store data
postalCodeList = []
boroughList = []
neighbourhoodList = []

In [31]:
# append data to each respective list

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    #if 'Not assigned' in cells:
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighbourhoodList.append(cells[2].text.rstrip('\n')) # avoid new lines in neighborhood cell

In [37]:
# create a new DataFrame from the three lists

toronto_df1 = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighbourhood": neighbourhoodList})

toronto_df1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [64]:
## Remove the "Not assigned"

In [39]:
# drop cells with a Borough that is 'Not assigned'

toronto_df2 = toronto_df1.replace('Not assigned', np.nan)
toronto_df2 = toronto_df2.dropna(subset=['Borough'])
toronto_df2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## Groupby Neighborhood Postcodes

In [46]:
# group neighbourhoods in the same borough

toronto_df2_grouped = toronto_df2.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
toronto_df2_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ..."
3,M1G,Scarborough,"Woburn, Woburn"
4,M1H,Scarborough,"Cedarbrae, Cedarbrae"


In [48]:
# when Neighbourhood is 'Not assigned' make the value same as Borough

for index, row in toronto_df2_grouped.iterrows():
    if row["Neighbourhood"] == "Not assigned":
        row["Neighbourhood"] = row["Borough"]
toronto_df2_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ..."
3,M1G,Scarborough,"Woburn, Woburn"
4,M1H,Scarborough,"Cedarbrae, Cedarbrae"


In [49]:
# print the number of rows in the DataFrame

toronto_df3 = toronto_df2_grouped
toronto_df3.shape

(103, 3)

In [50]:
coordinates = pd.read_csv('http://cocl.us/Geospatial_data')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [51]:
# rename column to 'PostalCode' for easy merge

coordinates.rename(columns= {"Postal Code": "PostalCode"}, inplace=True)

In [52]:
# merge coordinates with toronto neighbourhood data set

toronto_df4 = toronto_df3.merge(coordinates, on = "PostalCode", how = "left")
toronto_df4.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ...",43.763573,-79.188711
3,M1G,Scarborough,"Woburn, Woburn",43.770992,-79.216917
4,M1H,Scarborough,"Cedarbrae, Cedarbrae",43.773136,-79.239476


In [55]:
# use geopy to find the geographical coordinates of Toronto (GTA)

address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

## Map of Toronto

In [56]:
# map of Toronto
t_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df4['Latitude'], toronto_df4['Longitude'], toronto_df4['Borough'], toronto_df4['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(t_map)  
    
t_map

## Neighborhoods

In [57]:
borough_names = list(toronto_df4.Borough.unique())

tdot_hoods = []

for x in borough_names:
    if "toronto" in x.lower():
        tdot_hoods.append(x)
        
tdot_hoods

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

In [58]:
# create a new DataFrame with only boroughs that contain the word Toronto
tdot = toronto_df4[toronto_df4['Borough'].isin(tdot_hoods)].reset_index(drop=True)
tdot.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,"The Beaches, The Beaches",43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale, The Danforth Wes...",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar, The Beaches We...",43.668999,-79.315572
3,M4M,East Toronto,"Studio District, Studio District",43.659526,-79.340923
4,M4N,Central Toronto,"Lawrence Park, Lawrence Park",43.72802,-79.38879


In [75]:
# re-create the same map with only Toronto neighbourhoods

map_tdot = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(tdot['Latitude'], tdot['Longitude'], tdot['Borough'], tdot['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_tdot)  
    
map_tdot

In [76]:
# get the name of the first neighbourhood

tdot.loc[0, 'Neighbourhood']

'The Beaches, The Beaches'

In [77]:
neighbourhood_latitude = tdot.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = tdot.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = tdot.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of The Beaches, The Beaches are 43.67635739999999, -79.2930312.


In [78]:
radius = 500
LIMIT = 100

nearby_venues = []