# Segmenting and Clustering Neighborhoods in Toronto

Part 1

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests # library to handle requests
from bs4 import BeautifulSoup

In [2]:
# define the corresponding URL
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# send the GET Request and examine the results
results = requests.get(url).text

In [4]:
soup = BeautifulSoup(results, 'lxml')

In [5]:
table = soup.find_all('table')[0]

In [6]:
# define the dataframe columns
column_names = ['Postalcode','Borough','Neighborhood'] 

# instantiate the dataframe
df = pd.DataFrame(columns=column_names)

In [7]:
for tr in table.find_all('tr'):
    row = []
    for td in tr.find_all('td'):
        row.append(td.text.strip())
    if len(row):
        df.loc[len(df)] = row

In [8]:
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [9]:
# ignore cells with a borough that is Not assigned
df = df[df['Borough'] != 'Not assigned']

In [10]:
# replacing the "Not assigned" value in Neighbourhood
df.loc[df['Neighborhood'] == ('Not assigned'), 'Neighborhood'] = df['Borough']
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [11]:
df_postcode = df.groupby(['Postalcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df2 = pd.DataFrame(df_postcode)
df3 = df2.reset_index()
df3.head()

Unnamed: 0,index,Postalcode,Borough,Neighborhood
0,0,M1B,Scarborough,"Rouge, Malvern"
1,1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,3,M1G,Scarborough,Woburn
4,4,M1H,Scarborough,Cedarbrae


In [12]:
df3.rename(columns={'Neighborhood':'Neighborhoodlist'},inplace=True)
df4 = pd.merge(df, df3, on='Postalcode')
df4.drop(['Neighborhood'],axis=1,inplace=True)
df4.drop_duplicates(inplace=True)
df4.rename(columns={'Neighborhoodlist':'Neighborhood'},inplace=True)
df4.head()

Unnamed: 0,Postalcode,Borough_x,index,Borough_y,Neighborhood
0,M3A,North York,25,North York,Parkwoods
1,M4A,North York,34,North York,Victoria Village
2,M5A,Downtown Toronto,53,Downtown Toronto,Harbourfront
3,M6A,North York,71,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Downtown Toronto,85,Downtown Toronto,Queen's Park


In [13]:
df4.drop(['Borough_y'],axis=1,inplace=True)
df4.rename(columns={'Borough_x':'Borough'},inplace=True)
df4.head()

Unnamed: 0,Postalcode,Borough,index,Neighborhood
0,M3A,North York,25,Parkwoods
1,M4A,North York,34,Victoria Village
2,M5A,Downtown Toronto,53,Harbourfront
3,M6A,North York,71,"Lawrence Heights, Lawrence Manor"
5,M7A,Downtown Toronto,85,Queen's Park


In [14]:
df4.drop(['index'],axis=1,inplace=True)
df4.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Downtown Toronto,Queen's Park


In [15]:
df4.shape

(103, 3)

Part 2

In [16]:
!pip install geocoder



In [17]:
import geocoder # import geocoder

In [18]:
def get_latling(postal_code):
    
    # initialize your variable to None
    lat_lng_coords = None
    
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    return latitude, longtitude

In [19]:
geo = pd.read_csv('http://cocl.us/Geospatial_data')

In [20]:
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
geo_merge = pd.merge(geo, df4, left_on='Postal Code', right_on='Postalcode')
geo_data=geo_merge[['Postalcode','Borough','Neighborhood','Latitude','Longitude']]
geo_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Part 3

In [22]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.21.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [23]:
CLIENT_ID = 'K5FZAUUEYNOXBCHWGT402FDPV3MXD2EMTMUB4AMZHFIKNNA0'
CLIENT_SECRET = '2XATBMMXQYQ1ZZMXIENPOPXZJCV31MWXCB3S4RCKAMCJPDE5'
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: K5FZAUUEYNOXBCHWGT402FDPV3MXD2EMTMUB4AMZHFIKNNA0
CLIENT_SECRET:2XATBMMXQYQ1ZZMXIENPOPXZJCV31MWXCB3S4RCKAMCJPDE5


In [24]:
address = 'Toronto'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.653963 -79.387207


In [33]:
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13) # generate map centred around the Conrad Hotel

# add a red circle marker to represent the Conrad Hotel
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Conrad Hotel',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the Italian restaurants as blue circle markers
for lat, lng, label in zip(geo_data['Latitude'], geo_data['Longitude'], geo_data['Neighborhood']):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map