## Segmenting and Clustering Neighborhoods in Toronto
### Applied Data Science Capstone Week 3 Peer-Graded Assignment

In [1]:
import requests

In [2]:
# Download the Wikipedia page that lists the "M" postal codes. A timeout keeps
# the cell from hanging forever, and raise_for_status() stops the notebook on
# an HTTP error instead of letting an error page be parsed further down.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
website_url = response.text  # despite the name, this holds the page HTML, not a URL

## Scrape Wikipedia

In [3]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(website_url, 'lxml')

# Take the first <table> whose class attribute is 'wikitable sortable',
# then collect every <tr> element inside it.
My_table = soup.find('table', attrs={'class': 'wikitable sortable'})
links = My_table.find_all('tr')

In [4]:
# NOTE(review): this cell repeats the table lookup already done in the cell
# above; it recomputes the same My_table / links values from soup.
My_table = soup.find('table', attrs={'class': 'wikitable sortable'})
links = My_table.find_all('tr')

### Locate Table and use tags to find postal code by Borough and Neighbourhood

In [5]:
import pandas as pd
import numpy as np

# One list per <tr>: the stripped text of each of its <td> cells. A row with
# no <td> cells (presumably the header row) yields an empty list.
data = [[cell.text.strip() for cell in tr.find_all('td')] for tr in links]

df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
# Rows that produced no cells have a missing PostalCode; drop them.
df = df.dropna(subset=['PostalCode'])

print(df.shape)
print(df.columns)
df.head(10)

(288, 3)
Index(['PostalCode', 'Borough', 'Neighbourhood'], dtype='object')


Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


### Remove "Not assigned" and then Aggregate

In [6]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
df = df[df['Borough'] != 'Not assigned']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


### Combining the rows for same postal code

In [7]:
# One row per (PostalCode, Borough) pair; the neighbourhood names of the rows
# in each group are joined into a single comma-separated string.
df_pbn = (
    df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)
df_pbn.columns = ['PostalCode', 'Borough', 'Neighbourhood']
df_pbn

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Replacing "Not assigned" Neighbourhood values with the Borough name

In [8]:
# A row whose Neighbourhood is 'Not assigned' takes the name of its Borough.
# Assigning through .loc on df_pbn itself avoids the in-place replace on a
# column selection (which may not write back to the frame) and avoids
# hard-coding a single borough name.
unassigned = df_pbn['Neighbourhood'] == 'Not assigned'
df_pbn.loc[unassigned, 'Neighbourhood'] = df_pbn.loc[unassigned, 'Borough']
df_pbn

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Final Shape of the table

In [9]:
# (number of rows, number of columns) of the aggregated postal-code table.
df_pbn.shape

(103, 3)

In [10]:
# Load the postal-code coordinates CSV directly from its URL.
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
# Rename the three columns so the key column is called 'PostalCode', the same
# name used in df_pbn.
df_geo.columns = ['PostalCode', 'Latitude', 'Longitude']

In [11]:
# Attach coordinates to each postal code; the inner join keeps only the codes
# present in both tables.
df_pos = df_pbn.merge(df_geo, how='inner', on='PostalCode')

# Reorder the columns and work on an independent copy.
df_tor = df_pos.loc[:, ['Borough', 'Neighbourhood', 'PostalCode', 'Latitude', 'Longitude']].copy()
df_tor.head()

Unnamed: 0,Borough,Neighbourhood,PostalCode,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [12]:
pip install geopy


Note: you may need to restart the kernel to use updated packages.


### Geographical coordinates of the City of Toronto

In [13]:
from geopy.geocoders import Nominatim
address = 'Toronto, Canada'

# Nominatim needs an application-specific user agent: geopy 1.x only warns
# when it is missing, geopy 2.x refuses to build the geocoder without one.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the address cannot be resolved.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  after removing the cwd from sys.path.


The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


### Explore and cluster the neighborhoods in Toronto

In [14]:
# create map of Toronto using latitude and longitude values
import folium
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of df_tor, with a "neighbourhood, borough" popup
marker_columns = df_tor[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

### Define Foursquare Credentials and Version

In [15]:
import os

# Read the Foursquare credentials from the environment so that real keys are
# never stored in the notebook. The 'secret' placeholder is kept as the
# fallback when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'secret') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'secret') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Never print the credential values: cell output is saved with the notebook.
print('Your credentails:')
print('CLIENT_ID: ' + '<hidden>')
print('CLIENT_SECRET:' + '<hidden>')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


#### Select only Toronto Neighbourhoods


In [16]:
# Rows whose Borough name contains the text 'Toronto'.
df_t4 = df_tor.loc[df_tor['Borough'].str.contains('Toronto')]

# Renumber the remaining rows from 0 and discard the old index.
to_data = df_t4.reset_index(drop=True)
to_data

Unnamed: 0,Borough,Neighbourhood,PostalCode,Latitude,Longitude
0,East Toronto,The Beaches,M4E,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",M4K,43.679557,-79.352188
2,East Toronto,"The Beaches West, India Bazaar",M4L,43.668999,-79.315572
3,East Toronto,Studio District,M4M,43.659526,-79.340923
4,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879
5,Central Toronto,Davisville North,M4P,43.712751,-79.390197
6,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
7,Central Toronto,Davisville,M4S,43.704324,-79.38879
8,Central Toronto,"Moore Park, Summerhill East",M4T,43.689574,-79.38316
9,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",M4V,43.686412,-79.400049


#### Map of Toronto Neighbourhoods only

In [17]:
map_neigh = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one marker per row of to_data to map_neigh, the map displayed by this
# cell (the markers were previously attached to map_toronto, which left
# map_neigh empty)
for lat, lng, borough, neighborhood in zip(to_data['Latitude'], to_data['Longitude'], to_data['Borough'], to_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_neigh)

map_neigh

#### Explore the first neighbourhood

In [18]:
# Neighbourhood name stored in the row labelled 0 of to_data.
to_data.loc[0, 'Neighbourhood']

'The Beaches'

#### Get the neighborhood's latitude and longitude values.

In [19]:
# Scalar lookups in the row labelled 0: coordinates first, then the name.
neighbourhood_latitude = to_data.at[0, 'Latitude']
neighbourhood_longitude = to_data.at[0, 'Longitude']
neighbourhood_name = to_data.at[0, 'Neighbourhood']

print(
    'Latitude and longitude values of {} are {}, {}.'.format(
        neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude
    )
)

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


#### Let's see what the top 100 venues are within a radius of 500 meters from the centroid
#### First, let's create the GET request URL. Name your URL url.

In [20]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# Query-string template for the Foursquare "explore" endpoint; placeholders, in
# order: client id, client secret, API version, latitude, longitude, radius, limit.
url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'

url = url_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighbourhood_latitude,
    neighbourhood_longitude,
    radius,
    LIMIT)

# Display the URL with the credentials masked: echoing `url` itself would store
# the client id and secret in the saved notebook output.
url_template.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    neighbourhood_latitude,
    neighbourhood_longitude,
    radius,
    LIMIT)

'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'


#### Send the GET request and examine the results

In [21]:
# Call the Foursquare endpoint. The timeout bounds how long the cell can block,
# and raise_for_status() surfaces HTTP errors here rather than as a confusing
# KeyError when the response is unpacked later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5d979a1cd9a6e60037d04ed6'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 6,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

Let's borrow the get_category_type function from the Foursquare lab.

In [22]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by key (for example a dict or a pandas Series).
        The category list is read from ``row['categories']``; when that key
        is absent, ``row['venue.categories']`` is used instead.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the
    category list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being silently swallowed.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

Now we are ready to clean the json and structure it into a pandas dataframe.

In [23]:
# json_normalize is exposed as pandas.json_normalize since pandas 1.0 and the
# pandas.io.json import path is deprecated; fall back to the old location only
# on older pandas versions.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# list of recommended items from the first group of the Foursquare response
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the part after the last dot ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Glen Stewart Park,Park,43.675278,-79.294647
4,Glen Stewart Ravine,Other Great Outdoors,43.6763,-79.294784
5,Upper Beaches,Neighborhood,43.680563,-79.292869


In [24]:
# Report the number of rows in nearby_venues, i.e. the venues kept from the response.
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.
