# Coursera Capstone Project
## Week 3 Test 3: Segmenting and Clustering Neighborhoods in Toronto
<p>Focus on boroughs whose names contain "Toronto". Cluster the postal codes by the number of bookstores around each one.</p>

In [77]:
# Download the source files from the Internet once (uncomment the lines below). The rest of this notebook reads these local copies.
#!wget -q -O 'canada_data.html' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
#!wget -q -O 'Geospatial_Coordinates.csv' http://cocl.us/Geospatial_data

In [78]:
#Scrape Wikipedia to retrieve Toronto's postal code and related info.
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
# Parse the locally saved Wikipedia page. The parser is named explicitly so the
# result does not depend on which optional parser packages are installed, and
# the encoding is fixed so the read does not depend on the platform default.
with open('canada_data.html', encoding='utf-8') as html_doc:
    soup = BeautifulSoup(html_doc, 'html.parser')
column_names = ['PostalCode', 'Borough', 'Neighborhood']
tbody = soup.tbody  # first <tbody> element found in the document
# Collect the rows in a plain list and build the frame once at the end:
# growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas 2.x.
records = []
for tr in tbody.find_all('tr'):
    nhr = [td.text.strip('\n') for td in tr.find_all('td')]  # one neighborhood record
    # Keep only rows that carry all three cells and have an assigned borough.
    if len(nhr) > 2 and nhr[1] != "Not assigned":
        records.append(dict(zip(column_names, nhr)))
nhdf1 = pd.DataFrame(records, columns=column_names)
# Gather the neighborhoods that share a postal code and borough into one list per group.
nhgrp = nhdf1.groupby(['PostalCode','Borough'])['Neighborhood'].apply(list) # group nhr's to a Series
nhdf2 = nhgrp.reset_index() # convert the Series into a DataFrame

In [None]:
# Show the grouped table: one row per (PostalCode, Borough) with its list of neighborhoods
nhdf2

In [80]:
# Load the latitude/longitude table from the local CSV file
# (used instead of the geocoder package, which proved unstable).
gcdf = pd.read_csv("Geospatial_Coordinates.csv")

In [82]:
# Preview the first rows of the coordinates table
gcdf.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [83]:
# Rename the key column so it matches 'PostalCode' in the neighborhoods frame.
# Reassigning (rather than inplace=True) keeps the cell free of in-place mutation
# of a frame that an earlier cell has already displayed.
gcdf = gcdf.rename(columns={"Postal Code": "PostalCode"})

In [84]:
# Preview the coordinates table again to check the renamed column
gcdf.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [85]:
# Attach the coordinates to each postal code row, matching on the shared 'PostalCode' column
nhdf3 = nhdf2.merge(gcdf, on='PostalCode')

In [None]:
# Show the neighborhoods table merged with the coordinates
nhdf3

In [87]:
# Keep only the rows whose borough name contains 'Toronto'
nhdf4 = nhdf3.loc[nhdf3['Borough'].str.contains("Toronto")]

In [None]:
# Show the rows kept by the 'Toronto' borough filter
nhdf4

In [89]:
# Renumber the rows from 0 and discard the old index left over from the filter
nhdf5 = nhdf4.reset_index(drop=True)

In [None]:
# Show the filtered table with its fresh index
nhdf5

In [19]:
# import k-means for clustering stage
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [24]:
# Calculate the center of the selected postal code locations
# (simply the arithmetic mean of their latitudes and longitudes)
latitude = nhdf5['Latitude'].mean()
longitude = nhdf5['Longitude'].mean()
print (latitude, longitude)

43.667262184210514 -79.38988323421053


In [91]:
# Create the base map centered on the mean coordinates and display it
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)
map_toronto

In [92]:
# Add one circle marker per postal code to the base map, then display the map
marker_rows = nhdf5[['Latitude', 'Longitude', 'PostalCode']].itertuples(index=False, name=None)
for lat, lon, postal_code in marker_rows:
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=5,
        popup=postal_code,  # clicking a marker shows its postal code
        color='yellow',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)
map_toronto

In [None]:
import os

# Foursquare API credentials.
# They are read from the environment variables FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET so that real keys never have to be typed into the
# notebook. When a variable is not set, the dummy placeholder string is used.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'your Foursquare ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'your Foursquare Secret')
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
# The secret is masked so it is not stored in the saved notebook output.
print('CLIENT_SECRET: ' + ('********' if CLIENT_SECRET else '(not set)'))

In [94]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
_explore_url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = _explore_url.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    latitude,
    longitude,
    radius,
    LIMIT)
# Display the URL with the credentials replaced by placeholders, so the real
# client id and secret are not written into the saved notebook output.
_explore_url.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    latitude,
    longitude,
    radius,
    LIMIT)

'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20180605&ll=43.667262184210514,-79.38988323421053&radius=500&limit=100'

In [95]:
import requests

In [None]:
# Call the Foursquare endpoint. The timeout stops the cell from hanging forever
# on a stalled connection, and raise_for_status() turns an HTTP error response
# into an immediate, explicit exception instead of a confusing KeyError later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

In [98]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a DataFrame row)
        Must provide the category list under the key 'categories' or,
        failing that, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the category list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [99]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [100]:
# Pull the venue items out of the first group of the response
venues = results['response']['groups'][0]['items']

# Columns to keep from the flattened JSON
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']

# Flatten the nested JSON into a table and keep only the selected columns
nearby_venues = json_normalize(venues)[filtered_columns]

# Replace each category list by the name of its first category
nearby_venues = nearby_venues.assign(
    **{'venue.categories': nearby_venues.apply(get_category_type, axis=1)}
)

# Shorten the column names to the part after the last dot
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Indigo,Bookstore,43.669065,-79.389057
1,Bay Street Video,Video Store,43.66889,-79.389247
2,Harry Rosen Menswear,Men's Store,43.669661,-79.390585
3,Pi Co.,Pizza Place,43.670107,-79.389852
4,Windsor Arms Hotel,Hotel,43.668781,-79.39085


In [101]:
# Show the full table of venues found around the center point
nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Indigo,Bookstore,43.669065,-79.389057
1,Bay Street Video,Video Store,43.668890,-79.389247
2,Harry Rosen Menswear,Men's Store,43.669661,-79.390585
3,Pi Co.,Pizza Place,43.670107,-79.389852
4,Windsor Arms Hotel,Hotel,43.668781,-79.390850
5,Japan Foundation,Art Gallery,43.668967,-79.392024
6,Crown Princess Fine Dining 伯爵名宴,Chinese Restaurant,43.666455,-79.387698
7,COS,Clothing Store,43.669516,-79.390390
8,Tiffany & Co.,Jewelry Store,43.669135,-79.393484
9,DanceLifeX Centre,Dance Studio,43.666956,-79.385297


In [102]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare 'explore' endpoint around each location and tabulate the venues.

    The module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values are
    substituted into every request URL.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each location; iterated in parallel with zip.
    radius : int, default 500
        Value passed as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'PostalCode', 'Latitude',
        'Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty (but still has these columns)
        when no venue is returned. 'Venue Category' is None for a venue
        that lists no categories.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an error status code.
    """
    columns = ['PostalCode',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; fail fast on a stalled connection or an error response
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories must not abort the whole run
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the result well-formed
    # even when no rows were collected.
    return pd.DataFrame(rows, columns=columns)

In [103]:
# Fetch the venues around every selected postal code location
toronto_venues = getNearbyVenues(
    nhdf5['PostalCode'],
    nhdf5['Latitude'],
    nhdf5['Longitude'],
)

M4E
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6G
M6H
M6J
M6K
M6P
M6R
M6S
M7Y


In [104]:
# Inspect the last 15 rows of the venue table
toronto_venues.tail(15)

Unnamed: 0,PostalCode,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
1677,M7Y,43.662744,-79.321558,Queen Margherita Pizza,43.664685,-79.324164,Pizza Place
1678,M7Y,43.662744,-79.321558,Chino Locos,43.664653,-79.325584,Burrito Place
1679,M7Y,43.662744,-79.321558,The Green Wood,43.664728,-79.324117,Restaurant
1680,M7Y,43.662744,-79.321558,Chick-n-Joy,43.665181,-79.321403,Fast Food Restaurant
1681,M7Y,43.662744,-79.321558,Ashbridges Bay Skatepark,43.662548,-79.315631,Skate Park
1682,M7Y,43.662744,-79.321558,East End Garden Centre & Hardware,43.664555,-79.324598,Garden Center
1683,M7Y,43.662744,-79.321558,Amin Car Repair Garage,43.663544,-79.32013,Auto Workshop
1684,M7Y,43.662744,-79.321558,The Ashbridge Estate,43.664691,-79.321805,Garden
1685,M7Y,43.662744,-79.321558,TTC Russell Division,43.664908,-79.32256,Light Rail Station
1686,M7Y,43.662744,-79.321558,Jonathan Ashbridge Park,43.664702,-79.319898,Park


In [105]:
# List the distinct venue categories present in the venue table
toronto_venues['Venue Category'].unique().tolist()

['Health Food Store',
 'Pub',
 'Coffee Shop',
 'Neighborhood',
 'Greek Restaurant',
 'Ice Cream Shop',
 'Cosmetics Shop',
 'Italian Restaurant',
 'Brewery',
 'Yoga Studio',
 'Fruit & Vegetable Store',
 'Pizza Place',
 'Restaurant',
 'Bookstore',
 'Juice Bar',
 'Bubble Tea Shop',
 'Dessert Shop',
 'Indian Restaurant',
 'Trail',
 'Diner',
 'Spa',
 'Grocery Store',
 'Japanese Restaurant',
 'Bakery',
 'American Restaurant',
 'Sports Bar',
 'Caribbean Restaurant',
 'Café',
 'Liquor Store',
 'Furniture / Home Store',
 'Burger Joint',
 'Gym',
 'Fish & Chips Shop',
 'Steakhouse',
 'Park',
 'Sushi Restaurant',
 'Pet Store',
 'Burrito Place',
 'Fast Food Restaurant',
 'Movie Theater',
 'Sandwich Place',
 'Intersection',
 'Fish Market',
 'Chinese Restaurant',
 'Cheese Shop',
 'Comfort Food Restaurant',
 'Seafood Restaurant',
 'Middle Eastern Restaurant',
 'Stationery Store',
 'New American Restaurant',
 'Coworking Space',
 'Gastropub',
 'Music Store',
 'Bar',
 'Latin American Restaurant',
 'Conve

In [106]:
# I am interested in bookstores: list every venue whose category is 'Bookstore'.
# The goal is to group the postal code locations by the number of bookstores around them.
toronto_venues[toronto_venues['Venue Category']=='Bookstore']

Unnamed: 0,PostalCode,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
19,M4K,43.679557,-79.352188,Re: Reading,43.678507,-79.347678,Bookstore
40,M4K,43.679557,-79.352188,Book City,43.677413,-79.352734,Bookstore
71,M4M,43.659526,-79.340923,Queen Books,43.660651,-79.342267,Bookstore
245,M4Y,43.66586,-79.38316,Glad Day Bookshop,43.665271,-79.380785,Bookstore
420,M5B,43.657162,-79.378937,Indigo,43.653515,-79.380696,Bookstore
550,M5C,43.651494,-79.375418,Indigo,43.653515,-79.380696,Bookstore
780,M5H,43.650571,-79.384568,Indigo,43.653515,-79.380696,Bookstore
938,M5K,43.647177,-79.381576,Indigospirit,43.64835,-79.380347,Bookstore
1052,M5L,43.648198,-79.379817,Indigospirit,43.64835,-79.380347,Bookstore
1143,M5S,43.662696,-79.400049,Bakka Phoenix Books,43.662959,-79.402601,Bookstore


In [107]:
# Count the bookstore rows per postal code; the count lands in the 'Venue Category' column
bsdf1 = (
    toronto_venues
    .loc[toronto_venues['Venue Category'].eq('Bookstore')]
    .groupby('PostalCode')['Venue Category']
    .count()
    .reset_index()
)

In [108]:
# The bookstore count serves directly as the cluster label, so name the column accordingly.
# Reassigning (rather than inplace=True) avoids mutating the frame in place.
bsdf1 = bsdf1.rename(columns={'Venue Category':'Cluster'})

In [109]:
# Show the number of bookstores (column 'Cluster') per postal code
bsdf1

Unnamed: 0,PostalCode,Cluster
0,M4K,2
1,M4M,1
2,M4Y,1
3,M5B,1
4,M5C,1
5,M5H,1
6,M5K,1
7,M5L,1
8,M5S,2
9,M5W,1


In [None]:
#It turned out that each postal code location has either one or two bookstores nearby
# (or zero, for postal code locations that do not appear in the dataframe above (bsdf1)).
#KMeans clustering was therefore unnecessary for my purpose.
#Postal code locations without a bookstore within a 500 m radius are simply labeled as cluster zero.

In [110]:
# Left join: keep every postal code row, leaving 'Cluster' empty where no bookstore count exists
nhdf6 = nhdf5.merge(bsdf1, how='left', on='PostalCode')

In [111]:
# Show the merged table; 'Cluster' is missing for postal codes absent from bsdf1
nhdf6

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M4E,East Toronto,[The Beaches],43.676357,-79.293031,
1,M4K,East Toronto,"[The Danforth West, Riverdale]",43.679557,-79.352188,2.0
2,M4L,East Toronto,"[The Beaches West, India Bazaar]",43.668999,-79.315572,
3,M4M,East Toronto,[Studio District],43.659526,-79.340923,1.0
4,M4N,Central Toronto,[Lawrence Park],43.72802,-79.38879,
5,M4P,Central Toronto,[Davisville North],43.712751,-79.390197,
6,M4R,Central Toronto,[North Toronto West],43.715383,-79.405678,
7,M4S,Central Toronto,[Davisville],43.704324,-79.38879,
8,M4T,Central Toronto,"[Moore Park, Summerhill East]",43.689574,-79.38316,
9,M4V,Central Toronto,"[Deer Park, Forest Hill SE, Rathnelly, South H...",43.686412,-79.400049,


In [112]:
# Postal codes without any bookstore have no count after the left merge: label them 0.
# The result is assigned back to the column explicitly; an inplace call on the
# selected column is not guaranteed to update nhdf6 itself (it does not under
# pandas Copy-on-Write), which would leave NaN values behind.
nhdf6['Cluster'] = nhdf6['Cluster'].fillna(0.0)

In [113]:
# Convert the cluster labels to integers (they are later used to index the color list)
nhdf6['Cluster'] = nhdf6['Cluster'].astype(int)

In [114]:
# Show the table again with integer cluster labels
nhdf6

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M4E,East Toronto,[The Beaches],43.676357,-79.293031,0
1,M4K,East Toronto,"[The Danforth West, Riverdale]",43.679557,-79.352188,2
2,M4L,East Toronto,"[The Beaches West, India Bazaar]",43.668999,-79.315572,0
3,M4M,East Toronto,[Studio District],43.659526,-79.340923,1
4,M4N,Central Toronto,[Lawrence Park],43.72802,-79.38879,0
5,M4P,Central Toronto,[Davisville North],43.712751,-79.390197,0
6,M4R,Central Toronto,[North Toronto West],43.715383,-79.405678,0
7,M4S,Central Toronto,[Davisville],43.704324,-79.38879,0
8,M4T,Central Toronto,"[Moore Park, Summerhill East]",43.689574,-79.38316,0
9,M4V,Central Toronto,"[Deer Park, Forest Hill SE, Rathnelly, South H...",43.686412,-79.400049,0


In [115]:
# create a cluster map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)
# set color scheme for the clusters: the list position is the bookstore count
# (0 -> black, 1 -> yellow, 2 or more -> green)
colors=['black','yellow','green']
# add markers to the map
for lat, lon, pc, cluster in zip(nhdf6['Latitude'], nhdf6['Longitude'], nhdf6['PostalCode'], nhdf6['Cluster']):
    label = folium.Popup(pc + ' : ' + str(cluster), parse_html=True)
    # Counts above the last palette entry reuse the last color instead of
    # raising an IndexError.
    marker_color = colors[min(cluster, len(colors) - 1)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.5).add_to(map_clusters)

map_clusters

In [None]:
# Postal code locations in boroughs whose names contain 'Toronto',
# clustered by the number of bookstores within a 500 m radius.
# Color: Black - 0 bookstores, Yellow - 1 bookstore, Green - 2 bookstores