In [2]:
## Import Libraries

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/hclee/Desktop/anaconda3

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.2               |     pyhd8ed1ab_0          26 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1h             |       haf1e3a3_0         1.9 MB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-

In [161]:
## Using pandas, read html using the given url

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html yields a list of tables; the first one is written to a local CSV
# snapshot and toronto_data is then loaded back from that snapshot.
toronto_df = pd.read_html(url, header=0)
first_table = toronto_df[0]
first_table.to_csv('toronto.csv', index=False)
toronto_data = pd.read_csv('toronto.csv')

print(toronto_data.shape)
toronto_data.head()


(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [162]:
## Remove rows with Borough == Not assigned

# Collect the index labels of the placeholder rows, then drop them in place.
borough_missing = toronto_data['Borough'] == 'Not assigned'
NABorough = toronto_data.loc[borough_missing].index
toronto_data.drop(index=NABorough, inplace=True)
print(toronto_data.shape)
toronto_data.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [163]:
## Confirm there are no rows where Neighbourhood == Not assigned

# An index of shape (0,) means no placeholder neighbourhoods remain.
neighbourhood_missing = toronto_data['Neighbourhood'] == 'Not assigned'
NANeigh = toronto_data.loc[neighbourhood_missing].index
NANeigh.shape


(0,)

In [164]:
# Report the (rows, columns) shape of the cleaned table.
part1_shape = toronto_data.shape
print("Answer to Part 1 is", part1_shape)

Answer to Part 1 is (103, 3)


In [193]:
## install pgeocode to get latitudes and longitudes of postal codes
!pip install pgeocode
import pgeocode
pgeocode.Nominatim('ca')
geolocator = pgeocode.Nominatim('ca')
postal_codes = toronto_data['Postal Code'].tolist()
latitudes = []
longitudes = []
print("pgeocode downloaded")

pgeocode downloaded


In [194]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every coordinate to lists left over from a previous run.
latitudes = []
longitudes = []
for postal_code in postal_codes:
    # assumes query_postal_code returns a pandas Series for a single code
    # (the original code relied on .empty / .latitude) -- TODO confirm
    g = geolocator.query_postal_code(postal_code)

    # Always append exactly one value per postal code (NaN when the result has
    # no such field) so both lists stay index-aligned with postal_codes.
    latitudes.append(g.get('latitude', float('nan')))
    longitudes.append(g.get('longitude', float('nan')))


In [227]:
## check len of lists are the same before returning dataframe
n_codes = len(postal_codes)
n_latitudes = len(latitudes)
n_longitudes = len(longitudes)
print(n_codes, n_latitudes, n_longitudes)

# Plain lists are assigned positionally, one value per remaining row.
toronto_data['Neighbourhood Latitude'] = latitudes
toronto_data['Neighbourhood Longitude'] = longitudes
toronto_data.head(103)

103 103 103


Unnamed: 0,Postal Code,Borough,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude
2,M3A,North York,Parkwoods,43.7545,-79.33
3,M4A,North York,Victoria Village,43.7276,-79.3148
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6662,-79.5282
9,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
11,M3B,North York,Don Mills,43.745,-79.359
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783


In [234]:
## Focus only on North York
# Keep the North York rows and renumber them from 0.
in_north_york = toronto_data['Borough'] == 'North York'
ny = toronto_data.loc[in_north_york].reset_index(drop=True)
print(ny.shape)
ny.head()

(24, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
3,M3B,North York,Don Mills,43.745,-79.359
4,M6B,North York,Glencairn,43.7081,-79.4479


In [222]:
## Make a call to Foursquare API and define getNearbyVenues function
import os  # imported here because only this configuration step needs them
import warnings

# Credentials are read from the environment so that secrets are never stored in
# (or shared together with) the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests cannot be authenticated.'
    )
VERSION = '20180605'  # Foursquare API version
LIMIT = 100  # A default Foursquare API limit value

## Define function to get venues

def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Collect the Foursquare venues around each location into one DataFrame.

    One request is sent to the Foursquare ``venues/explore`` endpoint for every
    (name, latitude, longitude) triple, using the module-level CLIENT_ID,
    CLIENT_SECRET, VERSION and LIMIT settings. Each name is printed as it is
    processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences describing the locations to explore. They are
        combined with ``zip``, so iteration stops at the shortest one.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500;
        presumably metres -- confirm against the Foursquare documentation).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is found.
        'Venue Category' is None for a venue that lists no category.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on HTTP errors instead of hitting
        # an unrelated KeyError while unpacking an error payload
        response = requests.get(endpoint, params=query, timeout=timeout)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come back without any category; do not index blindly
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning .columns afterwards would raise a length mismatch.
    return pd.DataFrame(venue_rows, columns=column_names)


In [244]:
## explore venues within radius and up to LIMIT

# One lookup per North York row, using the default search radius.
ny_venues = getNearbyVenues(
    names=ny['Neighbourhood'],
    latitudes=ny['Neighbourhood Latitude'],
    longitudes=ny['Neighbourhood Longitude'],
)
print(ny_venues.shape)
ny_venues.head()

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West
(319, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.7276,-79.3148,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.7276,-79.3148,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [270]:
## Count number of unique venue categories

print('There are {} unique categories.'.format(len(ny_venues['Venue Category'].unique())))

# Use one hot encoding to separate Venue Category
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the neighbourhood name in front of the indicator columns. insert() places
# it at position 0 directly and raises if a venue category is itself called
# 'Neighbourhood', instead of silently overwriting that indicator column.
ny_onehot.insert(0, 'Neighbourhood', ny_venues['Neighbourhood'])
print(ny_onehot.shape)
ny_onehot.head()

There are 110 unique categories.
(319, 111)


Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bank,Bar,Baseball Field,Basketball Court,Beer Store,Bookstore,Boutique,Burger Joint,Burrito Place,Bus Station,Business Service,Butcher,Café,Caribbean Restaurant,Chinese Restaurant,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Distribution Center,Electronics Store,Fast Food Restaurant,Financial or Legal Service,Flower Shop,Food & Drink Shop,Food Court,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Gas Station,Gift Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hockey Arena,Home Service,Hotel,Ice Cream Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Jewelry Store,Juice Bar,Karaoke Bar,Kitchen Supply Store,Korean Restaurant,Latin American Restaurant,Leather Goods Store,Lingerie Store,Liquor Store,Lounge,Massage Studio,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Movie Theater,Moving Target,New American Restaurant,Nightclub,Park,Pet Store,Pharmacy,Pizza Place,Platform,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Rental Car Location,Residential Building (Apartment / Condo),Restaurant,River,Salon / Barbershop,Sandwich Place,Shoe Store,Shopping Mall,Skating Rink,Spa,Sporting Goods Shop,Sports Bar,Steakhouse,Sushi Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [271]:
## Group by neighborhood and return the mean per category
ny_grouped = ny_onehot.groupby('Neighbourhood').mean().reset_index()
print(ny_grouped.shape)

## return top 5 most common venues per Neighborhood

num_top_venues = 5

for hood in ny_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # transpose the matching row into a two-column (venue, freq) table
    hood_rows = ny_grouped[ny_grouped['Neighbourhood'] == hood]
    freq_table = hood_rows.T.reset_index()
    freq_table.columns = ['venue', 'freq']
    # the first row carries the neighbourhood name, not a frequency
    freq_table = freq_table.iloc[1:]
    freq_table['freq'] = freq_table['freq'].astype(float)
    ranked = freq_table.round({'freq': 2}).sort_values('freq', ascending=False)
    print(ranked.reset_index(drop=True).head(num_top_venues))
    print('\n')
    



(20, 111)
----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0        Fried Chicken Joint  0.17
1                Pizza Place  0.17
2                Coffee Shop  0.17
3   Mediterranean Restaurant  0.17
4  Middle Eastern Restaurant  0.17


----Bayview Village----
               venue  freq
0              Trail  0.50
1               Park  0.25
2        Gas Station  0.25
3  Accessories Store  0.00
4        Men's Store  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.08
1      Sandwich Place  0.08
2         Coffee Shop  0.08
3     Thai Restaurant  0.04
4    Sushi Restaurant  0.04


----Don Mills----
                        venue  freq
0                        Park  0.29
1                       Trail  0.14
2  Construction & Landscaping  0.14
3                        Pool  0.14
4                         Gym  0.14


----Downsview----
            venue  freq
0   Shopping Mall  0.07
1     Pizza Place  0

In [307]:
## select only the top venues in descending order
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, ignoring its first element.

    Everything after the first element of `row` is sorted in descending order
    and the index labels of the first `num_top_venues` entries are returned as
    an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
## Create a new dataframe to display the top venues per Neighborhood, note indicator code

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take their own suffix, every later rank uses 'th'
    # (an explicit bounds check instead of a bare except around the lookup)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
ny_venues_sorted = pd.DataFrame(columns=columns)
ny_venues_sorted['Neighbourhood'] = ny_grouped['Neighbourhood']

# fill the ranked venue columns one neighbourhood (row) at a time
for ind in np.arange(ny_grouped.shape[0]):
    ny_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)
# NOTE(review): this prints the shape of ny_venues, not ny_venues_sorted -- confirm that is intended
print(ny_venues.shape)
ny_venues_sorted.head()



(319, 7)


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Pizza Place,Mediterranean Restaurant,Deli / Bodega,Coffee Shop,Middle Eastern Restaurant
1,Bayview Village,Trail,Park,Gas Station,Women's Store,Food Court
2,"Bedford Park, Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop,Women's Store,Comfort Food Restaurant
3,Don Mills,Park,River,Trail,Gym,Pool
4,Downsview,Discount Store,Coffee Shop,Park,Grocery Store,Pizza Place


In [310]:
## Run k-means clustering

kclusters = 8

# Cluster on the frequency columns only. The keyword form is used because the
# positional axis argument, drop('Neighbourhood', 1), is rejected by newer
# pandas releases.
ny_grouped_clustering = ny_grouped.drop(columns='Neighbourhood')

# run k-means clustering (fixed random_state keeps the labels reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe

kmeans.labels_[0:]


array([2, 6, 2, 2, 2, 2, 2, 5, 1, 2, 2, 6, 2, 7, 2, 4, 2, 2, 0, 3],
      dtype=int32)

In [313]:
# add clustering labels back by merging the array into the dataframe

# DataFrame.insert raises when the column already exists, so overwrite it on a
# re-run and only insert (as the first column) the first time.
if 'Cluster Labels' in ny_venues_sorted.columns:
    ny_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    ny_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge ny_grouped with ny to add latitude/longitude for each neighborhood
# (join returns a new frame, so ny itself is left untouched)
ny_merged = ny.join(ny_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

ny_merged.head()




Unnamed: 0,Postal Code,Borough,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.33,7,Park,Food & Drink Shop,Women's Store,Food Court,Deli / Bodega
1,M4A,North York,Victoria Village,43.7276,-79.3148,2,Intersection,Portuguese Restaurant,Hockey Arena,French Restaurant,Financial or Legal Service
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,2,Clothing Store,Coffee Shop,Women's Store,Restaurant,Food Court
3,M3B,North York,Don Mills,43.745,-79.359,2,Park,River,Trail,Gym,Pool
4,M6B,North York,Glencairn,43.7081,-79.4479,2,Pizza Place,Ice Cream Shop,Bakery,Fast Food Restaurant,Gas Station


In [315]:
# create map, clustering venues by category rather than location

address = 'North York, Toronto'

# Use a separate name so the pgeocode `geolocator` that performs the
# postal-code lookups is not replaced by an object of a different type.
address_geocoder = Nominatim(user_agent="ny_explorer")
location = address_geocoder.geocode(address)
if location is None:
    # geocode() returns None when the address cannot be resolved
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of North York are 43.7543263, -79.44911696639593.


In [316]:
map_clusters = folium.Map(location=[latitude,longitude],zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a cluster label cannot be colored; their presence also turns the
# column into floats, which are not valid list indices, so drop them and cast.
labelled = ny_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(labelled['Neighbourhood Latitude'], labelled['Neighbourhood Longitude'], labelled['Neighbourhood'], labelled['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 keeps the existing color assignment (label 0 takes the last color)
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [318]:
## Examine Cluster Label=2 which is the most common cluster

# show the column at position 1 plus every column from position 5 onwards
shown_columns = ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]
in_cluster_2 = ny_merged['Cluster Labels'] == 2
ny_merged.loc[in_cluster_2, shown_columns]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,North York,2,Intersection,Portuguese Restaurant,Hockey Arena,French Restaurant,Financial or Legal Service
2,North York,2,Clothing Store,Coffee Shop,Women's Store,Restaurant,Food Court
3,North York,2,Park,River,Trail,Gym,Pool
4,North York,2,Pizza Place,Ice Cream Shop,Bakery,Fast Food Restaurant,Gas Station
5,North York,2,Park,River,Trail,Gym,Pool
7,North York,2,Pizza Place,Mediterranean Restaurant,Deli / Bodega,Coffee Shop,Middle Eastern Restaurant
8,North York,2,Clothing Store,Fast Food Restaurant,Coffee Shop,Restaurant,Juice Bar
9,North York,2,Pizza Place,Middle Eastern Restaurant,Sports Bar,Massage Studio,Food & Drink Shop
11,North York,2,Discount Store,Coffee Shop,Park,Grocery Store,Pizza Place
13,North York,2,Discount Store,Coffee Shop,Park,Grocery Store,Pizza Place


In [336]:
# Closing summary of the clustering result.
conclusion = "Cluster 2 is the most common cluster and has many Coffee Shops followed by Discount Stores.\n"
print(conclusion)
print("THE END")

Cluster 2 is the most common cluster and has many Coffee Shops followed by Discount Stores.

THE END
