### 1.1 Get original DataFrame from Wiki

In [2]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ------------------------------------------------------------
                       

In [54]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)[0]
tables.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 1.2 Remove rows whose Borough is remarked by "Not assigned"

In [27]:
tables_filter = tables.drop(tables[tables["Borough"] == "Not assigned"].index)
tables_filter.reset_index(drop = True)
tables_filter.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### 1.3 Group by "Borough" and "Postcode" in order to merge "Neighbourhood"

In [28]:
tables_merged = pd.DataFrame()
for i in tables_filter.groupby(["Borough","Postcode"]):
    tables_merged = tables_merged.append({"Borough": i[0][0], "Postcode": i[0][1], "Neighbourhood": ",".join(set(i[1]["Neighbourhood"]))},ignore_index = True)

In [29]:
tables_merged = tables_merged[["Postcode", "Borough","Neighbourhood"]]

In [30]:
for i, value in tables_merged.iterrows():
    if value["Neighbourhood"] == "Not assigned":
        tables_merged[i]["Neighbourhood"] = tables_merged[i]["Borough"]

In [31]:
tables_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4N,Central Toronto,Lawrence Park
1,M4P,Central Toronto,Davisville North
2,M4R,Central Toronto,North Toronto West
3,M4S,Central Toronto,Davisville
4,M4T,Central Toronto,"Moore Park,Summerhill East"


In [32]:
tables_merged.shape

(103, 3)

### 2.1 Get geographical coordinates of the neighborhoods by each postal code

In [33]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
Latitude_list = []
Longitude_list = []
for i, value in tables_merged.iterrows():
    postal_code = value["Postcode"]
    Latitude = df_geo_coor[df_geo_coor["Postal Code"] == postal_code]["Latitude"]
    Longitude = df_geo_coor[df_geo_coor["Postal Code"] == postal_code]["Longitude"]
    Latitude_list.append(Latitude.values[0])
    Longitude_list.append(Longitude.values[0])
    
tables_merged["Latitude"] = Latitude_list
tables_merged["Longitude"] = Longitude_list

In [35]:
tables_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316


In [36]:
import requests # library to handle requests

In [37]:
CLIENT_ID = 'ZJLDAZ3WP20OIO1SE3C2LEDFQLKDPAWEXFVWN0PVQ1HD5AQJ' # your Foursquare ID
CLIENT_SECRET = 'EUSQQXHATVYTT3GQ0OAK4550T0ILKHMJA41Z2SG1JZZAWN2F' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
radius = 500
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ZJLDAZ3WP20OIO1SE3C2LEDFQLKDPAWEXFVWN0PVQ1HD5AQJ
CLIENT_SECRET:EUSQQXHATVYTT3GQ0OAK4550T0ILKHMJA41Z2SG1JZZAWN2F


In [38]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [39]:
# type your answer here

tables_merged_venues = getNearbyVenues(names=tables_merged['Neighbourhood'],
                                   latitudes=tables_merged['Latitude'],
                                   longitudes=tables_merged['Longitude']
                                  )



In [40]:
print(tables_merged_venues.shape)
tables_merged_venues.head()

(1335, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Lake,43.72791,-79.386857,Lake
2,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
3,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
4,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop


In [41]:
# one hot encoding
tables_merged_venues_onehot = pd.get_dummies(tables_merged_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tables_merged_venues_onehot['Neighbourhood'] = tables_merged_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [tables_merged_venues_onehot.columns[-1]] + list(tables_merged_venues_onehot.columns[:-1])
tables_merged_venues_onehot = tables_merged_venues_onehot[fixed_columns]

tables_merged_venues_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
tables_merged_venues_onehot.shape

(1335, 232)

In [43]:
tables_merged_grouped = tables_merged_venues_onehot.groupby('Neighbourhood').mean().reset_index()
tables_merged_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
1,"Agincourt North,Steeles East,Milliken,L'Amorea...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
2,"Albion Gardens,Thistletown,Mount Olive,Silvers...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
3,"Alderwood,Long Branch",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
4,Bayview Village,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
5,"Bedford Park,Lawrence Manor East",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041667,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
6,Berczy Park,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033333,...,0.033333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
7,"Bloordale Gardens,Old Burnhamthorpe,Eringate,M...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000
8,Business Reply Mail Processing Centre 969 Eastern,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.052632
9,"Cabbagetown,St. James Town",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000


In [44]:
num_top_venues = 5

for hood in tables_merged_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = tables_merged_grouped[tables_merged_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0             Breakfast Spot  0.25
1                     Lounge  0.25
2  Latin American Restaurant  0.25
3             Clothing Store  0.25
4              Movie Theater  0.00


----Agincourt North,Steeles East,Milliken,L'Amoreaux East----
               venue  freq
0         Playground   0.5
1               Park   0.5
2  Accessories Store   0.0
3              Motel   0.0
4     Massage Studio   0.0


----Albion Gardens,Thistletown,Mount Olive,Silverstone,Humbergate,South Steeles,Beaumond Heights,Jamestown----
                  venue  freq
0         Grocery Store  0.22
1           Pizza Place  0.22
2  Fast Food Restaurant  0.11
3              Pharmacy  0.11
4            Beer Store  0.11


----Alderwood,Long Branch----
                venue  freq
0         Pizza Place   0.2
1                 Gym   0.1
2  Athletics & Sports   0.1
3                Pool   0.1
4         Coffee Shop   0.1


----Bayview Village----
                 venue  fre

In [95]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [103]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = tables_merged_grouped['Neighbourhood']

for ind in np.arange(tables_merged_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tables_merged_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Clothing Store,Breakfast Spot,Diner,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant
1,"Agincourt North,Steeles East,Milliken,L'Amorea...",Park,Playground,Yoga Studio,Dessert Shop,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop
2,"Albion Gardens,Thistletown,Mount Olive,Silvers...",Grocery Store,Pizza Place,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Sandwich Place,Pharmacy,Airport Terminal,Dim Sum Restaurant,Ethiopian Restaurant
3,"Alderwood,Long Branch",Pizza Place,Pub,Pharmacy,Sandwich Place,Pool,Athletics & Sports,Skating Rink,Gym,Coffee Shop,Department Store
4,Bayview Village,Japanese Restaurant,Chinese Restaurant,Café,Bank,Yoga Studio,Discount Store,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant


In [104]:
# set number of clusters
kclusters = 5

tables_merged_grouped_clustering = tables_merged_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tables_merged_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 1, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [105]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

tables_merged_ = tables_merged

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
tables_merged_ = tables_merged_.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

tables_merged_ = tables_merged_.fillna(0)
tables_merged_["Cluster Labels"] = tables_merged_["Cluster Labels"].astype("int64")
tables_merged_.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Park,Lake,Bus Line,Swim School,Event Space,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197,3,Park,Convenience Store,Sandwich Place,Food & Drink Shop,Hotel,Department Store,Breakfast Spot,Gym,Discount Store,Empanada Restaurant
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,3,Clothing Store,Coffee Shop,Sporting Goods Shop,Gym / Fitness Center,Fast Food Restaurant,Diner,Dessert Shop,Mexican Restaurant,Park,Chinese Restaurant
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,3,Dessert Shop,Sandwich Place,Coffee Shop,Italian Restaurant,Gym,Café,Sushi Restaurant,Pizza Place,Park,Thai Restaurant
4,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,2,Playground,Trail,Yoga Studio,Department Store,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop


In [106]:
tables_merged_["Cluster Labels"].value_counts()

3    77
1    13
2     7
0     4
4     2
Name: Cluster Labels, dtype: int64

In [107]:

address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [109]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tables_merged_['Latitude'], tables_merged_['Longitude'], tables_merged_['Neighbourhood'], tables_merged_['Cluster Labels']):

    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters