# Capstone Clustering Week 3 Part 1

In [1]:
import numpy as np

In [2]:
import pandas as pd
import json

In [3]:
from geopy.geocoders import Nominatim

In [4]:
import requests

from bs4 import BeautifulSoup

In [5]:
from pandas.io.json import json_normalize

In [6]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [7]:
from sklearn.cluster import KMeans

In [9]:
!pip install folium
import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 8.0MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.1


In [10]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [11]:
BS = BeautifulSoup(data, 'html.parser')

In [12]:
postalCode = []
borough = []
neighborhood = []

In [15]:
BS.find('table').find_all('tr')
BS.find('table').find_all('tr')
for row in BS.find('table').find_all('tr'):
    cells = row.find_all('td')
    
    if(len(cells) > 0):
        postalCode.append(cells[0].text)
        borough.append(cells[1].text)
        neighborhood.append(cells[2].text.rstrip('\n'))

In [16]:
neighbourhood_df = pd.DataFrame({"PostalCode": postalCode,
                           "Borough": borough,
                           "Neighborhood": neighborhood})

neighbourhood_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Drop cells with a borough that is "Not assigned"

In [17]:
neighbourhood_df_dropna = neighbourhood_df[neighbourhood_df.Borough != "Not assigned"].reset_index(drop=True)
neighbourhood_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


Group Neighbourhood

In [18]:
neighbourhood_df_group = neighbourhood_df_dropna.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
neighbourhood_df_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [19]:
neighbourhood_df_group.shape

(103, 3)

# Get Coordinates from Coursera

In [21]:
coordinates = pd.read_csv("https://cocl.us/Geospatial_data")
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [23]:
coordinates.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
NB_df_new = neighbourhood_df_group.merge(coordinates, on="PostalCode", how="left")
NB_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [26]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

## Superimposing map of neighbourhood to Toronto

In [27]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(NB_df_new['Latitude'], NB_df_new['Longitude'], NB_df_new['Borough'], NB_df_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

## Selecting only boroughs that contain the word Toronto

In [28]:
Toronto_only = list(NB_df_new.Borough.unique())

borough_toronto = []

for x in Toronto_only:
    if "toronto" in x.lower():
        borough_toronto.append(x)
        
borough_toronto

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

In [29]:
T_df = NB_df_new[NB_df_new['Borough'].isin(borough_toronto)].reset_index(drop=True)

In [30]:
print(T_df.shape)

(39, 5)


In [31]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(T_df['Latitude'], T_df['Longitude'], T_df['Borough'], T_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

## Assessing Foursquares

In [32]:
Client_ID = 'NP2K31BJN3TVECXLEPSMH43BXVWK3KQCJPRZ1YPN2F5LBAUH'
Client_secret = 'CFRA313EAM03Z32UPOE12IOM3SUHV25HC435QOT5CUVWII5W'
Version = '20180605'

## Top 100 places within 300m radius

In [34]:
radius = 300
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(T_df['Latitude'], T_df['Longitude'], T_df['PostalCode'], T_df['Borough'], T_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        Client_ID,
        Client_secret,
        Version,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [35]:
Places_df = pd.DataFrame(venues)

In [36]:
Places_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(Places_df.shape)
Places_df.head()

(936, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
3,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop


### Groupby Postal Code

In [37]:
Places_df.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,East Toronto,The Beaches,3,3,3,3,3,3
M4K,East Toronto,"The Danforth West, Riverdale",23,23,23,23,23,23
M4L,East Toronto,"The Beaches West, India Bazaar",17,17,17,17,17,17
M4M,East Toronto,Studio District,27,27,27,27,27,27
M4N,Central Toronto,Lawrence Park,3,3,3,3,3,3
M4P,Central Toronto,Davisville North,4,4,4,4,4,4
M4R,Central Toronto,North Toronto West,1,1,1,1,1,1
M4S,Central Toronto,Davisville,22,22,22,22,22,22
M4T,Central Toronto,"Moore Park, Summerhill East",3,3,3,3,3,3
M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West",6,6,6,6,6,6


### Identifying Unique Places

In [38]:
Places_df['VenueCategory'].unique()[:50]

array(['Trail', 'Park', 'Other Great Outdoors', 'Greek Restaurant',
       'Cosmetics Shop', 'Ice Cream Shop', 'Italian Restaurant',
       'Yoga Studio', 'Brewery', 'Fruit & Vegetable Store', 'Pub',
       'Juice Bar', 'Restaurant', 'Bubble Tea Shop', 'Diner',
       'Dessert Shop', 'Indian Restaurant', 'Coffee Shop',
       'Fish & Chips Shop', 'Burger Joint', 'Sushi Restaurant',
       'Liquor Store', 'Pet Store', 'Burrito Place', 'Movie Theater',
       'Fast Food Restaurant', 'Board Shop', 'Pizza Place',
       'Light Rail Station', 'Intersection', 'Hotel', 'Sandwich Place',
       'Bookstore', 'Fish Market', 'Café', 'Bakery', 'Cheese Shop',
       'Gay Bar', 'Seafood Restaurant', 'Thai Restaurant',
       'Comfort Food Restaurant', 'American Restaurant',
       'Latin American Restaurant', 'Gastropub', 'Bar', 'Clothing Store',
       'Bank', 'Convenience Store', 'Thrift / Vintage Store', 'Lawyer'],
      dtype=object)

### Analysing Each Area

In [39]:
toronto_Top_Area = pd.get_dummies(Places_df[['VenueCategory']], prefix="", prefix_sep="")

toronto_Top_Area['PostalCode'] = Places_df['PostalCode'] 
toronto_Top_Area['Borough'] = Places_df['Borough'] 
toronto_Top_Area['Neighborhoods'] = Places_df['Neighborhood'] 

One_columns = list(toronto_Top_Area.columns[-3:]) + list(toronto_Top_Area.columns[:-3])
toronto_Top_Area = toronto_Top_Area[One_columns]

print(toronto_Top_Area.shape)
toronto_Top_Area.head()

(936, 193)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Art Gallery,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4K,East Toronto,"The Danforth West, Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,East Toronto,"The Danforth West, Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Groupby Frequencies of Occurence

In [40]:
Top_Area_grouped = toronto_Top_Area.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()

print(Top_Area_grouped.shape)
Top_Area_grouped

(38, 193)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Art Gallery,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,East Toronto,"The Danforth West, Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
2,M4L,East Toronto,"The Beaches West, India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,Central Toronto,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,Central Toronto,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M4S,Central Toronto,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,Central Toronto,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Top 10 places in each area

In [41]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = Top_Area_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = Top_Area_grouped['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = Top_Area_grouped['Neighborhoods']

for ind in np.arange(Top_Area_grouped.shape[0]):
    row_categories = Top_Area_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted

(38, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,Park,Other Great Outdoors,Trail,Dog Run,Fish & Chips Shop,Festival,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Ethiopian Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",Greek Restaurant,Restaurant,Ice Cream Shop,Yoga Studio,Italian Restaurant,Cosmetics Shop,Dessert Shop,Diner,Coffee Shop,Pub
2,M4L,East Toronto,"The Beaches West, India Bazaar",Park,Ice Cream Shop,Burger Joint,Fish & Chips Shop,Light Rail Station,Liquor Store,Italian Restaurant,Board Shop,Sushi Restaurant,Intersection
3,M4M,East Toronto,Studio District,Coffee Shop,Italian Restaurant,Café,Comfort Food Restaurant,Gastropub,Fish Market,Seafood Restaurant,Latin American Restaurant,Cheese Shop,Clothing Store
4,M4N,Central Toronto,Lawrence Park,Photography Studio,Lawyer,Gym / Fitness Center,Yoga Studio,Dumpling Restaurant,Fish Market,Fish & Chips Shop,Festival,Fast Food Restaurant,Farmers Market
5,M4P,Central Toronto,Davisville North,Convenience Store,Gym,Flower Shop,Breakfast Spot,Harbor / Marina,Gym / Fitness Center,Fish Market,Fish & Chips Shop,Festival,Fast Food Restaurant
6,M4R,Central Toronto,North Toronto West,Boutique,Food & Drink Shop,Food,Flower Shop,Fish Market,Fish & Chips Shop,Festival,Fast Food Restaurant,Farmers Market,Falafel Restaurant
7,M4S,Central Toronto,Davisville,Dessert Shop,Café,Toy / Game Store,Italian Restaurant,Coffee Shop,Pizza Place,Indian Restaurant,Thai Restaurant,Diner,Costume Shop
8,M4T,Central Toronto,"Moore Park, Summerhill East",Home Service,Park,Beer Store,Dumpling Restaurant,Fish Market,Fish & Chips Shop,Festival,Fast Food Restaurant,Farmers Market,Falafel Restaurant
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",Coffee Shop,Athletics & Sports,Liquor Store,Supermarket,Empanada Restaurant,Flower Shop,Fish Market,Fish & Chips Shop,Festival,Fast Food Restaurant


### Clustering using K-means

In [42]:
kclusters = 5

Top_venues_clustering = Top_Area_grouped.drop(["PostalCode", "Borough", "Neighborhoods"], 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Top_venues_clustering)

kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 2, 0, 0, 0], dtype=int32)

In [47]:
merged = NB_df_new.copy()

merged["Cluster Labels"] = kmeans

merged = merged.join(neighborhoods_venues_sorted.drop(["Borough", "Neighborhoods"], 1).set_index("PostalCode"), on="PostalCode")

print(merged.shape)
merged.head()

(103, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,"KMeans(algorithm='auto', copy_x=True, init='k-...",,,,,,,,,,
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,"KMeans(algorithm='auto', copy_x=True, init='k-...",,,,,,,,,,
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,"KMeans(algorithm='auto', copy_x=True, init='k-...",,,,,,,,,,
3,M1G,Scarborough,Woburn,43.770992,-79.216917,"KMeans(algorithm='auto', copy_x=True, init='k-...",,,,,,,,,,
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,"KMeans(algorithm='auto', copy_x=True, init='k-...",,,,,,,,,,


In [50]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['PostalCode'], merged['Borough'], merged['Neighborhood'], merged['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow,
        fill=True,
        fill_color=rainbow,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters