In [2]:
import pandas as pd
import numpy as np

In [3]:
!conda install -c conda-forge geopy --yes

Solving environment: ...working... done

# All requested packages already installed.



In [4]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: ...working... done

# All requested packages already installed.



In [5]:
#install Beautiful Soup
!conda install -c conda-forge beautifulsoup4 --yes

Solving environment: ...working... done

# All requested packages already installed.



In [6]:
#install XML parser
!conda install -c conda-forge lxml --yes

Solving environment: ...working... done

# All requested packages already installed.



In [7]:
#install HMTL5LIB
!conda install -c conda-forge html5lib --yes

Solving environment: ...working... done

# All requested packages already installed.



In [8]:
#install requests library
!conda install -c conda-forge requests --yes

Solving environment: ...working... done

# All requested packages already installed.



In [9]:
#import packages needed
from bs4 import BeautifulSoup
import requests

In [10]:
#define URL

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#get request object

source = requests.get(url).text

In [11]:
#define variable 

soup = BeautifulSoup(source, 'lxml')

print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":876823784,"wgRevisionId":876823784,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

In [12]:
#define table

table = soup.find('table')
print(table.prettify())

<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    Postcode
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighbourhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    <a href="/wiki/North_York" title="North York">
     North York
    </a>
   </td>
   <td>
    <a href="/wiki/Parkwoods" title="Parkwoods">
     Parkwoods
    </a>
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    <a href="/wiki/North_York" title="North York">
     North York
    </a>
   </td>
   <td>
    <a href="/wiki/Victoria_Village" title="Victoria Village">
     Victoria Village
    </a>
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    <a href="/wiki/Downtown_Toronto" title="Downtown Toronto">
     Downtown Toronto
    </a>
   </td>
   <td>
    <a href="

# Create dataframe

In [13]:
#find the values of the headers used for the dataframe

table = soup.find('table')

table_columns = []
for items in table.tr:
    if 'Tag' in str(type(items)):
        table_columns.append(items.text.rstrip())
table_columns

['Postcode', 'Borough', 'Neighbourhood']

In [14]:
#read in table contents and add to DF

table_columns[2] = 'Neighborhood'

In [15]:
table_contents = []
for items in table.tbody:
    if 'Tag' in str(type(items)):
        for i in items:
            if 'Tag' in str(type(i)): 
                table_contents.append(i.text.rstrip()) # strip values of '/n'

#skip first 3 values as they are table headers
table_contents = table_contents[3:]

#show first 20 
table_contents[:20]

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village',
 'M5A',
 'Downtown Toronto',
 'Harbourfront',
 'M5A',
 'Downtown Toronto',
 'Regent Park',
 'M6A',
 'North York']

# Convert list to an array of 3 columns

In [16]:
table_content_array = np.array(table_contents).reshape(int(len(table_contents)/3), 3)
table_content_array[0:3]

array([['M1A', 'Not assigned', 'Not assigned'],
       ['M2A', 'Not assigned', 'Not assigned'],
       ['M3A', 'North York', 'Parkwoods']], dtype='<U49')

# Distribute values from array into dataframe

In [17]:
df = pd.DataFrame(columns=table_columns, data=table_content_array)
df.head(11)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


# Data Cleaning/Manipulation
## Remove "Not Assigned" Values from DF

In [18]:
#put string to UPPER to compare text values 
not_assigned = df['Borough'].str.upper() == 'Not assigned'.upper()

print('there are', not_assigned.sum(), 'cases not assigned a borough value')

there are 77 cases not assigned a borough value


In [19]:
#remove values that are not assigned from dataframe. not_assigned values removed with not operator "~"
df = df[~not_assigned]

In [20]:
#define empty lists
postalcode_list, borough_list, neighborhood_list = [], [], []

#grouping data with same postal code
df_groups = df.groupby('Postcode').groups

for postalcode in df_groups.keys():
    postalcode_list.append(postalcode)
    borough_list.append(pd.unique(df.loc[df_groups[postalcode]]['Borough'])[0])
    same_postalcode_neigh = df.loc[df_groups[postalcode]]['Neighborhood']
    neighborhood_list.append(', '.join(same_postalcode_neigh))

### Redistribute the data into a dataframe

In [21]:
df = pd.DataFrame.from_dict({table_columns[0]: postalcode_list, 
                                      table_columns[1]: borough_list,
                                      table_columns[2]: neighborhood_list,})
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# If a cell has a value in the borough, but not in the neighborhood --> assign borough as neighborhood

In [22]:
print((df['Neighborhood'].str.upper() == 'NOT ASSIGNED').sum())

1


In [23]:
# Reassign borough value to value in for updated dataframe

not_assigned_neighborhoods = df['Neighborhood'].str.upper() == 'NOT ASSIGNED'
df.loc[not_assigned_neighborhoods, 'Neighborhood'] = df.loc[not_assigned_neighborhoods, 'Borough']

In [24]:
#view the final dataframe

df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [25]:
#show shape of dataframe

df.shape

(103, 3)

In [26]:
try:
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
    print('Nominatim imported!')
except:
    print('installing geopy')
    !pip install geopy
    print('installed!')
    
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
    print('Nominatim imported!')

Nominatim imported!


In [27]:
#create dataframe to store the location

postalcode_LL_df = pd.DataFrame(columns=['Postal Code', 'Latitude', 'Longitude'])
postalcode_LL_df

Unnamed: 0,Postal Code,Latitude,Longitude


In [28]:
#import csv file

postalcode_LL_df = pd.read_csv('Geospatial_Coordinates.csv')
postalcode_LL_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [29]:
#check to see if there are any mismatches in the csv to the dataframe

mismatch_postalcodes = df['Postcode'] != postalcode_LL_df['Postal Code']
print('there are' , mismatch_postalcodes.sum(), 'mismatched postal codes.')

there are 0 mismatched postal codes.


In [30]:
#assign values from CSV coordinates to datagrame

neighborhoods = df.assign(Latitude = postalcode_LL_df.loc[~mismatch_postalcodes, 'Latitude'],
                                   Longitude= postalcode_LL_df.loc[~mismatch_postalcodes, 'Longitude'])

neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Explore downtown Toronto Area

In [31]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


In [32]:
#import folium library

import folium

# Plotting Toronto Area

In [33]:
address = 'Toronto, ON'

geolocator = Nominatim()

toronto_location = geolocator.geocode(address)
toronto_latitude = toronto_location.latitude
toronto_longitude = toronto_location.longitude

print(toronto_location)
print('toronto_latitude:', toronto_latitude)
print('toronto_longitude:', toronto_longitude)

  This is separate from the ipykernel package so we can avoid doing imports until


Toronto, Ontario, M6K 1X9, Canada
toronto_latitude: 43.653963
toronto_longitude: -79.387207


In [34]:
#Define plot and create map

def plot_area(area_df, area_latitude, area_longitude, zool_level=10):
    
    area_map = folium.Map(location=[area_latitude, area_longitude], zoom_start=zool_level)

    #add markers to map
    for lat, lng, borough, neighborhood in zip(area_df['Latitude'], area_df['Longitude'], area_df['Borough'], area_df['Neighborhood']):

        label = '{}, ({})'.format(neighborhood, borough)
        label = folium.Popup(label, parse_html=True)

        folium.CircleMarker(
                            [lat, lng],
                            radius=5,
                            popup=label,
                            color='blue',
                            fill=True,
                            fill_color='#3186cc',
                            fill_opacity=0.7,
                            parse_html=False).add_to(area_map)  

    return area_map

## Plot Toronot Area

In [36]:
plot_area(neighborhoods, toronto_latitude, toronto_longitude)

# Analyze Toronto 

In [37]:
downtown_area = neighborhoods['Borough'].apply(lambda x: 'Downtown Toronto'.lower() in x.lower())
print('there are', downtown_area.sum(), 'boroughs with the word \'Toronto\'')

there are 18 boroughs with the word 'Toronto'


In [39]:
# Create dataframe to analyze Toronto

downtown_data = neighborhoods[downtown_area].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [41]:
#Get geographical coordinates of Toronto

address = 'Downtown Toronto, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

  """


The geograpical coordinate of Downtown Toronto are 43.655115, -79.380219.


In [42]:
#Plot Downtown Toronto with geographical coordinates

plot_area(downtown_data, latitude, longitude, 13)

In [43]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [44]:
# Explore Downtown Toronto Neighborhoods

CLIENT_ID = 'X5IGT1ZRISIECDUTWFJKTGQNBRRWW5RKWFFO3XMYZ5WTDNIY'
CLIENT_SECRET = '11KPPJDTZSVKMZBAYGRGUZLE0ZXI542H4DUFZGWHI5LTGDHB'
VERSION = '20180605'
LIMIT = 100
radius = 500

### Extract category of Venue

In [46]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### A function to explore the nearby venues for a given neighborhood

In [48]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [49]:
# Print only neighborhood names

downtown_venues = getNearbyVenues(names=downtown_data['Neighborhood'],
                                  latitudes=downtown_data['Latitude'],
                                  longitudes=downtown_data['Longitude']
                                  )

Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie


In [52]:
#Check number of venues
print(downtown_venues.shape)
downtown_venues.head()

(1287, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown, St. James Town",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


In [53]:
#Count how many venues are in each area
downtown_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,54,54,54,54,54,54
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",14,14,14,14,14,14
"Cabbagetown, St. James Town",48,48,48,48,48,48
Central Bay Street,85,85,85,85,85,85
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,16,16,16,16,16,16
Church and Wellesley,88,88,88,88,88,88
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"Design Exchange, Toronto Dominion Centre",100,100,100,100,100,100


In [54]:
#Check how many categories exist
print('There are {} uniques categories.'.format(len(downtown_venues['Venue Category'].unique())))

There are 207 uniques categories.


# Analyze each neighborhood in Downtown Toronto

In [56]:
# one hot encoding
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_onehot['Neighborhood'] = downtown_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]

downtown_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
downtown_onehot.shape

(1287, 207)

### Check frequency of each category 

In [58]:
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.011765,0.0,0.0,0.011765,0.0,0.0


In [59]:
downtown_grouped.shape

(18, 207)

In [61]:
#Sort each venue in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Create new dataframe for top ten categories

In [62]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,American Restaurant,Steakhouse,Thai Restaurant,Hotel,Bakery,Asian Restaurant,Bar,Clothing Store
1,Berczy Park,Coffee Shop,Restaurant,Cocktail Bar,Pub,Bakery,Seafood Restaurant,Farmers Market,Cheese Shop,Café,Steakhouse
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Terminal,Airport Service,Harbor / Marina,Boutique,Boat or Ferry,Plane,Airport Gate,Airport Food Court,Sculpture Garden
3,"Cabbagetown, St. James Town",Restaurant,Coffee Shop,Pharmacy,Indian Restaurant,Pizza Place,Café,Park,Bakery,Pub,Italian Restaurant
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Bar,Burger Joint,Ice Cream Shop,Indian Restaurant,Sandwich Place,Bubble Tea Shop,Chinese Restaurant
5,"Chinatown, Grange Park, Kensington Market",Café,Bar,Vegetarian / Vegan Restaurant,Dumpling Restaurant,Vietnamese Restaurant,Coffee Shop,Bakery,Mexican Restaurant,Chinese Restaurant,Gaming Cafe
6,Christie,Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Baby Store,Diner,Nightclub,Convenience Store,Coffee Shop
7,Church and Wellesley,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Gay Bar,Burger Joint,Restaurant,Bubble Tea Shop,Café,Mediterranean Restaurant,Gastropub
8,"Commerce Court, Victoria Hotel",Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Deli / Bodega,Gastropub,Bakery,Steakhouse,Italian Restaurant
9,"Design Exchange, Toronto Dominion Centre",Coffee Shop,Hotel,Café,Restaurant,American Restaurant,Italian Restaurant,Deli / Bodega,Gym,Gastropub,Steakhouse


# K-Means Clustering 

In [63]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [64]:
#define clusters
kclusters = 5

downtown_grouped_clustering = downtown_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_clustering)

#check labels for k-means
kmeans.labels_[0:10]

array([0, 4, 1, 4, 0, 0, 3, 0, 0, 0])

In [None]:
#downtown_merged['Cluster Labels'] = kmeans.labels

#downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

#downtown_merged.head()

In [65]:
downtown_merged = downtown_data

#add clustering labels
downtown_merged['Cluster Labels'] = kmeans.labels_

downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

downtown_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,0,Park,Playground,Trail,Women's Store,Deli / Bodega,Electronics Store,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,4,Restaurant,Coffee Shop,Pharmacy,Indian Restaurant,Pizza Place,Café,Park,Bakery,Pub,Italian Restaurant
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Gay Bar,Burger Joint,Restaurant,Bubble Tea Shop,Café,Mediterranean Restaurant,Gastropub
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,4,Coffee Shop,Park,Café,Pub,Bakery,Mexican Restaurant,Breakfast Spot,Restaurant,Theater,Dessert Shop
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Tea Room,Restaurant,Bar,Diner,Plaza


### View Clusters on Map

In [66]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

#set color for clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**3 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#add markerts to map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighborhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' [Cluster ' + str(cluster)+']', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

In [67]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 0, 
                    downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Park,Playground,Trail,Women's Store,Deli / Bodega,Electronics Store,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
4,Downtown Toronto,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Tea Room,Restaurant,Bar,Diner,Plaza
5,Downtown Toronto,0,Coffee Shop,Restaurant,Hotel,Café,Cocktail Bar,Park,Gastropub,Italian Restaurant,Bakery,Breakfast Spot
7,Downtown Toronto,0,Coffee Shop,Café,Italian Restaurant,Bar,Burger Joint,Ice Cream Shop,Indian Restaurant,Sandwich Place,Bubble Tea Shop,Chinese Restaurant
8,Downtown Toronto,0,Coffee Shop,Café,American Restaurant,Steakhouse,Thai Restaurant,Hotel,Bakery,Asian Restaurant,Bar,Clothing Store
9,Downtown Toronto,0,Coffee Shop,Aquarium,Hotel,Café,Pizza Place,Brewery,Scenic Lookout,Bakery,Restaurant,Italian Restaurant
10,Downtown Toronto,0,Coffee Shop,Hotel,Café,Restaurant,American Restaurant,Italian Restaurant,Deli / Bodega,Gym,Gastropub,Steakhouse
11,Downtown Toronto,0,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Deli / Bodega,Gastropub,Bakery,Steakhouse,Italian Restaurant
12,Downtown Toronto,0,Café,Bar,Coffee Shop,Bookstore,Japanese Restaurant,Restaurant,Bakery,Beer Bar,Beer Store,Sandwich Place
15,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Pub,Hotel,Seafood Restaurant,Cocktail Bar,Italian Restaurant,Farmers Market,Japanese Restaurant


In [68]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 1, 
                    downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,1,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Gay Bar,Burger Joint,Restaurant,Bubble Tea Shop,Café,Mediterranean Restaurant,Gastropub


In [69]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 2, 
                    downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,2,Airport Lounge,Airport Terminal,Airport Service,Harbor / Marina,Boutique,Boat or Ferry,Plane,Airport Gate,Airport Food Court,Sculpture Garden


In [70]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 3, 
                    downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Downtown Toronto,3,Coffee Shop,Restaurant,Cocktail Bar,Pub,Bakery,Seafood Restaurant,Farmers Market,Cheese Shop,Café,Steakhouse


In [71]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 4, 
                    downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,4,Restaurant,Coffee Shop,Pharmacy,Indian Restaurant,Pizza Place,Café,Park,Bakery,Pub,Italian Restaurant
3,Downtown Toronto,4,Coffee Shop,Park,Café,Pub,Bakery,Mexican Restaurant,Breakfast Spot,Restaurant,Theater,Dessert Shop
13,Downtown Toronto,4,Café,Bar,Vegetarian / Vegan Restaurant,Dumpling Restaurant,Vietnamese Restaurant,Coffee Shop,Bakery,Mexican Restaurant,Chinese Restaurant,Gaming Cafe
16,Downtown Toronto,4,Café,Coffee Shop,Hotel,Restaurant,American Restaurant,Bakery,Seafood Restaurant,Steakhouse,Bar,Asian Restaurant
17,Downtown Toronto,4,Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Baby Store,Diner,Nightclub,Convenience Store,Coffee Shop
