## Importing Libraries

In [11]:
import pandas as pd
import numpy as np
import folium

## Scraping Data from the wiki page

In [2]:
from pandas.io.html import read_html
link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikitables=read_html(link,attrs = {'class': 'wikitable'})

In [3]:
canada1=wikitables[0]

In [4]:
canada1=canada1[canada1['Borough']!='Not assigned'].reset_index(drop=True)
canada1=canada1.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()

In [155]:
canada1.shape

(103, 3)

In [165]:
canada2=wikitables[0]
canada2=canada2[canada2['Borough']!='Not assigned'].reset_index(drop=True)


In [169]:
canada2['Neighbourhood'].nunique()

208

In [156]:
canada1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## Reading Geospatial data

In [6]:
lat_long=pd.read_csv('Geospatial_Coordinates.csv')
lat_long.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


# Merging the geo spatial data with canada1 data frame

In [7]:
canada1=canada1.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()

In [8]:
canada=pd.merge(canada1,lat_long,how='inner',left_on='Postcode',right_on='Postal Code')[['Postcode','Borough','Neighbourhood','Latitude','Longitude']]
canada.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Getting only Totonto data

In [28]:
#toronto=canada[(canada['Borough']=='West Toronto')|(canada['Borough']=='Downtown Toronto')|(canada['Borough']=='East Toronto')|(canada['Borough']=='Central Toronto')]
toronto=toronto.reset_index(drop=True)
toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


## Creating a map to see how the neighbourhoods are clustered in toronto

In [12]:
# create map of New York using latitude and longitude values
map_canada = folium.Map(location=[43.6532,-79.3832], zoom_start=10)

# add markers to map
for Latitude,Longitude, Borough, Neighbourhood in zip(toronto['Latitude'],toronto['Longitude'],toronto['Borough'], toronto['Neighbourhood']):
    label = '{}, {}'.format(Neighbourhood, Borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [Latitude, Longitude],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_canada)  
    
map_canada

## Define Foursquare credentials

In [16]:
CLIENT_ID = '531ZWTMMGE1X44QESSWGTCZI1DBQS51HE2HT1LS5Z0SFPOAK' # your Foursquare ID
CLIENT_SECRET = '0TA3HPRQQEC3Z4CI5TOAISVNPPG2GYK2EJNOHX00CXAEWO0X' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 531ZWTMMGE1X44QESSWGTCZI1DBQS51HE2HT1LS5Z0SFPOAK
CLIENT_SECRET:0TA3HPRQQEC3Z4CI5TOAISVNPPG2GYK2EJNOHX00CXAEWO0X


In [29]:
toronto.loc[0,'Neighbourhood']

'The Beaches'

In [33]:
neighbourhood_latitude=toronto.loc[0,'Latitude']
neighbourhood_longitude=toronto.loc[0,'Longitude']
neighbourhood_name=toronto.loc[0,'Neighbourhood']
print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


## Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.

## creating get request:

In [35]:
LIMIT = 100 
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url 

'https://api.foursquare.com/v2/venues/explore?&client_id=531ZWTMMGE1X44QESSWGTCZI1DBQS51HE2HT1LS5Z0SFPOAK&client_secret=0TA3HPRQQEC3Z4CI5TOAISVNPPG2GYK2EJNOHX00CXAEWO0X&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [38]:
import requests

In [39]:
results1=requests.get(url).json()
results1

{'meta': {'code': 200, 'requestId': '5e6344d0882fc7001bd00598'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 5,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'labe

##  creating a function that extracts the category of the venue

In [40]:

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [43]:
from pandas.io.json import json_normalize 

In [44]:
venues = results1['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869
4,Seaspray Restaurant,Asian Restaurant,43.678888,-79.298167


In [45]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


## Exploring the Neighborhoods in Torronto

In [47]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Creating a dataframe called toronto_venues

In [49]:
toronto_venues = getNearbyVenues(names=toronto['Neighbourhood'],
                                   latitudes=toronto['Latitude'],
                                   longitudes=toronto['Longitude']
                                  )

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvalles
Runnymede

In [50]:
print(toronto_venues.shape)
toronto_venues.head()

(1728, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Beaches,43.676357,-79.293031,Seaspray Restaurant,43.678888,-79.298167,Asian Restaurant


In [51]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,56,56,56,56,56,56
"Brockton,Exhibition Place,Parkdale Village",24,24,24,24,24,24
Business Reply Mail Processing Centre 969 Eastern,18,18,18,18,18,18
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",17,17,17,17,17,17
"Cabbagetown,St. James Town",47,47,47,47,47,47
Central Bay Street,79,79,79,79,79,79
"Chinatown,Grange Park,Kensington Market",86,86,86,86,86,86
Christie,18,18,18,18,18,18
Church and Wellesley,85,85,85,85,85,85


In [52]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 237 uniques categories.


## Applying one-hot encoding:

In [99]:
# one hot encoding
toront_onehott = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toront_onehott['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toront_onehott.columns[-1]] + list(toront_onehott.columns[:-1])
toront_onehott = toront_onehott[fixed_columns]

toront_onehott.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [110]:
toront_onehott#['Neighbourhood']=toronto_venues['Neighborhood']

Unnamed: 0,Neighbourhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,"The Danforth West,Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"The Danforth West,Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"The Danforth West,Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,"The Danforth West,Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"The Danforth West,Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
fixed_columns = [toront_onehott.columns[-1]] + list(toront_onehott.columns[:-1])
toront_onehott = toront_onehott[fixed_columns]

In [111]:
toront_onehott.shape

(1728, 238)

## Grouping the neighborhoods and taking the mean of frequency of occurance of each category

In [112]:
toronto_grouped = toront_onehott.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012658,...,0.0,0.0,0.0,0.012658,0.0,0.0,0.012658,0.0,0.0,0.0
7,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.034884,0.0,0.05814,0.011628,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011765,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.011765,...,0.0,0.0,0.0,0.0,0.0,0.011765,0.0,0.011765,0.011765,0.0


In [113]:
toronto_grouped.shape

(39, 237)

## Top 5 most common venues in each neighborhood 

In [114]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
              venue  freq
0       Coffee Shop  0.07
1        Restaurant  0.05
2   Thai Restaurant  0.04
3              Café  0.04
4  Sushi Restaurant  0.03


----Berczy Park----
            venue  freq
0     Coffee Shop  0.09
1      Restaurant  0.04
2  Farmers Market  0.04
3          Bakery  0.04
4        Beer Bar  0.04


----Brockton,Exhibition Place,Parkdale Village----
            venue  freq
0            Café  0.12
1     Coffee Shop  0.08
2  Breakfast Spot  0.08
3          Bakery  0.08
4     Yoga Studio  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0         Yoga Studio  0.06
1                 Spa  0.06
2       Garden Center  0.06
3              Garden  0.06
4  Light Rail Station  0.06


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
              venue  freq
0   Airport Service  0.18
1    Airport Lounge  0.12
2  Airport Terminal  0.12
3   Har

## Creating a data frame for the above top 5 venues of each neighborhood

In [115]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [118]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Restaurant,Thai Restaurant,Café,Sushi Restaurant,Bar,Lounge,Steakhouse,Cosmetics Shop,Bakery
1,Berczy Park,Coffee Shop,Bakery,Cheese Shop,Beer Bar,Seafood Restaurant,Farmers Market,Cocktail Bar,Café,Restaurant,Breakfast Spot
2,"Brockton,Exhibition Place,Parkdale Village",Café,Bakery,Breakfast Spot,Coffee Shop,Yoga Studio,Stadium,Burrito Place,Restaurant,Climbing Gym,Pet Store
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Auto Workshop,Park,Comic Shop,Pizza Place,Recording Studio,Restaurant,Burrito Place,Brewery,Light Rail Station
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Lounge,Airport Terminal,Sculpture Garden,Boutique,Bar,Boat or Ferry,Plane,Coffee Shop,Harbor / Marina


## Filtering out the Coffee Shops

In [172]:
to_coffee=toronto_grouped[['Neighborhood','Coffee Shop']]

## Cluster Neighbourhoods

In [120]:
from sklearn.cluster import KMeans

In [173]:
# set number of clusters
kclusters = 5

to_coffee_clustering = to_coffee.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [174]:
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       4, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [176]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
to_coffee_merged = to_coffee.copy()

# add clustering labels
to_coffee_merged["Cluster Labels"] = kmeans.labels_

In [187]:
to_coffee_merged.rename(columns={"Neighborhood": "Neighbourhood"}, inplace=True)
to_coffee_merged.head()

Unnamed: 0,Neighbourhood,Coffee Shop,Cluster Labels
0,"Adelaide,King,Richmond",0.07,0
1,Berczy Park,0.089286,0
2,"Brockton,Exhibition Place,Parkdale Village",0.083333,0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.058824,0


In [191]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
to_coffee_merged = to_coffee_merged.join(toronto.set_index('Neighbourhood'),on="Neighbourhood")

print(to_coffee_merged.shape)
to_coffee_merged.head() # check the last columns!

(39, 7)


Unnamed: 0,Neighbourhood,Coffee Shop,Cluster Labels,Postcode,Borough,Latitude,Longitude
0,"Adelaide,King,Richmond",0.07,0,M5H,Downtown Toronto,43.650571,-79.384568
1,Berczy Park,0.089286,0,M5E,Downtown Toronto,43.644771,-79.373306
2,"Brockton,Exhibition Place,Parkdale Village",0.083333,0,M6K,West Toronto,43.636847,-79.428191
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0,M7Y,East Toronto,43.662744,-79.321558
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.058824,0,M5V,Downtown Toronto,43.628947,-79.39442


In [193]:
##Sort the results by cluster Labels
print(to_coffee_merged.shape)
to_coffee_merged.sort_values(['Cluster Labels'],inplace=True)
to_coffee_merged

(39, 7)


Unnamed: 0,Neighbourhood,Coffee Shop,Cluster Labels,Postcode,Borough,Latitude,Longitude
0,"Adelaide,King,Richmond",0.07,0,M5H,Downtown Toronto,43.650571,-79.384568
36,The Beaches,0.0,0,M4E,East Toronto,43.676357,-79.293031
35,"The Annex,North Midtown,Yorkville",0.090909,0,M5R,Central Toronto,43.67271,-79.405678
34,Studio District,0.069767,0,M4M,East Toronto,43.659526,-79.340923
33,Stn A PO Boxes 25 The Esplanade,0.115789,0,M5W,Downtown Toronto,43.646435,-79.374846
32,St. James Town,0.08,0,M5C,Downtown Toronto,43.651494,-79.375418
31,"Ryerson,Garden District",0.08,0,M5B,Downtown Toronto,43.657162,-79.378937
30,"Runnymede,Swansea",0.085714,0,M6S,West Toronto,43.651571,-79.48445
27,Queen's Park,0.232558,0,M7A,Downtown Toronto,43.662301,-79.389494
26,"Parkdale,Roncesvalles",0.071429,0,M6R,West Toronto,43.64896,-79.456325


### Finally lets Visulaize the clusters

In [194]:
# create map
map_clusters_cof = folium.Map(location=[43.653963, -79.387207], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(to_coffee_merged['Latitude'], to_coffee_merged['Longitude'], to_coffee_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters_cof)
       
map_clusters_cof

In [195]:
#daving the map as html
map_clusters_cof.save('map_clusters_cof.html')

## Examining the Clusters.

# Cluster 0

In [199]:
to_coffee_merged.loc[to_coffee_merged['Cluster Labels']==0]

Unnamed: 0,Neighbourhood,Coffee Shop,Cluster Labels,Postcode,Borough,Latitude,Longitude
0,"Adelaide,King,Richmond",0.07,0,M5H,Downtown Toronto,43.650571,-79.384568
36,The Beaches,0.0,0,M4E,East Toronto,43.676357,-79.293031
35,"The Annex,North Midtown,Yorkville",0.090909,0,M5R,Central Toronto,43.67271,-79.405678
34,Studio District,0.069767,0,M4M,East Toronto,43.659526,-79.340923
33,Stn A PO Boxes 25 The Esplanade,0.115789,0,M5W,Downtown Toronto,43.646435,-79.374846
32,St. James Town,0.08,0,M5C,Downtown Toronto,43.651494,-79.375418
31,"Ryerson,Garden District",0.08,0,M5B,Downtown Toronto,43.657162,-79.378937
30,"Runnymede,Swansea",0.085714,0,M6S,West Toronto,43.651571,-79.48445
27,Queen's Park,0.232558,0,M7A,Downtown Toronto,43.662301,-79.389494
26,"Parkdale,Roncesvalles",0.071429,0,M6R,West Toronto,43.64896,-79.456325


In [204]:
to_coffee_merged.loc[to_coffee_merged['Cluster Labels']==0]['Borough'].value_counts()

Downtown Toronto    18
West Toronto         6
Central Toronto      5
East Toronto         5
Name: Borough, dtype: int64

## Cluster 1

In [200]:
to_coffee_merged.loc[to_coffee_merged['Cluster Labels']==1]

Unnamed: 0,Neighbourhood,Coffee Shop,Cluster Labels,Postcode,Borough,Latitude,Longitude
28,Rosedale,0.0,1,M4W,Downtown Toronto,43.679563,-79.377529
24,"Moore Park,Summerhill East",0.0,1,M4T,Central Toronto,43.689574,-79.38316


## Cluster 2

In [201]:
to_coffee_merged.loc[to_coffee_merged['Cluster Labels']==2]

Unnamed: 0,Neighbourhood,Coffee Shop,Cluster Labels,Postcode,Borough,Latitude,Longitude
29,Roselawn,0.0,2,M5N,Central Toronto,43.711695,-79.416936


## Cluster 3

In [202]:
to_coffee_merged.loc[to_coffee_merged['Cluster Labels']==3]

Unnamed: 0,Neighbourhood,Coffee Shop,Cluster Labels,Postcode,Borough,Latitude,Longitude
17,"Forest Hill North,Forest Hill West",0.0,3,M5P,Central Toronto,43.696948,-79.411307


## Cluster 4

In [203]:
to_coffee_merged.loc[to_coffee_merged['Cluster Labels']==4]

Unnamed: 0,Neighbourhood,Coffee Shop,Cluster Labels,Postcode,Borough,Latitude,Longitude
22,Lawrence Park,0.0,4,M4N,Central Toronto,43.72802,-79.38879


## Obsevations:

Most of the coffee shops are concentrated in Downtown area of Toronto, with moderate number in cluster0 and there are no coffee shops in clusters 1, 2, 3, 4.So this clusters represents a great opportunity and are high potential areas to open a new coffee shop outlet as there is no competition from existing coffee shops. Meanwhile the coffee shops in cluster0 are facing moderate competition from the existing coffee shops. Therefore this project recommends the budding entrepreneurs to open coffee shops in the clusters 1,2,3,4 where there is a high chance of growing business as there are no coffee shops. The entrepreneurs are not advised not to start in cluster0 as it has moderate concentration of coffee shops and the competition will be there from the existing shops. Entrepreneurs can even setup in cluster0 with some innovative thoughts and can try to sustain the competition.