In [7]:
#conda install -c anaconda xlrd --yes
import os

import folium
import lxml
import numpy as np
import pandas as pd
import requests

from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

# Let's get the data from Wikipedia and then save it to a file

In [12]:
# Download the Wikipedia list of "M" postal codes and flatten the first table
# with class "wikitable" into CSV text (`table1`), one line per <tr>.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()  # stop here instead of parsing an HTTP error page
website_url = response.text  # name kept for compatibility; it holds the page HTML
parser = BeautifulSoup(website_url, 'html.parser')
table = parser.find('table', {'class': 'wikitable'})
if table is None:
    raise ValueError('No table with class "wikitable" found at ' + WIKI_URL)

headers = "Postcode,Borough,Neighbourhood"
csv_lines = []
for tr in table.find_all('tr'):
    cells = []
    for tds in tr.find_all('td'):
        text = tds.text.strip()
        # Empty cells become "Not assigned"; commas inside a cell are replaced
        # by spaces so they cannot be mistaken for CSV separators.
        cells.append(text.replace(",", " ") if text else "Not assigned")
    # A row with no <td> cells (presumably the <th> header row) yields just
    # "\n". `headers` has no trailing newline of its own, so that blank line
    # is what separates the header from the data when both are written out.
    csv_lines.append(",".join(cells) + "\n")
table1 = "".join(csv_lines)

In [15]:
# Persist the header line followed by the scraped rows.
# The `with` block closes (and therefore flushes) the file before any later
# cell reads toronto.csv. errors="ignore" silently drops non-ASCII characters.
with open("toronto.csv", "wb") as file:
    file.write(bytes(headers, encoding="ascii", errors="ignore"))
    file.write(bytes(table1, encoding="ascii", errors="ignore"))

7136

# The data will be loaded into a data frame and then merged with the geospatial data

In [None]:
# Load the scraped table and reduce it to one row per (Postcode, Borough).
df = pd.read_csv('toronto.csv')
df.columns = ["Postcode", "Borough", "Neighbourhood"]
# Rows without a borough carry no usable information.
df = df[df['Borough'] != "Not assigned"]
# Concatenate the neighbourhood names that share a postcode, then turn the
# group keys back into ordinary columns so 'Borough' can be read below.
df = df.groupby(['Postcode', 'Borough'], sort=False).agg(', '.join).reset_index()
# Where the neighbourhood is still "Not assigned", fall back to the borough
# name of the same row (row-wise; Series.where keeps values where the
# condition is True and takes the replacement elsewhere).
df['Neighbourhood'] = df['Neighbourhood'].where(
    df['Neighbourhood'] != "Not assigned", df['Borough'])

In [17]:
# Fetch the postcode -> coordinates table and give its three columns the
# names used by the rest of the notebook.
GEO_URL = 'http://cocl.us/Geospatial_data'
df_geo = pd.read_csv(GEO_URL)
df_geo.columns = 'Postcode Latitude Longitude'.split()

In [18]:
# Attach coordinates to each postcode (inner join on 'Postcode').
df_merge = df.merge(df_geo, on='Postcode')
df_merge.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


In [19]:
# Report both figures from df_merge, the frame every later cell works with,
# so the two counts always describe the same data. The second figure is the
# number of rows in df_merge.
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
    df_merge['Borough'].nunique(),
    df_merge.shape[0],
))

The dataframe has 10 boroughs and 103 neighborhoods.


In [21]:
# Number of non-null 'Neighbourhood' entries per borough.
df_merge.groupby('Borough')['Neighbourhood'].count()

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Scarborough         17
West Toronto         6
York                 5
Name: Neighbourhood, dtype: int64

# North York, Downtown Toronto, Scarborough and Etobicoke are the four largest boroughs, so we are going to work with them

In [46]:
# Keep only the rows whose borough is one of the four selected ones.
top_list = ['North York', 'Downtown Toronto', 'Scarborough', 'Etobicoke']
is_top_borough = df_merge['Borough'].isin(top_list)
df_top = df_merge[is_top_borough]
df_top.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


In [47]:
# Rows per borough in the selected subset.
print(df_top.groupby('Borough')['Neighbourhood'].count())

Borough
Downtown Toronto    19
Etobicoke           12
North York          24
Scarborough         17
Name: Neighbourhood, dtype: int64


In [48]:
# Distinct borough names, in order of first appearance.
boroughs = df_top['Borough'].drop_duplicates().tolist()
print(boroughs)

['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough']


In [49]:
# Map centre: the mean latitude/longitude of the selected rows.
latitude = df_top.loc[:, 'Latitude'].mean()
longitude = df_top.loc[:, 'Longitude'].mean()
print('The geographical coordinates of Toronto are {lat}, {lon}'.format(lat=latitude, lon=longitude))

The geographical coordinates of Toronto are 43.71390571527778, -79.39359598888889


In [50]:
# Give every borough a random "#RRGGBB" colour for its map markers.
# A seeded generator makes the colours identical on every run.
color_rng = np.random.RandomState(0)
borough_color = {
    borough: '#%02X%02X%02X' % tuple(color_rng.randint(0, 256, size=3))
    for borough in boroughs
}
print(borough_color)

{'North York': '#634C3C', 'Downtown Toronto': '#5DBACB', 'Etobicoke': '#5D76A1', 'Scarborough': '#653210'}


# Let's check the map to see the top four boroughs

In [51]:
# Draw one circle per row of df_top, outlined in its borough's colour.
# The map object is named `map_boroughs` so the builtin `map` is not shadowed.
map_boroughs = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighbourhood in zip(df_top['Latitude'], df_top['Longitude'],
                                            df_top['Borough'], df_top['Neighbourhood']):
    # parse_html=True makes the popup treat the label as text, so characters
    # such as the apostrophe in "Queen's Park" cannot break the generated page.
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=borough_color[borough],
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_boroughs)

map_boroughs

In [52]:
# Foursquare API settings.
# The credentials are read from the environment so that no secret is stored
# in the notebook. NOTE(review): the key pair that used to be hardcoded here
# should be treated as leaked and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'before calling the Foursquare API.')
VERSION = '20180605'  # sent as the `v` query parameter
LIMIT = 100           # sent as the `limit` query parameter
radius = 500          # matches the default `radius` argument of getNearbyVenues

In [54]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "venues/explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel with ``zip``; each triple describes one location.
    radius : number, default 500
        Forwarded as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The spelling "Neighborhood" is the one the later cells of this
        notebook look up. The frame is empty (but has these columns) when
        no venue is returned. ``Venue Category`` is None for a venue that
        lists no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)  # progress indicator

        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing `columns` here keeps the schema intact even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Let's get the nearby venues

In [56]:
# Look up the venues around every selected postcode's coordinates.
venues = getNearbyVenues(
    df_top['Neighbourhood'],
    df_top['Latitude'],
    df_top['Longitude'],
)

Parkwoods
Victoria Village
Regent Park / Harbourfront
Lawrence Manor / Lawrence Heights
Queen's Park / Ontario Provincial Government
Islington Avenue
Malvern / Rouge
Don Mills
Garden District  Ryerson
Glencairn
West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale
Rouge Hill / Port Union / Highland Creek
Don Mills
St. James Town
Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood
Guildwood / Morningside / West Hill
Berczy Park
Woburn
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor / Wilson Heights / Downsview North
Richmond / Adelaide / King
Scarborough Village
Fairview / Henry Farm / Oriole
Northwood Park / York University
Harbourfront East / Union Station / Toronto Islands
Kennedy Park / Ionview / East Birchmount Park
Bayview Village
Downsview
Toronto Dominion Centre / Design Exchange
Golden Mile / Clairlea / Oakridge
York Mills / Silver Hills
Downsview
Commerce Court / Victoria Hotel
North Park / Maple Leaf Park / Upwood Park


In [61]:
# Non-null count of every venue column, per neighbourhood.
venues.groupby('Neighborhood').agg('count')

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
Alderwood / Long Branch,8,8,8,8,8,8
Bathurst Manor / Wilson Heights / Downsview North,20,20,20,20,20,20
Bayview Village,4,4,4,4,4,4
Bedford Park / Lawrence Manor East,25,25,25,25,25,25
...,...,...,...,...,...,...
Wexford / Maryvale,6,6,6,6,6,6
Willowdale,39,39,39,39,39,39
Woburn,4,4,4,4,4,4
York Mills / Silver Hills,1,1,1,1,1,1


In [79]:
# Distinct venue categories, in order of first appearance.
pd.unique(venues['Venue Category'])

array(['Park', 'Food & Drink Shop', 'Hockey Arena', 'Coffee Shop',
       'Portuguese Restaurant', 'Pizza Place', 'Bakery',
       'Distribution Center', 'Spa', 'Breakfast Spot', 'Restaurant',
       'Historic Site', 'Pub', 'Farmers Market', 'Chocolate Shop',
       'Dessert Shop', 'Theater', 'Performing Arts Venue',
       'Gym / Fitness Center', 'French Restaurant', 'Café',
       'Mexican Restaurant', 'Event Space', 'Yoga Studio',
       'Ice Cream Shop', 'Shoe Store', 'Art Gallery', 'Cosmetics Shop',
       'Electronics Store', 'Bank', 'Beer Store', 'Hotel',
       'Health Food Store', 'Wine Shop', 'Antique Shop', 'Boutique',
       'Furniture / Home Store', 'Vietnamese Restaurant',
       'Clothing Store', 'Accessories Store', "Women's Store",
       'Arts & Crafts Store', 'Miscellaneous Shop', 'Italian Restaurant',
       'Beer Bar', 'Creperie', 'Burrito Place', 'Diner', 'Hobby Shop',
       'Discount Store', 'Fried Chicken Joint', 'Burger Joint',
       'Juice Bar', 'Sandwich Pl

# There are a lot of possibilities; I want to explore coffee shops

Now we preprocess the data: one-hot encode the venue categories and take the mean of each category per neighbourhood

In [81]:
# One indicator column per venue category (no prefix on the column names).
to_onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
# Append the neighbourhood of each venue, then rotate that last column to the front.
to_onehot['Neighborhoods'] = venues['Neighborhood']
column_order = to_onehot.columns.tolist()
to_onehot = to_onehot[column_order[-1:] + column_order[:-1]]
# Mean of each indicator per neighbourhood = share of that category among its venues.
per_neighborhood = to_onehot.groupby(["Neighborhoods"])
to_grouped = per_neighborhood.mean().reset_index()
to_grouped

Unnamed: 0,Neighborhoods,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0
1,Alderwood / Long Branch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0
2,Bathurst Manor / Wilson Heights / Downsview North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.05,0.000000,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0
4,Bedford Park / Lawrence Manor East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,Wexford / Maryvale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0
59,Willowdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.025641,0.0,0.0,0.0,0.0,0.0
60,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0
61,York Mills / Silver Hills,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0


# We have the data ready to apply kmeans

Now let's keep only the neighbourhood name and its Coffee Shop share

In [82]:
# Restrict the grouped frame to the neighbourhood name and its coffee-shop share.
coffee_columns = ["Neighborhoods", "Coffee Shop"]
to_coffee = to_grouped[coffee_columns]
to_coffee

Unnamed: 0,Neighborhoods,Coffee Shop
0,Agincourt,0.000000
1,Alderwood / Long Branch,0.125000
2,Bathurst Manor / Wilson Heights / Downsview North,0.100000
3,Bayview Village,0.000000
4,Bedford Park / Lawrence Manor East,0.080000
...,...,...
58,Wexford / Maryvale,0.000000
59,Willowdale,0.076923
60,Woburn,0.500000
61,York Mills / Silver Hills,0.000000


In [83]:
# Cluster the neighbourhoods on their coffee-shop share alone.
toclusters = 3
# `columns=` instead of a positional axis, which pandas 2 no longer accepts.
to_clustering = to_coffee.drop(columns=["Neighborhoods"])
# A fixed random_state keeps the cluster ids stable between runs, which the
# written analysis relies on when it names clusters by number. n_init is
# spelled out so the result does not depend on the installed scikit-learn
# version's default.
kmeans = KMeans(n_clusters=toclusters, n_init=10, random_state=0)
kmeans.fit(to_clustering)
kmeans.labels_[0:20]

array([0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1])

In [84]:
# Copy of to_coffee with the cluster id of each neighbourhood added and the
# key column renamed to "Neighborhood".
to_merged = (
    to_coffee
    .assign(**{"Cluster Labels": kmeans.labels_})
    .rename(columns={"Neighborhoods": "Neighborhood"})
)
to_merged.head()

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels
0,Agincourt,0.0,0
1,Alderwood / Long Branch,0.125,1
2,Bathurst Manor / Wilson Heights / Downsview North,0.1,1
3,Bayview Village,0.0,0
4,Bedford Park / Lawrence Manor East,0.08,1


In [85]:
# Bring the venue rows back in: each neighbourhood row is repeated once per
# venue found for it.
venues_by_neighborhood = venues.set_index("Neighborhood")
to_merged = to_merged.join(venues_by_neighborhood, on="Neighborhood")
print(to_merged.shape)
to_merged.head()

(1638, 9)


Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Agincourt,0.0,0,43.7942,-79.262029,Panagio's Breakfast & Lunch,43.79237,-79.260203,Breakfast Spot
0,Agincourt,0.0,0,43.7942,-79.262029,El Pulgarcito,43.792648,-79.259208,Latin American Restaurant
0,Agincourt,0.0,0,43.7942,-79.262029,Twilight,43.791999,-79.258584,Lounge
0,Agincourt,0.0,0,43.7942,-79.262029,Mark's,43.791179,-79.259714,Clothing Store
0,Agincourt,0.0,0,43.7942,-79.262029,Commander Arena,43.794867,-79.267989,Skating Rink


In [86]:
# Order the rows by cluster id.
to_merged = to_merged.sort_values(by=["Cluster Labels"])
to_merged.head()

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Agincourt,0.0,0,43.7942,-79.262029,Panagio's Breakfast & Lunch,43.79237,-79.260203,Breakfast Spot
32,Lawrence Manor / Lawrence Heights,0.0625,0,43.718518,-79.464763,Lac Vien Vietnamese Restaurant,43.721259,-79.468472,Vietnamese Restaurant
32,Lawrence Manor / Lawrence Heights,0.0625,0,43.718518,-79.464763,Orfus Road Shopping Outlets,43.719045,-79.460849,Clothing Store
32,Lawrence Manor / Lawrence Heights,0.0625,0,43.718518,-79.464763,Tim Hortons,43.719427,-79.467995,Coffee Shop
32,Lawrence Manor / Lawrence Heights,0.0625,0,43.718518,-79.464763,Ardene Shoes Outlet,43.718892,-79.461344,Accessories Store


In [87]:
# Map of the neighbourhoods coloured by cluster id.
# Centred on `latitude` / `longitude`, the centre computed earlier in this
# notebook (the names previously used here were never defined).
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=14)

# Colour per cluster id.
markers_colors = {
    0: 'red',
    1: 'blue',
    2: 'green',
    3: 'yellow',
    4: 'cyan',
    5: 'black',
}

# to_merged may hold one row per venue, so identical neighbourhood points can
# repeat many times; draw each distinct point once.
neighborhood_points = to_merged[
    ['Neighborhood Latitude', 'Neighborhood Longitude', 'Cluster Labels']
].drop_duplicates()

for lat, lon, cluster in zip(neighborhood_points['Neighborhood Latitude'],
                             neighborhood_points['Neighborhood Longitude'],
                             neighborhood_points['Cluster Labels']):
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        color=markers_colors[cluster],
        fill_color=markers_colors[cluster],
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters

In [88]:
# Coffee-shop rows that belong to cluster 0.
in_cluster_0 = to_merged['Cluster Labels'].eq(0)
is_coffee_shop = to_merged['Venue Category'].eq('Coffee Shop')
to_merged[in_cluster_0 & is_coffee_shop]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
32,Lawrence Manor / Lawrence Heights,0.0625,0,43.718518,-79.464763,Tim Hortons,43.719427,-79.467995,Coffee Shop
30,Kensington Market / Chinatown / Grange Park,0.064516,0,43.653206,-79.400049,Tim Hortons,43.653303,-79.405457,Coffee Shop
30,Kensington Market / Chinatown / Grange Park,0.064516,0,43.653206,-79.400049,I Deal Coffee,43.655058,-79.403254,Coffee Shop
30,Kensington Market / Chinatown / Grange Park,0.064516,0,43.653206,-79.400049,Dark Horse Espresso Bar,43.650564,-79.397018,Coffee Shop
30,Kensington Market / Chinatown / Grange Park,0.064516,0,43.653206,-79.400049,Little Pebbles,43.654883,-79.400264,Coffee Shop
51,Steeles West / L'Amoreaux West,0.0625,0,43.799525,-79.318389,Tim Hortons,43.799102,-79.318715,Coffee Shop
50,St. James Town / Cabbagetown,0.069767,0,43.667967,-79.367675,Jetfuel Coffee,43.665295,-79.368335,Coffee Shop
50,St. James Town / Cabbagetown,0.069767,0,43.667967,-79.367675,Tim Hortons,43.667169,-79.368849,Coffee Shop
50,St. James Town / Cabbagetown,0.069767,0,43.667967,-79.367675,Tim Hortons,43.665786,-79.368284,Coffee Shop
49,St. James Town,0.058824,0,43.651494,-79.375418,Everyday Gourmet (Teas & Coffees),43.648757,-79.371645,Coffee Shop


In [89]:
# Coffee-shop rows that belong to cluster 1.
in_cluster_1 = to_merged['Cluster Labels'].eq(1)
is_coffee_shop = to_merged['Venue Category'].eq('Coffee Shop')
to_merged[in_cluster_1 & is_coffee_shop]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
9,Central Bay Street,0.187500,1,43.657952,-79.387383,Jimmy's Coffee,43.658421,-79.385613,Coffee Shop
9,Central Bay Street,0.187500,1,43.657952,-79.387383,Starbucks,43.659456,-79.390411,Coffee Shop
9,Central Bay Street,0.187500,1,43.657952,-79.387383,Hailed Coffee,43.658833,-79.383684,Coffee Shop
9,Central Bay Street,0.187500,1,43.657952,-79.387383,Neo Coffee Bar,43.660140,-79.385870,Coffee Shop
44,Richmond / Adelaide / King,0.082474,1,43.650571,-79.384568,HotBlack Coffee,43.650364,-79.388669,Coffee Shop
...,...,...,...,...,...,...,...,...,...
25,Harbourfront East / Union Station / Toronto Is...,0.120000,1,43.640816,-79.381752,Mos Mos,43.641640,-79.377552,Coffee Shop
25,Harbourfront East / Union Station / Toronto Is...,0.120000,1,43.640816,-79.381752,Lavazza Espression,43.639537,-79.381763,Coffee Shop
25,Harbourfront East / Union Station / Toronto Is...,0.120000,1,43.640816,-79.381752,Pilot Coffee Roasters,43.645018,-79.380415,Coffee Shop
25,Harbourfront East / Union Station / Toronto Is...,0.120000,1,43.640816,-79.381752,Balzac's Coffee,43.644373,-79.383065,Coffee Shop


# Analysis

Cluster 1 has most of the coffee shops in town.  
Cluster 0 seems like a good location: it has few coffee shops.