# Segmenting and Clustering Neighborhoods in Toronto

### Importing necessary packages

In [1]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

## 1.Setting up the dataframe

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html returns one DataFrame per table it parses; keep the first one
toronto = pd.read_html(url)[0]

### Saving only the rows where the *Borough* column has an assigned value and checking for duplicate values in the *Postal Code* column

In [3]:
# Keep only the postal codes whose borough is not the 'Not assigned' placeholder
has_borough = toronto['Borough'].ne('Not assigned')
toronto = toronto.loc[has_borough]
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Checking if all *Postal Code* values are unique

In [4]:
# keep=False flags every row of a duplicated group, so an empty result means all postal codes are unique
toronto[toronto['Postal Code'].duplicated(keep=False)]

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [5]:
# reset_index returns a new frame; assign it back so the 0..n-1 index is actually kept
toronto = toronto.reset_index(drop=True)
toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
toronto.shape

(103, 3)

## 2. Getting Geodata

### Reading the provided CSV for geodata, as the Geocoder Python package did not work

In [7]:
# Postal code -> latitude/longitude lookup; the path is relative, so the CSV must sit in the working directory
geo_data = pd.read_csv('Geospatial_Coordinates.csv')
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Adding lat,long data to our Toronto dataframe

In [8]:
# Join on the postal code explicitly instead of relying on "all shared columns".
# Dropping coordinate columns left by a previous run keeps the cell idempotent,
# and validate= fails loudly if either side has a repeated postal code.
toronto = pd.merge(
    toronto.drop(columns=['Latitude', 'Longitude'], errors='ignore'),
    geo_data,
    on='Postal Code',
    how='inner',
    validate='one_to_one',
)
toronto.shape

(103, 5)

In [9]:
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 3. Analyzing Toronto data

The same analysis as in the previous lab is performed here: postal codes are clustered by the relative frequency of each venue category found in them.

### Create first map of Toronto

I first retrieve the coordinates for Toronto and create a map centered around these. Then circles are added to mark the location of the neighborhoods.

In [10]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
lat = location.latitude
long = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address,lat, long))

The geograpical coordinate of Toronto, Ontario are 43.6534817, -79.3839347.


In [11]:
m = folium.Map(location=[lat, long])

# Loop-local names on purpose: reusing `lat`/`long` as loop variables would
# overwrite the Toronto centre coordinates that the cluster map reads later.
for hood_lat, hood_long, label in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighbourhood']):
    folium.Circle(
        location=[hood_lat, hood_long],
        popup=folium.Popup(label, parse_html=True),
        radius=500,
        fill=True,
        parse_html=True).add_to(m)
m

### Create a function that builds the URL and performs the GET request against the Foursquare API

The function is called `venue_recs` and takes the parameters needed to build the URL, such as latitude and longitude, as well as endpoint, query and limit. To protect my Foursquare user info, I have removed my Foursquare credentials from the variables `CLIENT_ID` and `CLIENT_SECRET`.

In [12]:
# Credentials come from the environment so they never have to be typed into
# (or stripped out of) the notebook; each falls back to '' when unset.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret

In [13]:
def venue_recs(ll,endpoint='explore',client_id = CLIENT_ID,query=None,client_secret = CLIENT_SECRET,v='20180605',limit=10,radius=500,timeout=10):
    """Call a Foursquare v2 ``venues`` endpoint and return the decoded JSON.

    Parameters
    ----------
    ll : str
        Value sent as the ``ll`` query parameter (callers in this notebook
        pass ``"<latitude>,<longitude>"``).
    endpoint : str
        Path segment appended to ``/v2/venues/``.
    client_id, client_secret : str
        API credentials. The defaults are captured when this ``def`` is
        executed, so re-run the cell after changing CLIENT_ID / CLIENT_SECRET.
    query, v, limit, radius
        Sent unchanged as query parameters of the same name.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    dict
        The parsed JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    url = f'https://api.foursquare.com/v2/venues/{endpoint}'
    params = dict(
        client_id=client_id,
        client_secret=client_secret,
        v=v,
        ll=ll,
        query=query,
        limit=limit,
        radius = radius)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url=url, params=params, timeout=timeout)
    # Surface HTTP errors here rather than as a KeyError further downstream.
    resp.raise_for_status()
    return resp.json()

### Create a function that retrieves venue data 

The function returns an array of rows that can be made into a dataframe 

In [14]:
def get_venues(zipcode,neighborhood,lat,long,**kwargs):
    """Collect venue rows for every postal code via ``venue_recs``.

    Parameters
    ----------
    zipcode, neighborhood, lat, long : iterables
        Parallel sequences; they are iterated together with ``zip``.
    **kwargs
        Forwarded to ``venue_recs`` (e.g. ``limit``, ``radius``). ``query``
        defaults to ``None`` unless supplied here.

    Returns
    -------
    list of list
        One ``[zipcode, neighborhood, venue name, venue category, venue lat,
        venue lng]`` row per venue. A postal code with no venues contributes
        a single row whose four venue fields are ``None``, so every row has
        the same length.
    """
    # Caller overrides win; `query` keeps its previous default of None.
    request_kwargs = {'query': None, **kwargs}

    rows = []
    for code, name, area_lat, area_long in zip(zipcode, neighborhood, lat, long):
        print (code,name,area_lat,area_long)
        ll = str(area_lat)+','+str(area_long)
        items = venue_recs(ll=ll, **request_kwargs)['response']['groups'][0]['items']

        if not items:
            # Keep the postal code in the result, padded to the full row width.
            rows.append([code, name, None, None, None, None])
            continue

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            # A venue can come back without any category; do not crash on it.
            cat = categories[0]['name'] if categories else None
            rows.append([
                code,
                name,
                venue['name'],
                cat,
                venue['location']['lat'],
                venue['location']['lng'],
            ])

    return rows

In [15]:
# One get_venues pass over every postal code; it prints each postal code as it is processed
venues = get_venues(toronto['Postal Code'],toronto['Neighbourhood'],toronto['Latitude'],toronto['Longitude'])

M3A Parkwoods 43.7532586 -79.3296565
M4A Victoria Village 43.725882299999995 -79.31557159999998
M5A Regent Park, Harbourfront 43.6542599 -79.3606359
M6A Lawrence Manor, Lawrence Heights 43.718517999999996 -79.46476329999999
M7A Queen's Park, Ontario Provincial Government 43.6623015 -79.3894938
M9A Islington Avenue, Humber Valley Village 43.6678556 -79.53224240000002
M1B Malvern, Rouge 43.806686299999996 -79.19435340000001
M3B Don Mills 43.745905799999996 -79.352188
M4B Parkview Hill, Woodbine Gardens 43.7063972 -79.309937
M5B Garden District, Ryerson 43.6571618 -79.37893709999999
M6B Glencairn 43.709577 -79.44507259999999
M9B West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale 43.6509432 -79.55472440000001
M1C Rouge Hill, Port Union, Highland Creek 43.7845351 -79.16049709999999
M3C Don Mills 43.72589970000001 -79.340923
M4C Woodbine Heights 43.695343900000005 -79.3183887
M5C St. James Town 43.6514939 -79.3754179
M6C Humewood-Cedarvale 43.6937813 -79.42819140000002
M9

In [16]:
# Name the columns at construction time so an empty result list still yields a
# correctly labelled (empty) frame instead of a length-mismatch error.
venues_df = pd.DataFrame(
    venues,
    columns=['Postal Code','Neighborhood','Venue Name','Venue Category','Latitude','Longitude'])
venues_df.head()

Unnamed: 0,Postal Code,Neighborhood,Venue Name,Venue Category,Latitude,Longitude
0,M3A,Parkwoods,Brookbanks Park,Park,43.751976,-79.33214
1,M3A,Parkwoods,Variety Store,Food & Drink Shop,43.751974,-79.333114
2,M4A,Victoria Village,Victoria Village Arena,Hockey Arena,43.723481,-79.315635
3,M4A,Victoria Village,Tim Hortons,Coffee Shop,43.725517,-79.313103
4,M4A,Victoria Village,Portugril,Portuguese Restaurant,43.725819,-79.312785


In [17]:
venues_df.shape

(695, 6)

In [18]:
# dropna=False keeps a missing category in the count, matching len(unique())
n_categories = venues_df['Venue Category'].nunique(dropna=False)
print ('{} unique categories exist in venues_df'.format(n_categories))

181 unique categories exist in venues_df


### Create a dataframe for venue categories

Note that we must use postal code and not neighborhoods because the neighborhood names occur multiple times in different postal codes. The postal codes, obviously, are unique. 

In [19]:
# One indicator column per venue category, one row per venue
dummies = pd.get_dummies(venues_df['Venue Category'])
cols = ['Postal Code', 'Neighborhood', *dummies.columns]
# Attach the two identifying columns (aligned on the row index) and move them to the front
cat_df = dummies.assign(**{
    'Postal Code': venues_df.iloc[:, 0],
    'Neighborhood': venues_df.iloc[:, 1],
})[cols]
cat_df.head()

Unnamed: 0,Postal Code,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,American Restaurant,Antique Shop,Arts & Crafts Store,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4A,Victoria Village,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,Victoria Village,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,Victoria Village,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Create a new dataframe showing the frequency of categories per postal code

In [20]:
# Mean of the indicator columns per postal code = relative frequency of each category.
# The text 'Neighborhood' column is dropped first: it cannot be averaged, and
# recent pandas versions raise instead of silently skipping it.
grouped_df = (
    cat_df
    .drop(columns='Neighborhood')
    .groupby(by='Postal Code')
    .mean()
    .reset_index()
)
grouped_df

Unnamed: 0,Postal Code,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,American Restaurant,Antique Shop,Arts & Crafts Store,Asian Restaurant,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M9N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,M9P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,M9R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,M9V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get most common venues 
Borrowed from the lab. Added a check that the postal code has at least `num_top_venues` venue categories with a non-zero frequency; postal codes with fewer get no ranking (the function returns `None`).

In [21]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` most frequent categories.

    Parameters
    ----------
    row : pandas.Series
        First element is an identifier (skipped); the remaining elements are
        per-category frequencies indexed by category name.
    num_top_venues : int
        Number of category labels to return.

    Returns
    -------
    numpy.ndarray or None
        Category labels ordered by descending frequency, or ``None`` when
        fewer than ``num_top_venues`` categories have a frequency above zero.
    """
    row_categories = row.iloc[1:]

    # Count the categories that actually occur. This is the same test as
    # "number of zeros <= len - num_top_venues", but it does not depend on how
    # an integer key is resolved against a value_counts() index.
    n_present = int((row_categories > 0).sum())
    if n_present < num_top_venues:
        return None

    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]
    

In [22]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# Build the headers '1st Most Common Venue', '2nd ...', ..., using 'th' once the
# explicit suffixes run out (a plain condition instead of a bare except).
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

common_df = pd.DataFrame(columns=columns)
common_df['Postal Code'] = grouped_df['Postal Code']

# return_most_common_venues gives None for postal codes with too few
# categories; those rows stay empty and are removed by dropna below.
for ind in np.arange(grouped_df.shape[0]):
    common_df.iloc[ind, 1:] = return_most_common_venues(grouped_df.iloc[ind, :], num_top_venues)

common_df = common_df.dropna(axis=0)
common_df

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,M1E,Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Intersection
4,M1H,Thai Restaurant,Bakery,Hakka Restaurant,Gas Station,Athletics & Sports
7,M1L,Bakery,Bus Line,Metro Station,Bus Station,Intersection
11,M1R,Accessories Store,Auto Garage,Sandwich Place,Bakery,Middle Eastern Restaurant
12,M1S,Breakfast Spot,Lounge,Skating Rink,Clothing Store,Latin American Restaurant
...,...,...,...,...,...,...
89,M8W,Pizza Place,Sandwich Place,Coffee Shop,Pub,Pharmacy
92,M8Z,Grocery Store,Bakery,Discount Store,Sandwich Place,Burger Joint
95,M9C,Pharmacy,Café,Pizza Place,Convenience Store,Coffee Shop
99,M9P,Pizza Place,Coffee Shop,Sandwich Place,Chinese Restaurant,Intersection


In [23]:
# set number of clusters
kclusters = 3

# keep only the postal codes that received a top-venue ranking in common_df
has_top_venues = grouped_df['Postal Code'].isin(common_df['Postal Code'])
grouped_over5_df = grouped_df[has_top_venues]
# columns= keyword: the positional axis argument is rejected by recent pandas
cluster_df = grouped_over5_df.drop(columns='Postal Code')

# run k-means clustering
# n_init is pinned (10 was scikit-learn's long-standing default) so the labels
# do not shift when the library default changes
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=3).fit(cluster_df)

# check cluster labels generated for each row in the dataframe
k_labels = kmeans.labels_
k_labels

array([1, 1, 1, 1, 1, 2, 2, 1, 1, 0, 2, 1, 0, 1, 1, 2, 2, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 2, 1, 0, 2, 2],
      dtype=int32)

In [24]:
# insert() raises if the column already exists, so drop a label column left by
# a previous run of this cell first.
if 'Cluster Labels' in common_df.columns:
    common_df = common_df.drop(columns='Cluster Labels')
# k_labels is matched to rows by position, so the lengths must agree.
assert len(k_labels) == len(common_df), 'cluster labels and common_df are out of sync'
common_df.insert(0,'Cluster Labels',k_labels)
merged = pd.merge(common_df,toronto,on='Postal Code',how='inner')
merged

Unnamed: 0,Cluster Labels,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Borough,Neighbourhood,Latitude,Longitude
0,1,M1E,Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Intersection,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
1,1,M1H,Thai Restaurant,Bakery,Hakka Restaurant,Gas Station,Athletics & Sports,Scarborough,Cedarbrae,43.773136,-79.239476
2,1,M1L,Bakery,Bus Line,Metro Station,Bus Station,Intersection,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
3,1,M1R,Accessories Store,Auto Garage,Sandwich Place,Bakery,Middle Eastern Restaurant,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
4,1,M1S,Breakfast Spot,Lounge,Skating Rink,Clothing Store,Latin American Restaurant,Scarborough,Agincourt,43.794200,-79.262029
...,...,...,...,...,...,...,...,...,...,...,...
60,2,M8W,Pizza Place,Sandwich Place,Coffee Shop,Pub,Pharmacy,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
61,1,M8Z,Grocery Store,Bakery,Discount Store,Sandwich Place,Burger Joint,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999
62,0,M9C,Pharmacy,Café,Pizza Place,Convenience Store,Coffee Shop,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",43.643515,-79.577201
63,2,M9P,Pizza Place,Coffee Shop,Sandwich Place,Chinese Restaurant,Intersection,Etobicoke,Westmount,43.696319,-79.532242


In [25]:
# Centre on the geocoded Toronto coordinates directly rather than via
# `lat`/`long`, which are easy to clobber when reused as loop variables
# (this cell's own marker loop used to overwrite `lat`).
map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for venue_lat, venue_lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Postal Code'], merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so index the palette with the label itself;
    # `cluster-1` made cluster 0 wrap around to the last colour.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [venue_lat, venue_lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [26]:
# Postal codes assigned to cluster 0
merged[merged['Cluster Labels'].eq(0)]

Unnamed: 0,Cluster Labels,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Borough,Neighbourhood,Latitude,Longitude
9,0,M2N,Café,Grocery Store,Ice Cream Shop,Coffee Shop,Movie Theater,North York,"Willowdale, Willowdale East",43.77012,-79.408493
12,0,M3H,Bank,Coffee Shop,Restaurant,Pizza Place,Deli / Bodega,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
22,0,M4M,Coffee Shop,Pet Store,Fish Market,Bookstore,Italian Restaurant,East Toronto,Studio District,43.659526,-79.340923
25,0,M4S,Dessert Shop,Café,Seafood Restaurant,Italian Restaurant,Sushi Restaurant,Central Toronto,Davisville,43.704324,-79.38879
26,0,M4V,Coffee Shop,American Restaurant,Liquor Store,Supermarket,Sushi Restaurant,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
27,0,M4X,Café,General Entertainment,Jewelry Store,Japanese Restaurant,Italian Restaurant,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
30,0,M5B,Café,Pizza Place,Theater,Music Venue,Plaza,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
31,0,M5C,Coffee Shop,Middle Eastern Restaurant,Gym,Cosmetics Shop,Creperie,Downtown Toronto,St. James Town,43.651494,-79.375418
33,0,M5G,Coffee Shop,Modern European Restaurant,Middle Eastern Restaurant,Park,Bubble Tea Shop,Downtown Toronto,Central Bay Street,43.657952,-79.387383
34,0,M5H,Coffee Shop,Hotel,Plaza,Café,Restaurant,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568


In [27]:
# Postal codes assigned to cluster 1
merged[merged['Cluster Labels'].eq(1)]

Unnamed: 0,Cluster Labels,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Borough,Neighbourhood,Latitude,Longitude
0,1,M1E,Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Intersection,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
1,1,M1H,Thai Restaurant,Bakery,Hakka Restaurant,Gas Station,Athletics & Sports,Scarborough,Cedarbrae,43.773136,-79.239476
2,1,M1L,Bakery,Bus Line,Metro Station,Bus Station,Intersection,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
3,1,M1R,Accessories Store,Auto Garage,Sandwich Place,Bakery,Middle Eastern Restaurant,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
4,1,M1S,Breakfast Spot,Lounge,Skating Rink,Clothing Store,Latin American Restaurant,Scarborough,Agincourt,43.7942,-79.262029
7,1,M2H,Golf Course,Pool,Mediterranean Restaurant,Dog Run,Athletics & Sports,North York,Hillcrest Village,43.803762,-79.363452
8,1,M2J,Movie Theater,Chocolate Shop,Bakery,Pharmacy,Shopping Mall,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
11,1,M3C,Discount Store,Gym,Sporting Goods Shop,Coffee Shop,Clothing Store,North York,Don Mills,43.7259,-79.340923
13,1,M3J,Caribbean Restaurant,Massage Studio,Coffee Shop,Metro Station,Bar,North York,"Northwood Park, York University",43.76798,-79.487262
14,1,M3L,Grocery Store,Hotel,Park,Shopping Mall,Bank,North York,Downsview,43.739015,-79.506944


In [28]:
# Postal codes assigned to cluster 2
merged[merged['Cluster Labels'].eq(2)]

Unnamed: 0,Cluster Labels,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Borough,Neighbourhood,Latitude,Longitude
5,2,M1T,Pizza Place,Fried Chicken Joint,Fast Food Restaurant,Italian Restaurant,Intersection,Scarborough,"Clarks Corners, Tam O'Shanter, Sullivan",43.781638,-79.304302
6,2,M1W,Fast Food Restaurant,Pharmacy,Pizza Place,Grocery Store,Breakfast Spot,Scarborough,"Steeles West, L'Amoreaux West",43.799525,-79.318389
10,2,M2R,Pizza Place,Coffee Shop,Butcher,Pharmacy,Grocery Store,North York,"Willowdale, Willowdale West",43.782736,-79.442259
15,2,M4A,Pizza Place,Intersection,Coffee Shop,Portuguese Restaurant,Hockey Arena,North York,Victoria Village,43.725882,-79.315572
16,2,M4B,Pizza Place,Gym / Fitness Center,Bank,Bus Line,Pet Store,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
60,2,M8W,Pizza Place,Sandwich Place,Coffee Shop,Pub,Pharmacy,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
63,2,M9P,Pizza Place,Coffee Shop,Sandwich Place,Chinese Restaurant,Intersection,Etobicoke,Westmount,43.696319,-79.532242
64,2,M9V,Pizza Place,Grocery Store,Pharmacy,Beer Store,Fast Food Restaurant,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
