In [None]:
### Part 1 - Read the data from the Toronto Open Data dataset


In [None]:
# Standard library
import os

import pandas as pd
import numpy as np

# Import Geocoder to get each area's coordinates
#!conda install -c conda-forge geocoder             #Uncomment this if geocoder is not installed on your system
import geocoder # import geocoder

# Import k-means for clustering stage
from sklearn.cluster import KMeans
# MinMaxScaler for normalization : to find the best K for K-Means
from sklearn.preprocessing import MinMaxScaler

# Import Folium to draw maps
#!conda install -c conda-forge folium                #Uncomment this if folium is not installed on your system
import folium

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# Import Requests for querying FoursquareAPI
import requests

In [None]:
# URL of the CSV file holding the demographic profile of each Toronto area
CONST_demographicDataLink = (
    "https://www.toronto.ca/ext/open_data/catalog/data_set_files/"
    "2016_neighbourhood_profiles.csv"
)

In [None]:
# Foursquare API credentials.
# They are read from the environment so that no secret is stored in the notebook:
# set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# (The previous version had the values pasted in without quotes, which is a syntax error.)
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')          # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret

# Foursquare API version
VERSION = '20180604'

# Size of the radius to retrieve venues from FoursquareAPI, and limit of venues per neighbourhood
CONST_venuesRadiusScan = 1000
CONST_venuesLimit = 1000

In [None]:

# Load the raw demographic profiles; the file is read as cp1252 and ',' is treated
# as the thousands separator of numeric values
df_raw = pd.read_csv(
    CONST_demographicDataLink,
    encoding="cp1252",
    thousands=',',
)

In [None]:

# Select the single "Neighbourhood Number" row: it maps every city area to its number (CDN)
is_number_row = df_raw.Characteristic == "Neighbourhood Number"

# The first five columns are not city areas, so they are dropped before reshaping
leading_columns = df_raw.columns[[0, 1, 2, 3, 4]]

# One row per city area, with the columns reversed so that the number comes first
df_cityAreas = (
    df_raw[is_number_row]
    .drop(columns=leading_columns)
    .T
    .reset_index()
    .iloc[:, ::-1]
)
df_cityAreas.columns = ['CDN', 'City_Area']
df_cityAreas.head()

In [None]:
# The data encodes its tree structure with leading spaces: this helper measures that indentation
def countLeadingSpaces(str):
    """Return the number of leading whitespace characters of `str`."""
    without_indent = str.lstrip()
    return len(str) - len(without_indent)

In [None]:
# Remove the thousands separators left in the text columns
df_demographic = df_raw.apply(lambda x: x.str.replace(',',''))

# Filter on Topic = 'Ethnic origin population' to retrieve data about ethnic origins only.
# .copy() makes the filtered frame independent of df_raw before columns are added to it.
df_demographic = df_demographic[df_demographic.Topic == 'Ethnic origin population'].copy()

# The depth of an ethnic origin in the tree is the number of leading spaces of its label.
# It is computed column-wise: assigning to the `row` yielded by iterrows() is not
# guaranteed to write back into the dataframe.
df_demographic['Depth'] = df_demographic['Characteristic'].map(countLeadingSpaces)

# Mark the rows whose depth is smaller than the previous row's depth,
# so we will be able to only keep the deepest levels
df_demographic['newLevel'] = np.where(
    df_demographic['Depth'].shift(+1) > df_demographic['Depth'], "new", "")

# Reverse the dataframe
df_demographic = df_demographic.iloc[::-1]

In [None]:

# Keep flag: 1 when the row must be kept (deepest level), 0 when it must be dropped.
# assign() returns a new frame, so the reversed frame from the previous step is not modified.
df_demographic = df_demographic.assign(Keep=0)

# Depth of the run of rows currently being kept; "" means "no run in progress"
tmpDepth = ""

# Loop through each row (the dataframe is reversed at this point)
for index, row in df_demographic.iterrows():
    if tmpDepth == "":
        # First row of a run: it is kept, and its depth becomes the reference for the run
        df_demographic.loc[index, 'Keep'] = 1
        tmpDepth = row['Depth']

    elif tmpDepth == row['Depth']:
        # Same depth as the reference: still in the same run, so we keep the row
        df_demographic.loc[index, 'Keep'] = 1

    elif row['newLevel'] == "new":
        # New level: the run is over (this row is not kept), the next row starts a new run
        tmpDepth = ""

# Reverse the dataframe back to its original order
df_demographic = df_demographic.iloc[::-1]

# Keep only the necessary rows
df_demographic = df_demographic[df_demographic.Keep == 1]

# Drop the helper columns (`columns=` is used because pandas 2.0 no longer accepts
# the axis as a positional argument)
df_demographic = df_demographic.drop(columns=['Depth', 'newLevel', 'Keep'])
df_demographic.head()

In [None]:
# Only the 'Characteristic' column and the per-area columns are kept; 'Characteristic'
# gets an empty name
unused_columns = df_demographic.columns[[0, 1, 2, 4]]
df_demographic = (
    df_demographic
    .drop(columns=unused_columns)
    .rename(columns={'Characteristic': ''})
)

# Transpose so that each city area becomes a row; the first transposed row then
# provides the column labels and is removed from the data
df_demographic = df_demographic.T
df_demographic.columns = df_demographic.iloc[0]
df_demographic = df_demographic.iloc[1:]

# Drop the first column (the Total column, which is not used)
total_column = df_demographic.columns[[0]]
df_demographic = df_demographic.drop(columns=total_column)

# Turn the index into a regular 'City_Area' column
df_demographic = df_demographic.reset_index().rename(columns={'index': 'City_Area'})
df_demographic.head()

In [None]:
### Part 2 - Add the latitude and the longitude coordinates of each city area


In [None]:

# This function returns the latitude and longitude of the given city area, in Toronto
def getCoordsByCityArea(area, max_attempts=10):
    """Geocode a Toronto city area with the ArcGIS provider of geocoder.

    Parameters
    ----------
    area : str
        Name of the city area; ', Toronto, Ontario' is appended to build the query.
    max_attempts : int, optional
        Maximum number of geocoding requests before giving up (default 10).

    Returns
    -------
    tuple
        (latitude, longitude) of the area.

    Raises
    ------
    RuntimeError
        If no coordinates were returned after `max_attempts` requests.
        (The lookup used to retry forever, which hangs the notebook when the
        service never answers.)
    """
    query = '{}, Toronto, Ontario'.format(area)

    # Retry until geocoder responds with the coordinates, up to max_attempts times
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    raise RuntimeError(
        'No coordinates returned for "{}" after {} attempts'.format(query, max_attempts))

In [None]:
# Accumulates one [area name, latitude, longitude] entry per city area
areasWithCoordsList = []

# Loop through each city area name from our dataframe
# NOTE: `area`, `latitude` and `longitude` stay defined at notebook scope after this loop,
# holding the values of the last area processed
for area in df_cityAreas['City_Area']:
    # Fill the temp list with the coordinates from geocoder
    latitude, longitude = getCoordsByCityArea(area)
    areasWithCoordsList.append([area, latitude, longitude])
    
# Transform the temp list into a dataframe
df_coords = pd.DataFrame(areasWithCoordsList)
df_coords.columns = ['City_Area', 'Latitude', 'Longitude']

# Merge the coordinates dataframe with the city areas dataframe (key : City_Area)
# NOTE(review): df_cityAreas is overwritten here, so re-running this cell merges the
# coordinates a second time — re-run the cell that builds df_cityAreas first
df_cityAreas = pd.merge(df_cityAreas, df_coords, on='City_Area')
df_cityAreas.head()

In [None]:
### Part 3 - Neighbourhoods clustering using ethnic origins


In [None]:
# NOTE(review): `address` is not used by the lookup below, which queries 'Toronto, Ontario';
# it is kept in case other cells reference it
address = 'Toronto, ON'

# Maximum number of geocoding requests before giving up (the lookup used to retry forever)
max_attempts = 10

# initialize to None
lat_lng_coords = None

# retry until we get the coordinates, up to max_attempts times
for _ in range(max_attempts):
    g = geocoder.arcgis('Toronto, Ontario')
    lat_lng_coords = g.latlng
    if lat_lng_coords is not None:
        break
else:
    raise RuntimeError(
        'No coordinates returned for Toronto after {} attempts'.format(max_attempts))

latitude_toronto = lat_lng_coords[0]
longitude_toronto = lat_lng_coords[1]

print('The geographical coordinate of Toronto are {}, {}.'.format(latitude_toronto, longitude_toronto))

In [None]:

# Base map centred on Toronto
map_toronto = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# One blue circle per city area, with a "<CDN> - <area name>" popup
area_markers = zip(
    df_cityAreas['Latitude'],
    df_cityAreas['Longitude'],
    df_cityAreas['CDN'],
    df_cityAreas['City_Area'],
)
for area_lat, area_lng, area_cdn, area_name in area_markers:
    popup = folium.Popup('{} - {}'.format(area_cdn, area_name), parse_html=True)
    marker = folium.CircleMarker(
        [area_lat, area_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto

In [None]:

# Number of ethnic origins printed for each city area
num_top = 10

# Print, for every city area, its most represented ethnic origins
for _, area_row in df_demographic.iterrows():
    tempArea = area_row['City_Area']

    print("----"+tempArea+"----")

    # Rows of the current area (key : City_Area), transposed into (Origin, Count) pairs
    is_current_area = df_demographic.City_Area == tempArea
    temp = df_demographic[is_current_area].T.reset_index()
    temp.columns = ['Origin','Count']

    # The first transposed row is the City_Area key itself, not an origin: skip it
    temp = temp.iloc[1:]
    temp['Count'] = temp['Count'].astype(float)

    # Round the counts to two digits
    temp = temp.round({'Count': 2})

    ranked = temp.sort_values('Count', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top))
    print('\n')

In [None]:

def return_most_common_ethn(row, num_top):
    """Return the labels of the `num_top` largest values of `row`.

    The first element of `row` is the key and is ignored; the remaining values are
    converted to float and ranked in descending order.
    """
    counts = row.iloc[1:].astype(float)
    ranked_labels = counts.sort_values(ascending=False).index.values
    return ranked_labels[0:num_top]

In [None]:

# Number of most common origins kept for each city area
num_top = 10

# Ordinal suffixes of the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to the number of top origins
columns = ['City_Area']

for ind in np.arange(num_top):
    # Explicit bounds check instead of a bare `except:`, which would also hide unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Origin'.format(ind+1, suffix))

# create a new dataframe, and set it with the columns names
areas_ethn_sorted = pd.DataFrame(columns=columns)

# add the keys from the demographic dataframe (City_Area)
areas_ethn_sorted['City_Area'] = df_demographic['City_Area']

# loop through each row and fill in its most common origins
for ind in np.arange(df_demographic.shape[0]):
    areas_ethn_sorted.iloc[ind, 1:] = return_most_common_ethn(df_demographic.iloc[ind, :], num_top)

areas_ethn_sorted.head()

In [None]:
# Features used for clustering: every column except the City_Area key.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a positional argument.
toronto_areas_clustering = df_demographic.drop(columns='City_Area')

# Cast the values to float
data = toronto_areas_clustering.astype(float)

data.head()

In [None]:

# Fit a min-max scaler on the data and rescale it in one step
mms = MinMaxScaler()
data_transformed = mms.fit_transform(data)

pd.DataFrame(data_transformed).head()

In [None]:

# Candidate numbers of clusters
Nc = range(1, 20)

# One model per candidate; random_state is fixed so the curve is reproducible between runs
kmeans_models = [KMeans(n_clusters=i, random_state=0) for i in Nc]

# Score of each model on the data it was fitted on
score = [model.fit(data).score(data) for model in kmeans_models]

plt.plot(Nc, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

In [None]:
# set number of clusters
kclusters = 5

# Features used for clustering: every column except the City_Area key.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a positional argument.
toronto_areas_clustering = df_demographic.drop(columns='City_Area')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(data)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

In [None]:

# add clustering labels as the first column.
# A column left by a previous run of this cell is removed first: insert() raises
# when the column already exists, which made the cell fail on re-run.
if 'Cluster Labels' in areas_ethn_sorted.columns:
    areas_ethn_sorted = areas_ethn_sorted.drop(columns='Cluster Labels')
areas_ethn_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:

# Add the cluster label and the most common origins to each city area
# (df_cityAreas carries the CDN and the latitude/longitude), matching on City_Area
origins_by_area = areas_ethn_sorted.set_index(['City_Area'])
toronto_merged = df_cityAreas.join(origins_by_area, on=['City_Area'])

toronto_merged.head() # check the last columns!

In [None]:

# create map centred on Toronto.
# The centre used to be `latitude`/`longitude`, which are leftovers of the geocoding loop
# (the coordinates of the last city area processed), not the coordinates of Toronto.
map_clusters = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# set color scheme for the clusters: one rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, cityArea, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['City_Area'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(cityArea) + ' - Cluster ' + str(cluster), parse_html=True)
    cluster = int(cluster)
    # Colours are indexed with cluster-1, so cluster 0 wraps around to the last colour
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
### Cluster 0 groups areas highly populated by European and Canadian people.
We can see English, Italian, Portuguese, French people ...
These areas are represented by red circles on the map. Most of them are located across the south of Toronto and in the downtown.

In [None]:
# City areas assigned to cluster 0 (first 50 rows)
in_cluster_0 = toronto_merged['Cluster Labels'] == 0
toronto_merged.loc[in_cluster_0].head(50)

In [None]:
### Cluster 1 groups areas highly populated by Chinese people, and by people from other countries in Asia.
These areas are represented by violet circles on the map. Most of them are located in the north of Toronto.

In [None]:

# City areas assigned to cluster 1 (first 50 rows)
in_cluster_1 = toronto_merged['Cluster Labels'] == 1
toronto_merged.loc[in_cluster_1].head(50)

In [None]:
## Cluster 2 groups areas highly populated by Indian people.
These areas are represented by dark green circles on the map. They are located at the edges of Toronto.

In [None]:

# City areas assigned to cluster 2 (first 10 rows)
in_cluster_2 = toronto_merged['Cluster Labels'] == 2
toronto_merged.loc[in_cluster_2].head(10)

In [None]:
## Cluster 3 also groups areas highly populated by Asian people; the most common ethnic origin is Chinese.
These areas are represented by light green circles on the map. Most of them are located in the north east of Toronto, next to cluster 1.

In [None]:
# City areas assigned to cluster 3 (first 10 rows)
in_cluster_3 = toronto_merged['Cluster Labels'] == 3
toronto_merged.loc[in_cluster_3].head(10)

In [None]:
## Cluster 4 groups areas highly populated by English, Irish, Scottish and Canadian people.
We can also see that there are a lot of people from other European countries as well, such as French, German, Polish, ...
These areas are represented by yellow circles on the map. Most of them are located in the south and in the downtown of Toronto.

In [None]:
# City areas assigned to cluster 4 (first 50 rows)
in_cluster_4 = toronto_merged['Cluster Labels'] == 4
toronto_merged.loc[in_cluster_4].head(50)

In [None]:
## Part 4 - Use cases


In [None]:

def getNearbyVenues(cdn, latitudes, longitudes):
    """Retrieve the venues around each city area with the Foursquare "explore" endpoint.

    Parameters
    ----------
    cdn, latitudes, longitudes : iterables
        Parallel iterables giving, for each city area, its number (CDN) and its coordinates.

    Returns
    -------
    pandas.DataFrame
        One row per venue found, with the columns 'CDN', 'Area Latitude', 'Area Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The dataframe is empty (but has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status (for instance invalid credentials).
    """
    explore_url = 'https://api.foursquare.com/v2/venues/explore'
    column_names = [
        'CDN',
        'Area Latitude',
        'Area Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category'
    ]

    venue_rows = []
    # Loop through each neighbourhood given in parameters
    # (the loop variable has its own name so it does not shadow the `cdn` parameter)
    for area_cdn, lat, lng in zip(cdn, latitudes, longitudes):

        # Query parameters are passed separately so that requests builds and escapes the URL
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': CONST_venuesRadiusScan,
            'limit': CONST_venuesLimit,
        }

        # make the GET request; the timeout prevents a stalled connection from hanging the notebook
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only relevant information for each nearby venue : name, latitude, longitude,
        # and the name of its first category (None when the venue has no category)
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                area_cdn,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works when no venue was found
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [None]:
## 1/ Best spots for opening a new Chinese restaurant

As we just saw, the city areas with the highest concentration of Chinese people are the ones which have been categorised into clusters 1 and 3.
These clusters represent the areas in the north east of Toronto.
Because there are a lot of possible neighbourhoods, we are going to use FoursquareAPI to see the number of Chinese restaurants that already exist in each city area. We will assume that the areas with the fewest existing Chinese restaurants within these clusters are the best areas to open a new Chinese restaurant.

In [None]:
# City areas that belong to cluster 1 or cluster 3
in_chinese_clusters = toronto_merged['Cluster Labels'].isin([1, 3])
toronto_chinese = toronto_merged[in_chinese_clusters]

toronto_chinese.head()

In [None]:
# Retrieve the venues around each selected city area
chinese_restaurants_venues = getNearbyVenues(
    cdn=toronto_chinese['CDN'],
    latitudes=toronto_chinese['Latitude'],
    longitudes=toronto_chinese['Longitude'],
)

# Preview of the venues whose category is "Chinese Restaurant"
is_chinese_restaurant = chinese_restaurants_venues['Venue Category'] == "Chinese Restaurant"
chinese_restaurants_venues[is_chinese_restaurant].head()

In [None]:

# Count the Chinese restaurants found around each city area.
# The venues are filtered on their category first: without this filter the count is the
# total number of venues of the area, whatever their category.
is_chinese_restaurant = chinese_restaurants_venues['Venue Category'] == "Chinese Restaurant"
chinese_restaurants_count = (
    chinese_restaurants_venues[is_chinese_restaurant]
    .groupby('CDN')
    .size()
    .reset_index(name='Count')
)

# Merge the counts with the toronto_chinese dataframe.
# A 'Count' column left by a previous run of this cell is dropped first, so the cell can be re-run.
toronto_chinese = (
    toronto_chinese
    .drop(columns='Count', errors='ignore')
    .join(chinese_restaurants_count.set_index(['CDN']), on=['CDN'])
)

# Areas without any Chinese restaurant have no match in the counts: they get 0 instead of
# NaN, so that they are sorted first (NaN values are sorted last)
toronto_chinese['Count'] = toronto_chinese['Count'].fillna(0).astype(int)
toronto_chinese = toronto_chinese.sort_values(by='Count')

toronto_chinese.head(15)

In [None]:
# The five first city areas of the sorted dataframe
toronto_chinese_top = toronto_chinese.head(5)

# Base map centred on Toronto
spots_chinese_restaurants = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# One blue circle per selected city area, with a "<CDN> - <area name>" popup
top_areas = zip(
    toronto_chinese_top['Latitude'],
    toronto_chinese_top['Longitude'],
    toronto_chinese_top['CDN'],
    toronto_chinese_top['City_Area'],
)
for area_lat, area_lng, area_cdn, area_name in top_areas:
    popup = folium.Popup('{} - {}'.format(area_cdn, area_name), parse_html=True)
    marker = folium.CircleMarker(
        [area_lat, area_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(spots_chinese_restaurants)

spots_chinese_restaurants

In [None]:
## 2/ Best spots for opening a new Irish pub

The city areas with the highest concentration of Irish, Scottish and English people are the ones which have been categorised into cluster 4.
This cluster represents the areas in the south and the downtown of Toronto.
Just like for the Chinese restaurants, we are going to use FoursquareAPI and assume that the areas with the fewest existing Irish pubs are the best areas to open a new Irish pub.

In [None]:

# City areas that belong to cluster 4
in_cluster_4 = toronto_merged['Cluster Labels'] == 4
toronto_irish = toronto_merged.loc[in_cluster_4]

toronto_irish.head()

In [None]:

# Get the venues for each neighbourhood
irish_pubs_venues = getNearbyVenues(  
                                    cdn=toronto_irish['CDN'],
                                    latitudes=toronto_irish['Latitude'],
                                    longitudes=toronto_irish['Longitude']
                                  )

irish_pubs_venues[(irish_pubs_venues['Venue Category'] == "Pub")].head()

In [None]:

# Count the pubs found around each city area.
# The venues are filtered on their category first: without this filter the count is the
# total number of venues of the area, whatever their category.
# NOTE(review): "Pub" is the category used by the preview of the venues; confirm whether
# other categories (for instance a dedicated Irish pub category) should be counted too.
is_pub = irish_pubs_venues['Venue Category'] == "Pub"
irish_pubs_count = (
    irish_pubs_venues[is_pub]
    .groupby('CDN')
    .size()
    .reset_index(name='Count')
)

# Merge the counts with the toronto_irish dataframe.
# A 'Count' column left by a previous run of this cell is dropped first, so the cell can be re-run.
toronto_irish = (
    toronto_irish
    .drop(columns='Count', errors='ignore')
    .join(irish_pubs_count.set_index(['CDN']), on=['CDN'])
)

# Areas without any pub have no match in the counts: they get 0 instead of NaN,
# so that they are sorted first (NaN values are sorted last)
toronto_irish['Count'] = toronto_irish['Count'].fillna(0).astype(int)
toronto_irish = toronto_irish.sort_values(by='Count')

toronto_irish.head(15)

In [None]:
# The five first city areas of the sorted dataframe
# (the variable was misspelled `oronto_pubs_top`, so the loop below raised a NameError)
toronto_pubs_top = toronto_irish.head(5)

# create map of Toronto using latitude and longitude values
spots_irish_pubs = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# add one marker per selected city area, with a "<CDN> - <area name>" popup
for lat, lng, CDN, City_Area in zip(toronto_pubs_top['Latitude'], toronto_pubs_top['Longitude'], toronto_pubs_top['CDN'], toronto_pubs_top['City_Area']):
    label = '{} - {}'.format(CDN, City_Area)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(spots_irish_pubs)

spots_irish_pubs