# Clustering the neighbourhoods of Paris by venues and demographics

Created on Tue Jan 15 18:27:26 2019

@author: helderxpeixoto


In [8]:
#import packages for clustering and mapping
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift pandas' display limits so printed DataFrames show every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# NOTE(review): this binds the top-level matplotlib package to `plt`, not
# matplotlib.pyplot -- confirm which one was intended before using `plt`.
import matplotlib as plt

# import k-means from clustering stage
from sklearn.cluster import KMeans


#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


# Confirmation message once every import above has succeeded.
print('Libraries imported.')



Libraries imported.


In [9]:
# Resolve the city name to coordinates; they are used to centre the folium maps.
address = 'Paris, FR'

# Identify this application to the Nominatim service. geopy warns when no
# user agent is given and newer geopy releases refuse to build the geocoder.
geolocator = Nominatim(user_agent='paris_arrondissement_clustering')
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the address cannot be resolved; fail with a
    # clear message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Paris are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Paris are 48.8566101, 2.3514992.


In [10]:
# Load the arrondissement table and give its three columns explicit names.
paris_data = pd.read_csv("Workbook2.csv")
paris_data.columns = ['Arrondissement', 'Latitude', 'Longitude']
print(paris_data)

# Cast the arrondissement numbers to strings for use as popup labels.
labels = paris_data['Arrondissement'].astype(str)

# create map of Paris centred on the geocoded latitude and longitude values
map_paris = folium.Map(location=[latitude, longitude], zoom_start=12)

# Shared appearance of every marker.
# NOTE(review): parse_html is forwarded to CircleMarker as an extra keyword,
# exactly as before -- confirm against the folium docs whether it has any effect.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#4182bc',
    fill_opacity=0.7,
    parse_html=False,
)

# add one circle marker per arrondissement to the map
for row_lat, row_lng, row_label in zip(paris_data['Latitude'], paris_data['Longitude'], labels):
    marker = folium.CircleMarker(
        [row_lat, row_lng],
        popup=folium.Popup(row_label, parse_html=False),
        **marker_style)
    marker.add_to(map_paris)

# Write the map out as a standalone HTML page.
map_paris.save('map_paris.html')

    Arrondissement   Latitude  Longitude
0                2  48.868279   2.342803
1                3  48.862872   2.360001
2               12  48.834974   2.421325
3                1  48.862563   2.336443
4                4  48.854341   2.357630
5                8  48.872721   2.312554
6               14  48.829245   2.326542
7               19  48.887076   2.384821
8               20  48.863461   2.401188
9                6  48.849130   2.332898
10               9  48.877164   2.337458
11              17  48.887327   2.306777
12              15  48.840085   2.292826
13               5  48.844443   2.350715
14               7  48.856174   2.312188
15              10  48.876130   2.360728
16              11  48.859059   2.380058
17              13  48.828388   2.362272
18              16  48.860392   2.261971
19              18  48.892569   2.348161


In [11]:
#load Foursquare credentials from the environment
# Secrets must not live in the notebook source or its saved output. Export
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): any key pair that was previously committed in this cell should
# be revoked and regenerated.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
# Report only whether each value is present; never echo the values themselves.
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'MISSING (export FOURSQUARE_CLIENT_ID)'))
print('CLIENT_SECRET:' + ('set' if CLIENT_SECRET else 'MISSING (export FOURSQUARE_CLIENT_SECRET)'))

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [16]:
#collect the categories of the venues Foursquare returns around each arrondissement
def getNearbyVenues(names, latitudes, longitudes, radius=1500, limit=10000, query='', timeout=30):
    """Query the Foursquare venue search endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each search centre; iterated in parallel.
    radius : value forwarded as the API's ``radius`` parameter.
    limit : value forwarded as the API's ``limit`` parameter.
    query : value forwarded as the API's ``query`` parameter.
    timeout : seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``'Arrondissement'`` (the
        ``names`` entry the search was centred on) and ``'Category'`` (the
        venue's raw ``categories`` value from the response). The frame has
        these two columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    endpoint = 'https://api.foursquare.com/v2/venues/search'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests assemble and URL-encode the query string instead of
        # formatting it by hand (a query containing '&' or spaces would
        # otherwise corrupt the URL).
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'query': query,
            'limit': limit,
        }

        # make the GET request; fail loudly on HTTP errors rather than with a
        # KeyError while digging into an error payload
        response = requests.get(endpoint, params=params, timeout=timeout)
        response.raise_for_status()
        venues = response.json()['response']['venues']

        # keep only the relevant information for each nearby venue
        rows.extend((name, venue['categories']) for venue in venues)

    # Passing the column names here keeps the schema stable for empty results.
    return pd.DataFrame(rows, columns=['Arrondissement', 'Category'])
    

In [17]:
# Fetch the venue categories around every arrondissement listed in paris_data
# and collect them in a new dataframe.
paris_venues = getNearbyVenues(
    names=paris_data['Arrondissement'],
    latitudes=paris_data['Latitude'],
    longitudes=paris_data['Longitude'],
)

In [18]:
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Expected to hold a list of category dicts under the key ``'Category'``.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the key is absent,
    the value is not a list/tuple (e.g. ``None`` or NaN), or the list is empty.
    """
    try:
        categories_list = row['Category']
    except (KeyError, IndexError, TypeError):
        # No category payload on this row at all.
        return None

    # Guard against missing values before calling len(): the previous fallback
    # value of 0 made len() itself raise a TypeError.
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None

    return categories_list[0]['name']

In [19]:
# Keep only the raw category payload. The explicit copy makes sure the column
# added below goes into an independent frame, not a selection of paris_venues.
filtered_columns = ['Category']
dataframe_filtered = paris_venues.loc[:, filtered_columns].copy()

# Reduce each venue's category list to the name of its first category.
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# One indicator column per category name.
paris_onehot = pd.get_dummies(dataframe_filtered['categories'], prefix="", prefix_sep="")

# Preview the filtered frame; as the last expression of the cell it is
# actually rendered (mid-cell it was silently discarded).
dataframe_filtered.head()

In [20]:
# Venue rows per arrondissement (not rendered: only the last expression of a
# cell is displayed).
paris_venues.groupby('Arrondissement').count()

# Report how many distinct category names were found.
category_names = dataframe_filtered['categories']
unique_count = len(category_names.unique())
print('There are {} uniques categories.'.format(unique_count))

# One-hot encode the category names: one indicator column per category.
paris_onehot = pd.get_dummies(category_names, prefix="", prefix_sep="")

# Attach the arrondissement of every venue ...
paris_onehot['Arrondissement'] = paris_venues['Arrondissement']

# ... and rotate the trailing column to the front of the frame.
all_columns = paris_onehot.columns.tolist()
fixed_columns = all_columns[-1:] + all_columns[:-1]
paris_onehot = paris_onehot[fixed_columns]

paris_onehot.shape

# Total occurrences of each category per arrondissement (a sum, not a mean).
per_arrondissement = paris_onehot.groupby('Arrondissement').sum()
paris_grouped = per_arrondissement.reset_index()

num_top_venues = 5
# Store the arrondissement identifiers as strings.
paris_grouped['Arrondissement'] = paris_grouped['Arrondissement'].astype(str)

There are 331 uniques categories.


In [22]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped, the remaining values are ranked
    in descending order, and the index labels of the leading entries are
    returned as an array. Fewer labels are returned when the row holds fewer
    than ``num_top_venues`` entries after the first one.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [23]:
#create new dataframe and show the top venues for each arrondissement
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: '1st', '2nd', '3rd', then '<n>th'.
# An explicit bounds check replaces the former bare `except`, which would have
# hidden any unrelated error raised inside the loop.
columns = ['Arrondissement']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# rank the categories of every arrondissement row
top_venues = [
    return_most_common_venues(paris_grouped.iloc[ind, :], num_top_venues)
    for ind in range(paris_grouped.shape[0])
]

# create the new dataframe in one step instead of filling it row by row
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues,
    columns=columns[1:],
    index=paris_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Arrondissement', paris_grouped['Arrondissement'])

neighborhoods_venues_sorted

Unnamed: 0,Arrondissement,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,1,French Restaurant,Office,Café,Hotel,Plaza
1,2,Office,Tech Startup,Building,Bar,French Restaurant
2,3,Art Gallery,Office,Coworking Space,Men's Store,Café
3,4,Art Gallery,Bakery,Garden,Italian Restaurant,Wine Bar
4,5,Bar,French Restaurant,Residential Building (Apartment / Condo),Italian Restaurant,Creperie
5,6,French Restaurant,Residential Building (Apartment / Condo),Salon / Barbershop,Police Station,Accessories Store
6,7,Government Building,French Restaurant,Café,Art Gallery,Office
7,8,Office,Art Gallery,Tech Startup,French Restaurant,Restaurant
8,9,Office,Tech Startup,Japanese Restaurant,Asian Restaurant,Italian Restaurant
9,10,Café,Hotel,Rental Car Location,Miscellaneous Shop,Cosmetics Shop


In [24]:
#group rows by arrondissement and sum the category occurrences
paris_grouped = paris_onehot.groupby('Arrondissement').sum().reset_index()
# order the rows by arrondissement
paris_grouped = paris_grouped.sort_values(by=['Arrondissement'])

# Feature matrix without the identifier column. `columns=` is used because
# passing the axis positionally (`drop(label, 1)`) is rejected by pandas 2.x.
paris_merged_clustering = paris_grouped.drop(columns='Arrondissement')

# rendered because it is the last expression of the cell
paris_grouped.shape

In [25]:
# NOTE(review): plain alias of paris_data; a later cell rebinds paris_merged
# before it is used.
paris_merged = paris_data
#clustering arrondissements into `kclusters` distinct clusters
# set number of clusters
kclusters = 3

# Drop the identifier so only the category counts are clustered. `columns=` is
# used because passing the axis positionally is rejected by pandas 2.x.
paris_grouped_clustering = paris_grouped.drop(columns='Arrondissement')

# run k-means clustering
# n_init is pinned to 10 (the historical scikit-learn default) so the result
# does not change with the library's newer 'auto' default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(paris_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 2, 1, 1, 1, 1, 2, 0, 1, 1, 1],
      dtype=int32)

In [26]:
# Work on a copy so that paris_data itself is not modified by this cell.
paris_merged = paris_data.copy()
neighborhoods_venues_sorted['Arrondissement'] = neighborhoods_venues_sorted['Arrondissement'].astype(str)
paris_merged['Arrondissement'] = paris_merged['Arrondissement'].astype(str)

# add clustering labels
# kmeans.labels_ follows the row order of paris_grouped (the frame k-means was
# fitted on, sorted by arrondissement), whereas paris_data keeps the order of
# the CSV file. A positional assignment would attach the labels to the wrong
# arrondissements, so the labels are aligned on the arrondissement key instead.
cluster_by_arrondissement = pd.Series(
    kmeans.labels_,
    index=paris_grouped['Arrondissement'].astype(str).values)
cluster_labels = paris_merged['Arrondissement'].map(cluster_by_arrondissement)
if cluster_labels.isna().any():
    # An arrondissement without any venue never reached the clustering step.
    missing = paris_merged.loc[cluster_labels.isna(), 'Arrondissement'].tolist()
    raise ValueError('No cluster label for arrondissement(s): {}'.format(missing))
paris_merged['Cluster Labels'] = cluster_labels.astype(int)

# merge the top-venue table into paris_merged, which carries latitude/longitude for each arrondissement
paris_merged = paris_merged.join(neighborhoods_venues_sorted.set_index('Arrondissement'), on='Arrondissement')

paris_merged = paris_merged.sort_values(by=['Cluster Labels'])



In [29]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced 'jet' colour per cluster
colors_array = cm.jet(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(paris_merged['Latitude'], paris_merged['Longitude'], paris_merged['Arrondissement'], paris_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The `cluster - 1` lookup is kept so the rendered colours stay unchanged:
    # cluster 0 wraps around to the last palette entry, so every cluster still
    # gets its own colour. int() makes sure the palette is indexed with an integer.
    marker_color = rainbow[int(cluster) - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

