# Compare Paris and Berlin Neighbourhoods

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

### Get list of Paris quarters from Wikipedia

In [2]:
# download the English Wikipedia "Quarters of Paris" page for scraping;
# -O saves it locally under the name paris_quartiers.html
!wget -O paris_quartiers.html https://en.wikipedia.org/wiki/Quarters_of_Paris

--2020-03-11 16:09:19--  https://en.wikipedia.org/wiki/Quarters_of_Paris
Resolving en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Connecting to en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71562 (70K) [text/html]
Saving to: ‘paris_quartiers.html’


2020-03-11 16:09:19 (1.23 MB/s) - ‘paris_quartiers.html’ saved [71562/71562]



In [3]:
!pip install beautifulsoup4



In [4]:
from bs4 import BeautifulSoup

# parse the html file
# The encoding is passed explicitly: the page contains accented names
# (e.g. "Place-Vendôme") and the platform default encoding is not always UTF-8.
with open('paris_quartiers.html', 'r', encoding='utf-8') as html_doc:
    soup = BeautifulSoup(html_doc, 'html.parser')

In [5]:
# create a dataframe from the html file
# Every <tr> of the first table body is flattened to the list of its non-empty
# text lines. A row with 5 entries starts a new arrondissement (its number is
# taken from the digits of the first word); a row with 4 entries belongs to the
# arrondissement seen most recently.
records = []
arrondissement = ""

for tr in soup.tbody.find_all('tr'):
    row = [cell for cell in tr.text.split('\n') if cell != '']  # remove all empty strings

    if len(row) == 5:
        arrondissement = ''.join(ch for ch in row[0].split()[0] if ch.isdigit())
        quartier = row[2]
    elif len(row) == 4:
        quartier = row[1]
    else:
        print("Warning!! Row not added to dataframe. row length={}".format(len(row)))
        continue  # really skip the row instead of re-adding the previous quarter

    if not arrondissement:
        # No arrondissement number seen yet: this is the table header, not a quarter.
        continue

    # Paris postcodes are 75000 + arrondissement number
    records.append((quartier, int(arrondissement) + 75000))

# Build the frame once; growing a DataFrame row by row is quadratic and
# DataFrame.append no longer exists in pandas 2.x.
df_paris = pd.DataFrame(records, columns=['quarter', 'postcode'])

df_paris.head(10)

Unnamed: 0,quarter,postcode
0,Saint-Germain-l'Auxerrois,75001
1,Les Halles,75001
2,Palais-Royal,75001
3,Place-Vendôme,75001
4,Gaillon,75002
5,Vivienne,75002
6,Mail,75002
7,Bonne-Nouvelle,75002
8,Arts-et-Métiers,75003
9,Enfants-Rouges,75003


In [6]:
df_paris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   quarter   80 non-null     object
 1   postcode  80 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


### Get geospatial coordinates for each quarter

In [None]:
!pip install geopy

In [22]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import time

# One geocoder client is enough for all requests.
geolocator = Nominatim(user_agent='parisberlin')

geo_records = []
for qrt, pc in zip(df_paris.quarter, df_paris.postcode):
    query = "{}, {}, France".format(qrt, pc)
    location = geolocator.geocode(query)
    time.sleep(1) # Nominatim's usage policy allows at most one request per second

    if location is None:
        # geocode() returns None when nothing matches. Keep a placeholder row so
        # that geodata stays aligned, row for row, with df_paris.
        print("Warning!! No geocoding result for '{}'".format(query))
        geo_records.append((float('nan'), float('nan'), None))
    else:
        geo_records.append((location.latitude, location.longitude, location.address))

# Build the frame once instead of appending inside the loop
# (DataFrame.append no longer exists in pandas 2.x).
geodata = pd.DataFrame(geo_records, columns=['latitude', 'longitude', 'address'])

geodata.tail()

Unnamed: 0,latitude,longitude,address
75,48.877421,2.37102,"Colonel Fabien, Boulevard de la Villette, Quartier du Combat, Paris 19e Arrondissement, Paris, Île-de-France, France métropolitaine, 75019, France"
76,48.871727,2.385085,"Belleville, Paris 20e Arrondissement, Paris, Île-de-France, France métropolitaine, 75020, France"
77,48.870362,2.406736,"Quartier Saint-Fargeau, Paris 20e Arrondissement, Paris, Île-de-France, France métropolitaine, 75020, France"
78,48.861217,2.393929,"Cimetière du Père-Lachaise, 8, Boulevard de Ménilmontant, Saint-Blaise, Quartier de la Roquette, Paris 11e Arrondissement, Paris, Île-de-France, France métropolitaine, 75011, France"
79,48.856219,2.404895,"Quartier de Charonne, Paris 20e Arrondissement, Paris, Île-de-France, France métropolitaine, 75020, France"


In [19]:
# copy the geocoding results into df_paris, one column at a time
for geo_column in ('latitude', 'longitude', 'address'):
    df_paris[geo_column] = geodata[geo_column]

df_paris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   quarter    80 non-null     object 
 1   postcode   80 non-null     int64  
 2   latitude   80 non-null     float64
 3   longitude  80 non-null     float64
 4   address    80 non-null     object 
dtypes: float64(2), int64(1), object(2)
memory usage: 3.2+ KB


In [20]:
df_paris

Unnamed: 0,quarter,postcode,latitude,longitude,address
0,Saint-Germain-l'Auxerrois,75001,48.860211,2.336299,"Quartier Saint-Germain-l'Auxerrois, Paris 1er Arrondissement, Paris, Île-de-France, France métropolitaine, 75001, France"
1,Les Halles,75001,48.862466,2.346009,"Les Halles, Allée Saint-John Perse, Quartier des Halles, Quartier Les Halles, Paris 1er Arrondissement, Paris, Île-de-France, France métropolitaine, 75001, France"
2,Palais-Royal,75001,48.863585,2.336204,"Palais Royal, Rue de Montpensier, Quartier des Halles, Quartier du Palais Royal, Paris 1er Arrondissement, Paris, Île-de-France, France métropolitaine, 75001, France"
3,Place-Vendôme,75001,48.867463,2.329428,"Place Vendôme, Quartier Saint-Georges, Quartier Vendôme, Paris 1er Arrondissement, Paris, Île-de-France, France métropolitaine, 75001, France"
4,Gaillon,75002,48.869135,2.332909,"Quartier Gaillon, Paris 2e Arrondissement, Paris, Île-de-France, France métropolitaine, 75002, France"
5,Vivienne,75002,48.868859,2.339363,"Quartier Vivienne, Paris 2e Arrondissement, Paris, Île-de-France, France métropolitaine, 75002, France"
6,Mail,75002,48.868054,2.344593,"Quartier du Mail, Paris 2e Arrondissement, Paris, Île-de-France, France métropolitaine, 75002, France"
7,Bonne-Nouvelle,75002,48.870623,2.34875,"Bonne Nouvelle, Boulevard de Bonne Nouvelle, Quartier des Halles, Quartier de Bonne-Nouvelle, Paris 2e Arrondissement, Paris, Île-de-France, France métropolitaine, 75002, France"
8,Arts-et-Métiers,75003,48.865441,2.356132,"Arts et Métiers, Rue de Turbigo, Beaubourg, Quartier des Arts-et-Métiers, Paris 3e Arrondissement, Paris, Île-de-France, France métropolitaine, 75003, France"
9,Enfants-Rouges,75003,48.864332,2.362611,"Quartier des Enfants-Rouges, Paris 3e Arrondissement, Paris, Île-de-France, France métropolitaine, 75003, France"


In [17]:
# save the quarters with their coordinates to a CSV file, without the index column
df_paris.to_csv('paris_quarters_with_gps_coords.csv', index = False)

### Plot Paris quarters on a map

In [23]:
!pip install folium



In [None]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library

# look up the city itself; its coordinates become the centre of the map
geolocator = Nominatim(user_agent='parisberlin')
paris = geolocator.geocode('Paris, France')
print('The coordinates of {} are {}, {}.'.format(paris, paris.latitude, paris.longitude))

map_paris = folium.Map(location=[paris.latitude, paris.longitude], zoom_start=13)

# one circle marker per quarter; the popup reads "<quarter>, <postcode>"
for row in df_paris.itertuples(index=False):
    popup_text = '{}, {}'.format(row.quarter, row.postcode)
    marker = folium.CircleMarker(
        [row.latitude, row.longitude],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_paris)

map_paris

### Get the top 100 venues for each quarter within a 500m radius

In [None]:
import os
import warnings

# Credentials are read from the environment so that they are never stored in
# the notebook. Keys that were committed in an earlier version of this cell
# must be considered leaked and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # value sent as the "limit" query parameter
RADIUS = 500 # value sent as the "radius" query parameter

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
                  'environment variables before calling the Foursquare API.')

In [None]:
import requests # library to handle requests
import json # library to handle JSON files
#from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

def getNearbyVenues(quarters, latitudes, longitudes, radius=RADIUS):
    '''
    Query the Foursquare "explore" endpoint once per (quarter, latitude, longitude)
    triple and collect the returned venues in one dataframe.

    Parameters:
        quarters, latitudes, longitudes: equally long iterables, consumed in parallel.
        radius: value of the "radius" query parameter (defaults to RADIUS).

    Returns:
        A dataframe with one row per venue and the columns quarter, quarter_latitude,
        quarter_longitude, venue, venue_latitude, venue_longitude, venue_category.
        The frame has these columns even when no venue was returned at all.

    Raises:
        requests.HTTPError if the API answers with an error status.
    '''
    columns = ['quarter',
               'quarter_latitude',
               'quarter_longitude',
               'venue',
               'venue_latitude',
               'venue_longitude',
               'venue_category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for qrt, lat, lng in zip(quarters, latitudes, longitudes):

        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on HTTP errors instead of with a
        # KeyError while indexing an error payload
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []  # a venue may have no category
            rows.append((
                qrt,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [None]:
# Create a dataframe with the venues found around each quarter's coordinates.
paris_venues = getNearbyVenues(df_paris.quarter, df_paris.latitude, df_paris.longitude)

In [None]:
print(paris_venues.shape)
#paris_venues[paris_venues['quarter']=='Belleville']
paris_venues.head()

In [None]:
# Check number of venues for each quarter
paris_venues.groupby('quarter').count()

In [None]:
print('There are {} uniques categories.'.format(len(paris_venues['venue_category'].unique())))

One-hot encode the venue categories for each postcode.

<a id='item3'></a>

In [None]:
# one hot encoding
# The venues collected above live in paris_venues, whose category column is
# 'venue_category' (there is no tor_venues / 'Venue Category' in this notebook).
tor_onehot = pd.get_dummies(paris_venues[['venue_category']], prefix="", prefix_sep="")

# add the grouping key back to the dataframe. The column keeps the name
# 'Postcode' because the following cells group on that name; it holds the
# quarter name, the only area key present in paris_venues.
tor_onehot['Postcode'] = paris_venues['quarter']

# move the key column to the first position
fixed_columns = ['Postcode'] + [col for col in tor_onehot.columns if col != 'Postcode']
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

In [None]:
tor_onehot.shape

Group rows by postcode by taking the mean of the frequency of occurrence of each venue category.

In [None]:
# one row per 'Postcode' value: the mean of each one-hot column, i.e. the share
# of that group's venues falling into each category; reset_index() turns the
# group key back into an ordinary first column
tor_grouped = tor_onehot.groupby('Postcode').mean().reset_index()
tor_grouped

In [None]:
tor_grouped.shape

Print each postcode with its top 5 most common venues.

In [None]:
num_top_venues = 5

for pc in tor_grouped['Postcode']:
    # format() instead of "+" so the header also works for non-string keys
    print("----{}----".format(pc))

    # transpose the single row of this key into a two-column (venue, freq) table
    temp = tor_grouped[tor_grouped['Postcode'] == pc].T.reset_index()
    temp.columns = ['venue','freq']

    # drop the first row (the 'Postcode' key itself); copy so that the column
    # assignment below works on an independent frame rather than on a slice
    temp = temp.iloc[1:].copy()
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    '''
    Rank the entries of `row` (ignoring its first element) from largest to
    smallest and return the index labels of the first `num_top_venues` of them.
    '''
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Create a new dataframe and display the top 10 venues for each postcode.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# '1st', '2nd', '3rd', then '4th', '5th', ...
columns = ['Postcode']
for ind in range(num_top_venues):
    # explicit bounds check instead of a bare except around the list lookup
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Postcode'] = tor_grouped['Postcode']

# fill every row with its most common venue categories
# (plain range() is used: numpy is not needed for simple counting)
for ind in range(tor_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

## 4. Cluster the Neighborhoods

In [None]:
from sklearn.cluster import KMeans

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as colors

Run *k*-means to cluster the neighborhoods into 5 clusters.

In [None]:
# set number of clusters
kclusters = 5

# feature matrix: every column except the 'Postcode' key
# (the axis must be given by keyword; pandas 2.x no longer accepts drop('Postcode', 1))
tor_grouped_clustering = tor_grouped.drop(columns='Postcode')

# run k-means clustering; random_state fixes the initialisation for reproducibility
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

Create a new dataframe that includes the cluster label as well as the top 10 venues for each neighborhood.

In [None]:
# drop 'Cluster Labels' column if it already exists from a previous run
# (errors='ignore' makes this a no-op on the first run, when the column is absent)
neighbourhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore', inplace=True)

In [None]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# add the cluster label and the most common venues to the quarter data
# (postcode, coordinates, address). df_paris has no 'Postcode' column, so the
# join uses its 'quarter' column as the key.
# NOTE(review): assumes the 'Postcode' column of neighbourhoods_venues_sorted
# holds quarter names (the venue data is collected per quarter) - confirm.
tor_merged = df_paris.join(neighbourhoods_venues_sorted.set_index('Postcode'), on='quarter')

tor_merged.head()

In [None]:
tor_merged.info()

In [None]:
# remove, in place, every row that received no cluster label (no venue data)
tor_merged.dropna(subset=['Cluster Labels'], inplace=True)
tor_merged.info()

Visualise the resulting clusters.

In [None]:
# create map, centred on the coordinates geocoded for Paris earlier in the notebook
map_clusters = folium.Map(location=[paris.latitude, paris.longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
# (tor_merged inherits the lower-case column names of df_paris)
for lat, lng, poi, cluster in zip(tor_merged['latitude'], tor_merged['longitude'], tor_merged['quarter'], tor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        # cluster labels can be floats after the join, hence int();
        # the -1 offset makes label 0 pick the last colour of the list
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

<a id='item5'></a>

## 5. Examine Clusters

#### Cluster 0

In [None]:
# rows labelled 0, showing the first column plus every column from position 5 onwards
in_cluster0 = tor_merged['Cluster Labels'] == 0
cluster0_columns = tor_merged.columns[[0] + list(range(5, tor_merged.shape[1]))]
df_cluster0 = tor_merged.loc[in_cluster0, cluster0_columns]
df_cluster0

#### Cluster 1

In [None]:
# rows labelled 1, showing the first column plus every column from position 5 onwards
in_cluster1 = tor_merged['Cluster Labels'] == 1
cluster1_columns = tor_merged.columns[[0] + list(range(5, tor_merged.shape[1]))]
df_cluster1 = tor_merged.loc[in_cluster1, cluster1_columns]
df_cluster1

#### Cluster 2

In [None]:
# rows labelled 2, showing the first column plus every column from position 5 onwards
in_cluster2 = tor_merged['Cluster Labels'] == 2
cluster2_columns = tor_merged.columns[[0] + list(range(5, tor_merged.shape[1]))]
df_cluster2 = tor_merged.loc[in_cluster2, cluster2_columns]
df_cluster2

#### Cluster 3

In [None]:
# rows labelled 3, showing the first column plus every column from position 5 onwards
in_cluster3 = tor_merged['Cluster Labels'] == 3
cluster3_columns = tor_merged.columns[[0] + list(range(5, tor_merged.shape[1]))]
df_cluster3 = tor_merged.loc[in_cluster3, cluster3_columns]
df_cluster3

#### Cluster 4

In [None]:
# rows labelled 4, showing the first column plus every column from position 5 onwards
in_cluster4 = tor_merged['Cluster Labels'] == 4
cluster4_columns = tor_merged.columns[[0] + list(range(5, tor_merged.shape[1]))]
df_cluster4 = tor_merged.loc[in_cluster4, cluster4_columns]
df_cluster4