## Classify the largest European cities according to their ability to host a multi-sport event (Football, Hockey, Tennis)

#### This notebook aims to classify the 36 largest European cities based on their suitability to host a multi-sport event. The classification is based on the number of football, hockey and tennis stadiums located within a 3 km radius of the city center. Cities are grouped into 4 categories and displayed on a map.

##### Data sources: Foursquare (venues & coordinates), Wikipedia (Europe's largest cities)

In [311]:
import json
import os
from io import StringIO

import lxml.html as lh
import numpy as np  # useful for many scientific computing in Python
import pandas as pd # primary data structure library
import requests
from bs4 import BeautifulSoup
from IPython.display import display_html
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [312]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Folium installed and imported!')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Folium installed and imported!


#### Get the largest European cities by population from Wikipedia

In [357]:
# Download the Wikipedia page listing European cities by population.
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook here instead of silently parsing an HTTP error page.
wiki_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_European_cities_by_population_within_city_limits',
    timeout=30,
)
wiki_response.raise_for_status()

# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
website_url = wiki_response.text
soup = BeautifulSoup(website_url, 'lxml')

In [379]:
# Locate the first table carrying the "wikitable sortable" class on the page.
My_table = soup.find('table', {'class': 'wikitable sortable'})
if My_table is None:
    # Fail with a clear message instead of letting read_html choke on the string 'None'.
    raise ValueError(
        'No table with class "wikitable sortable" was found; '
        'the Wikipedia page layout may have changed'
    )

# Wrap the markup in StringIO: recent pandas releases deprecate passing
# literal HTML strings to read_html, while file-like objects work everywhere.
dfs = pd.read_html(StringIO(str(My_table)))
df = dfs[0]
df.head()

Unnamed: 0.1,Unnamed: 0,City,Country,Officialpopulation,Date,2011 Eurostatpopulation[1],Image,Location,Ref.
0,1,Istanbul[a],Turkey,15067724,31 December 2018,,,41°00′49″N 28°57′18″E﻿ / ﻿41.013611°N 28.955°E,[2]
1,2,Moscow[b],Russia,12615279,1 January 2019,,,55°45′00″N 37°37′00″E﻿ / ﻿55.75°N 37.616667°E,[3]
2,3,London,United Kingdom,9126366,31 December 2018,8173941.0,,51°30′26″N 0°07′39″W﻿ / ﻿51.507222°N 0.1275°W,[4]
3,4,Saint Petersburg,Russia,5383890,1 January 2019,,,59°57′N 30°18′E﻿ / ﻿59.95°N 30.3°E,[5][6]
4,5,Berlin,Germany,3748148,31 December 2018,3460725.0,,52°31′00″N 13°23′00″E﻿ / ﻿52.516667°N 13.383333°E,[7]


#### Transform the dataframe

In [380]:
# Build the working frame from the freshly parsed table instead of dropping in
# place: the cell can then be re-run without a KeyError, and the raw table in
# dfs[0] stays untouched. Missing columns still raise, so a changed page schema
# is noticed immediately.
df = dfs[0].drop(
    columns=['Unnamed: 0', '2011 Eurostatpopulation[1]', 'Image', 'Date', 'Ref.', 'Location']
)

In [381]:
# Strip Wikipedia footnote markers such as "[a]" or "[b]" from the city names.
# regex=True is passed explicitly because pandas 2.x treats the pattern as a
# literal string by default, which would leave the markers in place; the raw
# string avoids invalid-escape warnings for "\[". The character class also
# covers any other single-letter footnote marker.
df['City'] = df['City'].str.replace(r'\[[a-z]\]', '', regex=True)

In [382]:
# Preview the first five rows of the trimmed table
df.head(n=5)

Unnamed: 0,City,Country,Officialpopulation
0,Istanbul,Turkey,15067724
1,Moscow,Russia,12615279
2,London,United Kingdom,9126366
3,Saint Petersburg,Russia,5383890
4,Berlin,Germany,3748148


#### Define and run a function that returns the coordinates of the city center and the number of stadiums

In [383]:
def get_venuecount(CITY, CATEGORYID, radius=3000):
    """Count Foursquare venues of one category near a city and return the geocoded center.

    Queries the Foursquare ``venues/explore`` endpoint with ``near=CITY``.

    Credentials are read from the ``FOURSQUARE_CLIENT_ID`` and
    ``FOURSQUARE_CLIENT_SECRET`` environment variables so that no secret is
    stored in the notebook.

    Parameters
    ----------
    CITY : str
        Place name passed as the ``near`` query parameter.
    CATEGORYID : str
        Foursquare category id used to filter the venues.
    radius : int, optional
        Search radius in meters (default 3000, the value previously hard-coded).

    Returns
    -------
    dict
        ``{"venues": int, "lat": number, "lng": number}``. All three stay 0
        when the API response contains no ``groups`` entry (for example when
        the place name cannot be resolved).

    Raises
    ------
    RuntimeError
        If either credential environment variable is missing.
    """
    try:
        client_id = os.environ['FOURSQUARE_CLIENT_ID']
        client_secret = os.environ['FOURSQUARE_CLIENT_SECRET']
    except KeyError as missing:
        raise RuntimeError(
            'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
            'environment variables before calling get_venuecount'
        ) from missing

    # Let requests build the query string so names containing spaces or
    # non-ASCII characters are escaped correctly.
    params = {
        'near': CITY,
        'client_id': client_id,
        'client_secret': client_secret,
        'v': '20180605',  # Foursquare API version
        'categoryId': CATEGORYID,
        'radius': radius,
    }
    foursquaredata = {"venues": 0, "lat": 0, "lng": 0}

    # No raise_for_status() on purpose: an unresolvable place yields a response
    # without 'groups', which is reported as zeros rather than as an exception.
    item_dict = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params=params,
        timeout=30,
    ).json()

    response = item_dict.get('response') or {}
    if 'groups' in response:
        groups = response['groups']
        if groups:  # guard against an empty group list
            foursquaredata["venues"] = len(groups[0].get('items', []))
        center = (response.get('geocode') or {}).get('center') or {}
        foursquaredata["lat"] = center.get('lat', 0)
        foursquaredata["lng"] = center.get('lng', 0)
    return foursquaredata



In [384]:
# Category ids passed to get_venuecount, named after the columns they populate.
FOOTBALL_CATEGORY_ID = '4bf58dd8d48988d189941735'
TENNIS_CATEGORY_ID = '4e39a891bd410d7aed40cbc2'
HOCKEY_CATEGORY_ID = '4bf58dd8d48988d185941735'

# Query the football category once per city and reuse that single response for
# the coordinates and the count. Previously the identical request was issued
# three times per city (5 API calls per row instead of 3).
football_results = [get_venuecount(city, FOOTBALL_CATEGORY_ID) for city in df['City']]

df['lat'] = [result['lat'] for result in football_results]
df['lng'] = [result['lng'] for result in football_results]
df['Footbal Stadiums'] = [result['venues'] for result in football_results]
df['Tennis Stadiums'] = [get_venuecount(city, TENNIS_CATEGORY_ID)['venues'] for city in df['City']]
df['Hockey Stadiums'] = [get_venuecount(city, HOCKEY_CATEGORY_ID)['venues'] for city in df['City']]


#### Please note that the results are affected by the limitations of the data source (Foursquare)

In [385]:
# Display the table now that coordinates and per-category stadium counts have been added
df

Unnamed: 0,City,Country,Officialpopulation,lat,lng,Footbal Stadiums,Tennis Stadiums,Hockey Stadiums
0,Istanbul,Turkey,15067724,41.01384,28.94966,19,23,5
1,Moscow,Russia,12615279,55.75222,37.61556,12,30,14
2,London,United Kingdom,9126366,51.50853,-0.12574,30,15,2
3,Saint Petersburg,Russia,5383890,59.93863,30.31413,25,17,21
4,Berlin,Germany,3748148,52.52437,13.41053,30,7,2
5,Madrid,Spain,3223334,40.4165,-3.70256,13,7,2
6,Kiev,Ukraine,2950819,50.45466,30.5238,20,15,4
7,Rome,Italy,2857321,41.89193,12.51133,4,6,3
8,Paris,France,2140526,48.85341,2.3488,20,12,1
9,Bucharest,Romania,2106144,44.43225,26.10626,6,9,1


#### Cluster the results based on the numbers of football, hockey and tennis stadiums (as the numbers are in the same range, they will not be normalized).

In [391]:
from sklearn.cluster import KMeans

In [388]:
# Select the clustering features explicitly. Dropping "everything else" would
# silently pull the 'Category' column into the feature matrix if this cell is
# re-run after the cluster labels have been added to df.
dfclustering = df[['Footbal Stadiums', 'Tennis Stadiums', 'Hockey Stadiums']].copy()

In [401]:
# set number of clusters
kclusters = 4

# run k-means clustering
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, and leaving it implicit can change the labels on another install.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(dfclustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 3, 1, 3, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 0, 3, 0,
       0, 0, 2, 3, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2])

#### Now add the computed categories to the original dataframe and show categorized cities on the map

In [406]:
import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors

In [402]:
# Attach the k-means label of each row as the 'Category' column
df = df.assign(Category=kmeans.labels_)

In [403]:
# Preview the first five rows, now including the cluster label
df.head(n=5)

Unnamed: 0,City,Country,Officialpopulation,lat,lng,Footbal Stadiums,Tennis Stadiums,Hockey Stadiums,Category
0,Istanbul,Turkey,15067724,41.01384,28.94966,19,23,5,3
1,Moscow,Russia,12615279,55.75222,37.61556,12,30,14,3
2,London,United Kingdom,9126366,51.50853,-0.12574,30,15,2,1
3,Saint Petersburg,Russia,5383890,59.93863,30.31413,25,17,21,3
4,Berlin,Germany,3748148,52.52437,13.41053,30,7,2,1


In [410]:
# create map
map_clusters = folium.Map(location=[46.7712, 23.6236], zoom_start=4)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df['lat'], df['lng'], df['City'], df['Category']):
    # Cluster labels start at 0, so `cluster - 1` is -1 for cluster 0. The
    # modulo spells out the wrap-around to the last palette entry that the
    # negative list index used to perform implicitly; the colors are unchanged.
    marker_color = rainbow[(cluster - 1) % kclusters]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### The map shows the largest European cities colored according to their (virtual) capability to host a multi-sport event. The most suitable cities are marked yellow; the least suitable ones are light blue.