In [1]:
from IPython.display import display
from tqdm import tqdm

# Small Czech Cities Classification

Let’s say, that someone has several job offers in different towns. Those offers are almost the same, but each is in different city so he will have to move. He loves his city, so he wants to move to some similar place. The task is to find out what smaller cities in Czech Republic are similar?


## Creating dataset
------
Before we can start with machine learning algorithms, we have to create and prepare our dataset. This is what I will do in thist part. So after importing necessary libraries this code will scarpe [wikipedia page](https://cs.wikipedia.org/wiki/Seznam_m%C4%9Bst_v_%C4%8Cesku_podle_po%C4%8Dtu_obyvatel) containing needed data. To do so I have used pandas function `read_html`, which works like magic.

In [2]:
import pandas as pd 
import numpy as np

tqdm.pandas()
import requests

from geopy.geocoders import Nominatim
import re
geolocator = Nominatim(user_agent="foursquare_agent")

import folium

In [27]:
link = 'https://cs.wikipedia.org/wiki/Seznam_m%C4%9Bst_v_%C4%8Cesku_podle_po%C4%8Dtu_obyvatel'

r=requests.get(link)

tables = pd.read_html(r.text)
raw_df = tables[0]
raw_df.rename(columns=dict(zip(raw_df.columns, ['Number', 'City', 'Pic', 'Population', 'Area', 'Distinct'])), inplace = True)
raw_df.pop('Number')
raw_df.pop('Pic')
raw_df.pop('Distinct')
raw_df.head(18)

Unnamed: 0,City,Population,Area
0,Praha,1 324 277,49621
1,Brno,381 346,23018
2,Ostrava,287 968,21423
3,Plzeň,174 842,13767
4,Liberec,104 802,10609
5,Olomouc,100 663,10333
6,České Budějovice,94 463,556
7,Hradec Králové,92 939,10569
8,Ústí nad Labem,92 716,9397
9,Pardubice,91 727,8266


Since I want only to cluster smalle cities, I will drop first two largest ones. I also dont want to work with too small cities, so I will only use cities with more than 50k population. For those cities I have attached theyr location using `Nominatim` from `geopy.geocoders` library.

In [40]:
df = raw_df.iloc[3:28].reset_index(drop=True)

def get_loc(city):
    loc = geolocator.geocode(city)
    return (loc.longitude, loc.latitude)

lng_lat = df['City'].progress_apply(get_loc)
df['lng'] = lng_lat.apply(lambda x:x[0])
df['lat'] = lng_lat.apply(lambda x:x[1]) 

100%|██████████| 25/25 [00:12<00:00,  2.05it/s]


In [41]:
df.head()

Unnamed: 0,City,Population,Area,lng,lat
0,Plzeň,174 842,13767,13.377525,49.747741
1,Liberec,104 802,10609,15.058395,50.770265
2,Olomouc,100 663,10333,17.251143,49.594057
3,České Budějovice,94 463,556,14.474285,48.974736
4,Hradec Králové,92 939,10569,15.832751,50.209211


To see what data we have right now I have used `folium` to show cities on map.

In [42]:
longitude = (df['lng'].max()+df['lng'].min())/2
lattitude = (df['lat'].max()+df['lat'].min())/2

In [43]:
f = folium.Figure(width=600, height=400)
my_map = folium.Map(location=[lattitude, longitude], zoom_start=6).add_to(f)

hoods = folium.map.FeatureGroup()
for index, row in df.iterrows():
    label = '{}'.format(row['City'])
    label = folium.Popup(label)
    hoods.add_child(
        folium.CircleMarker(
        [row['lat'],row['lng']],
        radius=5,
        popup=label,
        parse_html=False)
    )

my_map.add_child(hoods)
display(f)

To get infomation about venues in all cities I have used FourSquare API. 

In [44]:
CLIENT_ID = 'ZGBSSJDBVOA5V1KT5BKZJSLJMN2RIRJDS0BRABFHTD0XXVKJ'
CLIENT_SECRET = 'PLPRZCGGX3BFDT13Y4FPS2RGSIA2L3PTIQHGMD5MMMM3ZHE4'
ACCESS_TOKEN = '' # your FourSquare Access Token
VERSION = '20190101'
LIMIT = 300

In [45]:
filtered_columns = ['categories']

venues = pd.DataFrame()

radius = 5000

for index, row in df.iterrows():
    # url = 'https://api.foursquare.com/v2/venues/search?near={}&client_id={}&client_secret={}&v={}&limit={}'.format(row['City'], CLIENT_ID, CLIENT_SECRET, VERSION,LIMIT)
    # url = 'https://api.foursquare.com/v2/venues/search?&ll={},{}&client_id={}&client_secret={}&v={},&limit={}'.format(row['City'], CLIENT_ID, CLIENT_SECRET, VERSION,LIMIT)
    url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{},&llAcc=1&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, row['lat'], row['lng'], VERSION, radius, LIMIT)
    results = requests.get(url).json()
    # print(len(results['response']['venues']))
    venues_city = pd.json_normalize(results['response']['venues']).loc[:, filtered_columns]
    venues = venues.append(pd.concat([pd.DataFrame([row['City']]*len(venues_city), columns=['City']), venues_city], axis = 1))

In [46]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

venues['categories'] = venues.progress_apply(get_category_type, axis=1)
display(venues.head())
venues.dropna(inplace= True, subset= ['categories'])
venues.head()

venues.drop_duplicates(inplace=True)
len(venues)

100%|██████████| 2996/2996 [00:00<00:00, 106996.82it/s]


Unnamed: 0,City,categories
0,Plzeň,Tourist Information Center
1,Plzeň,Plaza
2,Plzeň,Monument / Landmark
3,Plzeň,Fountain
4,Plzeň,Bookstore


1877

Now in `venues` variable there is category for each venue in a city. Next part of code will create pivot table with count of venues in each cathegory for each city.

In [47]:
categories = venues['categories'].drop_duplicates().values
venues_pivot = venues[['City', 'categories']].pivot_table(index='City', columns= 'categories', aggfunc=np.count_nonzero).fillna(0)/2
venues_pivot = pd.DataFrame(venues_pivot.to_records())
venues_pivot.head()

Unnamed: 0,City,ATM,Accessories Store,Adult Education Center,Advertising Agency,Alternative Healer,American Restaurant,Animal Shelter,Antique Shop,Arcade,...,Water Park,Waterfront,Wedding Hall,Well,Whisky Bar,Wine Bar,Wine Shop,Winery,Women's Store,Yoga Studio
0,Chomutov,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Děčín,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Frýdek-Místek,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,Havířov,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Hradec Králové,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0


## Clustering

In this part I will cluster our cities by number of venues of certain category in each city. To do so, I have used kmeans implemented in sklearn library.

In [48]:
from sklearn.cluster import KMeans 

In [57]:
kclusters = 6

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues_pivot[venues_pivot.columns[1:]])

In [58]:
try:
    df.insert(0, 'Cluster Labels', kmeans.labels_)
except ValueError as e:
    df['Cluster Labels'] = kmeans.labels_

Now I will just show results on map

In [59]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [60]:
f2 = folium.Figure(width=600, height=400)
my_map2 = folium.Map(location=[lattitude, longitude], zoom_start=6).add_to(f2)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

hoods = folium.map.FeatureGroup()
for index, row in df.iterrows():
    cluster = row['Cluster Labels']
    label = '{}'.format(row['City'])
    label = folium.Popup(label)
    hoods.add_child(
        folium.CircleMarker(
        [row['lat'],row['lng']],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7,
        parse_html=False)
    )

my_map2.add_child(hoods)
display(f2)