# The Most Suitable Place to Open Up a Cafe in Istanbul

### Imports

In [209]:
import folium
import warnings
import requests
import numpy as np
import pandas as pd
from folium import plugins
import matplotlib.cm as cm
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize

### Getting Istanbul's Boroughs Information

In [210]:
# Web scraping atlasbig.com
table = pd.read_html('https://www.atlasbig.com/tr/istanbulun-mahalleleri', encoding = 'utf-8')
ist = table[0]
ist.rename(columns={'Mahalle': 'Neighborhood', 'İlçe': 'Borough', 'Nüfus': 'Population', 'Yüzölçümü (km2)': 'Area (km2)'}, inplace = True)

ist.head()

Unnamed: 0,Neighborhood,Borough,Population,Area (km2)
0,Atakent,Küçükçekmece,93.229,852
1,Adnan Kahveci,Beylikdüzü,86.584,459
2,Zafer,Bahçelievler,85.464,1085
3,Zümrütevler,Maltepe,82.651,3244
4,Halkalı Merkez,Küçükçekmece,78.18,4409


### Adding Geographic Information of Boroughs

In [211]:
# Copying and shaping the last created DataFrame
nwdf = ist.copy()
nwdf["Latitude"] = np.nan
nwdf["Longitude"] = np.nan
nwdf.drop(['Neighborhood'], axis=1, inplace = True)
nwdf = nwdf.drop_duplicates(subset='Borough', keep='first')

# Adding latitude and longitude information of each borough
for index, row in nwdf.iterrows():
    address = row['Borough']
    geolocator = Nominatim(user_agent="explorer")
    location = geolocator.geocode(address)
    if location != None:
        latitude = location.latitude
        longitude = location.longitude
        nwdf.loc[nwdf.Borough == row['Borough'], ['Latitude', 'Longitude']] = latitude, longitude

# Dropping NaN values and resetting index
nwdf.dropna(subset=['Latitude', 'Longitude'], inplace = True)
nwdf.reset_index(drop = True, inplace = True)

nwdf.head()

Unnamed: 0,Borough,Population,Area (km2),Latitude,Longitude
0,Küçükçekmece,93.229,852,41.000214,28.780889
1,Beylikdüzü,86.584,459,41.001026,28.641984
2,Bahçelievler,85.464,1085,38.881312,35.627761
3,Maltepe,82.651,3244,40.923542,29.132836
4,Başakşehir,74.815,13835,41.097693,28.806163


### Using Foursquare API to Get the Most Popular Venue Category

In [214]:
# Foursquare settings
CLIENT_ID = 'LFEN4Q2NPD1VGHAZC5FUPDROGL3UUKWYAWOG5OAFKLM1CVJ0'
CLIENT_SECRET = 'ZYRSZE05NUDXFCHNQTV12MYCNYJBXQ2QFOCLHMNQG4QRTXW1'
VERSION = '20180605'
LIMIT = 100

# Copying and shaping the last created DataFrame
final_table = nwdf.copy()
final_table["Most Popular"] = np.nan

# Extracting the category of venues
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
         
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    

# Creating a Foursquare URL and getting the most popular venue from it
for i in range(34):
    borough_latitude = final_table.loc[i, 'Latitude']
    borough_longitude = final_table.loc[i, 'Longitude'] 
    borough_name = final_table.loc[i, 'Borough']

    # Creating the URL with our variables
    LIMIT = 100
    radius = 500
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        borough_latitude, 
        borough_longitude, 
        radius, 
        LIMIT)
    
    # Getting the JSON that the URL returned, and getting venues from it
    results = requests.get(url).json()   
    venues = results['response']['groups'][0]['items']    
    nearby_venues = json_normalize(venues) 
    
    # Getting the most popular venue from the results above, if it exists
    try:
        desired_column = ['venue.categories']
        nearby_venues = nearby_venues.loc[:, desired_column]

        # Filtering the category for each row
        nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

        # Cleaning columns
        nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
        
        # Getting the mode value, and adding it to the DataFrame
        modd = nearby_venues.mode()
        popi = modd.at[0,'categories']
        final_table['Most Popular'][i] = popi
    
    # In case of the mode does not exist
    except:
        pass

# Shaping the DataFrame
final_table.dropna(subset=['Most Popular'], inplace = True)
final_table.reset_index(drop = True, inplace = True)

final_table.head()

Unnamed: 0,Borough,Population,Area (km2),Latitude,Longitude,Most Popular
0,Küçükçekmece,93.229,852,41.000214,28.780889,Turkish Restaurant
1,Beylikdüzü,86.584,459,41.001026,28.641984,Café
2,Maltepe,82.651,3244,40.923542,29.132836,Café
3,Başakşehir,74.815,13835,41.097693,28.806163,Café
4,Gaziosmanpaşa,73.225,138,41.057526,28.91565,Café


### Encoding DataFrame for Clustering

In [219]:
# One hot encoding
ist_onehot = pd.get_dummies(final_table[['Most Popular']], prefix="", prefix_sep="")

# Adding Borough column back to DataFrame
ist_onehot['Borough'] = final_table['Borough'] 

# Setting Borough as the first column
fixed = [ist_onehot.columns[-1]] + list(ist_onehot.columns[:-1])
ist_onehot = ist_onehot[fixed]

# Grouping the DataFrame by Borough values
ist_grouped = ist_onehot.groupby('Borough').mean().reset_index()

ist_grouped.head()

Unnamed: 0,Borough,Bakery,Bar,Bus Stop,Café,Cocktail Bar,Convenience Store,Gym,Hotel,Turkish Restaurant
0,Arnavutköy,0,0,0,1,0,0,0,0,0
1,Avcılar,0,0,0,1,0,0,0,0,0
2,Bakırköy,0,0,0,0,0,0,1,0,0
3,Bayrampaşa,0,0,0,1,0,0,0,0,0
4,Bağcılar,0,0,0,1,0,0,0,0,0


### Clustering

In [222]:
# Dropping Borough value since it is not numeric
ist_clustering = ist_grouped.drop('Borough', 1)

# Running K-Means clustering and fitting the new DataFrame
k = 5
kmeans = KMeans(n_clusters = k, random_state = 0).fit(ist_clustering)

#Adding the clustering labels to the DataFrame
final_table.insert(0, 'Cluster Labels', kmeans.labels_)
final_table.head()

Unnamed: 0,Cluster Labels,Borough,Population,Area (km2),Latitude,Longitude,Most Popular
0,1,Küçükçekmece,93.229,852,41.000214,28.780889,Turkish Restaurant
1,1,Beylikdüzü,86.584,459,41.001026,28.641984,Café
2,3,Maltepe,82.651,3244,40.923542,29.132836,Café
3,1,Başakşehir,74.815,13835,41.097693,28.806163,Café
4,1,Gaziosmanpaşa,73.225,138,41.057526,28.91565,Café


### Mapping

In [224]:
# Getting Istanbul's coordinates
address = 'Istanbul'
geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# Mapping with the newly acquired coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# Copying the main DataFrame in order to keep it safe
clustered_table = final_table.copy()

# Setting color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Adding markers to the map
for lat, lon, poi, cluster in zip(clustered_table['Latitude'], clustered_table['Longitude'], clustered_table['Borough'], clustered_table['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True, encoding = 'utf-8')
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters