# Career Management by Hilton using locations cluster


First we import the libraries, that will be necessary to run the code:

In [1]:
import pandas as pd
!pip install wikitables
from wikitables import import_tables
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe



### Step 1: get the data from Wikitable and process them to get a dataframe

In [2]:
list_hotels = import_tables('List of properties of Hilton Worldwide')
print(list_hotels[0].name)
                

List of properties of Hilton Worldwide[0]


In [3]:
data_hotels= json.loads(list_hotels[0].json())

In [4]:
df= pd.DataFrame(data_hotels)
df

Unnamed: 0,City,Country,Name,Notes
0,Tirana,Albania,Hilton Garden Inn Tirana,
1,Algiers,Algeria,Hilton Algiers,Coming soon
2,Buenos Aires,Argentina,Hilton Buenos Aires,
3,Pilar,Argentina,Hilton Pilar,
4,Yerevan,Armenia,DoubleTree by Hilton Hotels Yerevan City Centre,
5,Palm Beach,Aruba,Hilton Aruba Caribbean Resort & Casino,
6,Adelaide,Australia,Hilton Adelaide,
7,Brisbane,Australia,Hilton Brisbane,
8,Cairns,Australia,Hilton Cairns,
9,Darwin,Australia,Hilton Darwin,


In [5]:
df= df.drop(['Notes'], axis=1)

### Step 2: get the list of European countries
As we want to limit the study to european countries, we first get a list of the corresponding countries.    <br>
The list is from Wikipedia as well. The datas a processed the same way as before.

In [6]:
list_countries = import_tables('List of European countries by area')
print(list_countries[0].name)

List of European countries by area[0]


In [7]:
data_countries= json.loads(list_countries[0].json())
df_countries= pd.DataFrame(data_countries)
df_countries

Unnamed: 0,Notes,Rank,State,Total area (km2)
0,"17,098,242 &nbsp; km 2 including Northern Asia",1,Russia *,3972400
1,,2,Ukraine,603628
2,"643,801 &nbsp; km 2 when the overseas departme...",3,France *,551695
3,"505,990 &nbsp; km 2 when the Canary Islands , ...",4,Spain *,498511
4,,5,Sweden,450295
5,This includes Svalbard and Jan Mayen,6,Norway,385178
6,,7,Germany,357386
7,,8,Finland,338145
8,,9,Poland,312685
9,,10,Italy,301338


In [8]:
df_countries = df_countries.drop(['Notes', 'Rank', 'Total area (km2)'], axis=1)
df_countries

Unnamed: 0,State
0,Russia *
1,Ukraine
2,France *
3,Spain *
4,Sweden
5,Norway
6,Germany
7,Finland
8,Poland
9,Italy


Let's clean up the names in order to have correspondance to the former data frame.

In [9]:
for i in range(len(df_countries)):
    if df_countries['State'][i][-1] == '*':
        df_countries['State'][i] = df_countries['State'][i][:-2]

In [10]:
df_countries

Unnamed: 0,State
0,Russia
1,Ukraine
2,France
3,Spain
4,Sweden
5,Norway
6,Germany
7,Finland
8,Poland
9,Italy


### Step 3: Filter the locations and make them unique

In [11]:
df_europe = pd.DataFrame(columns=df.columns)
df_europe

Unnamed: 0,City,Country,Name


In [12]:
for i in range(len(df)):
    if df['Country'][i] in df_countries['State'].values:
        df_europe=df_europe.append(df.loc[i], ignore_index=True)
df_europe

Unnamed: 0,City,Country,Name
0,Tirana,Albania,Hilton Garden Inn Tirana
1,Yerevan,Armenia,DoubleTree by Hilton Hotels Yerevan City Centre
2,Vienna,Austria,Hilton Vienna
3,Vienna,Austria,Hilton Vienna Danube Waterfront
4,Vienna,Austria,Hilton Vienna Plaza
5,Minsk,Belarus,DoubleTree by Hilton Hotel Minsk
6,Minsk,Belarus,Hampton by Hilton Minsk City Centre
7,Antwerp,Belgium,Hilton Antwerp City Centre
8,Brussels,Belgium,Hilton Brussels City
9,Brussels,Belgium,Hilton Brussels Grand Place


In [13]:
distinct_locations_europe = df_europe.groupby('City')['Name'].count().to_frame()
distinct_locations_europe.head()

Unnamed: 0_level_0,Name
City,Unnamed: 1_level_1
Aachen,1
Aberdeen,4
Adana,1
Adiyaman,1
Alamaty,1


In [14]:
distinct_locations_europe.reset_index(inplace = True)
distinct_locations_europe.columns= ['City', 'Qty_hotels']
distinct_locations_europe

Unnamed: 0,City,Qty_hotels
0,Aachen,1
1,Aberdeen,4
2,Adana,1
3,Adiyaman,1
4,Alamaty,1
5,Algarve,1
6,Amsterdam,6
7,Ankara,2
8,Antakya,1
9,Antalya,1


### Step 4: use Geopy and add Latitude and Longitude to each city

Geopy limits somehow the loops to 17 rows. Let's run the request several times to get the 231 answers.

In [171]:
!pip install geopy
from geopy.geocoders import Nominatim 



In [299]:
address = distinct_locations_europe['City'][150] # Aachen
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
distinct_locations_europe.loc[150, 'Latitude'] = location.latitude
distinct_locations_europe.loc[150, 'Longitude'] = location.longitude
distinct_locations_europe.head()

Unnamed: 0,City,Qty_hotels,Latitude,Longitude
0,Aachen,2,50.776351,6.083862
1,Aberdeen,8,57.148243,-2.092809
2,Adana,2,37.1438,35.498409
3,Adiyaman,2,37.78936,38.31411
4,Alamaty,2,43.238949,76.889709


In [244]:
for i in range(len(distinct_locations_europe)):
    address = distinct_locations_europe['City'][i]
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    if location != None:
        distinct_locations_europe.loc[i, 'Latitude'] = location.latitude
        distinct_locations_europe.loc[i, 'Longitude'] = location.longitude
        
distinct_locations_europe

KeyError: 231

In [245]:
distinct_locations_europe

Unnamed: 0,City,Qty_hotels,Latitude,Longitude
0,Aachen,2,50.776351,6.083862
1,Aberdeen,8,57.148243,-2.092809
2,Adana,2,37.143800,35.498409
3,Adiyaman,2,37.789360,38.314110
4,Alamaty,2,,
5,Algarve,2,37.245425,-8.150925
6,Amsterdam,12,52.372760,4.893604
7,Ankara,4,39.920777,32.854067
8,Antakya,2,36.219114,36.161628
9,Antalya,2,36.927965,30.727687


Let's check the missing values and correct them manually.

In [258]:
distinct_locations_europe.sort_values(['Latitude']).tail(15)

Unnamed: 0,City,Qty_hotels,Latitude,Longitude
15,Aviemore,2,57.193753,-3.82875
105,Kirov,2,58.603526,49.663903
163,Perm,2,58.884391,56.438995
190,Stockholm,2,59.325117,18.071093
197,Tallinn,2,59.437216,24.745369
174,Saint Petersburg,4,59.960674,30.158655
91,Helsinki,4,60.16741,24.942577
210,Vantaa,2,60.309187,25.036453
168,Reykjavik,6,64.145981,-21.942237
111,Krasnoyarsk,2,66.624407,94.742863


In [15]:
#Fill up the missing values manually with datas from https://www.latlong.net

distinct_locations_europe.loc[4, 'Latitude'] = 43.238949
distinct_locations_europe.loc[4, 'Longitude'] = 76.889709

distinct_locations_europe.loc[147, 'Latitude'] = 51.58774
distinct_locations_europe.loc[147, 'Longitude'] = -2.99835

distinct_locations_europe.loc[194, 'Latitude'] = 54.906101
distinct_locations_europe.loc[194, 'Longitude'] = -1.381130

distinct_locations_europe.loc[216, 'Latitude'] = 48.700001
distinct_locations_europe.loc[216, 'Longitude'] = 44.516666


Make a copy of the Data Frame as csv file

In [293]:
from project_lib import Project
project = Project(None, "....","...")

In [294]:
project.save_data(file_name = "Hilton.csv",data = distinct_locations_europe.to_csv(index=False))

{'file_name': 'Hilton.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'courseracapstone-donotdelete-pr-iwfz5ysqserla3',
 'asset_id': '34571736-df8f-4c9b-80d2-3336b740a20e'}

In [16]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,City,Qty_hotels,Latitude,Longitude
0,Aachen,2,37.245425,-8.150925
1,Aberdeen,8,57.148243,-2.092809
2,Adana,2,37.1438,35.498409
3,Adiyaman,2,37.78936,38.31411
4,Alamaty,2,43.238949,76.889709


Novosibirsk is in Russia, but not in Europe. We can drop the missing row

In [17]:
distinct_locations_europe = distinct_locations_europe.drop([154])

In [18]:
distinct_locations_europe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 230 entries, 0 to 230
Data columns (total 4 columns):
City          230 non-null object
Qty_hotels    230 non-null int64
Latitude      230 non-null float64
Longitude     230 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 9.0+ KB


After running the complete code, some errors were found on the map. Some cities have the same name. Let's correct the errors manually:

In [19]:
distinct_locations_europe[(distinct_locations_europe['City']=='Siena')|(distinct_locations_europe['City']=='Van')|(distinct_locations_europe['City']=='Sofia')|(distinct_locations_europe['City']=='Humberside')|(distinct_locations_europe['City']=='Warwick')|(distinct_locations_europe['City']=='Lincoln')]

Unnamed: 0,City,Qty_hotels,Latitude,Longitude
94,Humberside,2,43.657694,-79.492512
119,Lincoln,2,47.829907,-118.414727
182,Siena,2,35.000074,104.999927
185,Sofia,2,-15.25384,48.256216
209,Van,2,13.290403,108.426511
219,Warwick,2,41.700202,-71.416111


In [20]:
#Fill up the missing values manually with datas from https://www.latlong.net

distinct_locations_europe.loc[94, 'Latitude'] = 53.747372
distinct_locations_europe.loc[94, 'Longitude'] = -0.338653

distinct_locations_europe.loc[119, 'Latitude'] = 53.234444
distinct_locations_europe.loc[119, 'Longitude'] = -0.538611

distinct_locations_europe.loc[182, 'Latitude'] = 43.318611
distinct_locations_europe.loc[182, 'Longitude'] = 11.330556

distinct_locations_europe.loc[185, 'Latitude'] = 42.698334
distinct_locations_europe.loc[185, 'Longitude'] = 23.319941

distinct_locations_europe.loc[209, 'Latitude'] = 38.499817
distinct_locations_europe.loc[209, 'Longitude'] = 43.378143

distinct_locations_europe.loc[219, 'Latitude'] = 52.2833
distinct_locations_europe.loc[219, 'Longitude'] = -1.5833

distinct_locations_europe.loc[0, 'Latitude'] = 50.775555
distinct_locations_europe.loc[0, 'Longitude'] = 6.083611

In [21]:
df = distinct_locations_europe
df

Unnamed: 0,City,Qty_hotels,Latitude,Longitude
0,Aachen,2,50.775555,6.083611
1,Aberdeen,8,57.148243,-2.092809
2,Adana,2,37.143800,35.498409
3,Adiyaman,2,37.789360,38.314110
4,Alamaty,2,43.238949,76.889709
5,Algarve,2,37.245425,-8.150925
6,Amsterdam,12,52.372760,4.893604
7,Ankara,4,39.920777,32.854067
8,Antakya,2,36.219114,36.161628
9,Antalya,2,36.927965,30.727687


### Step 5: run the cluster using k-Means

In [22]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium==0.5.0 
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [23]:
#Define Foursquare

CLIENT_ID = '....' # your Foursquare ID
CLIENT_SECRET = '2X32A5BCJQPI1C5BFS5FHSWG0KIHMNEIJDHGYV0IHXHTFYCV' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: F0CC5OWAXLXZHWAUYGZR3PT4QA4H3TH5CGWYULIY2B4BNKYJ
CLIENT_SECRET:2X32A5BCJQPI1C5BFS5FHSWG0KIHMNEIJDHGYV0IHXHTFYCV


In [24]:
df.loc[0, 'City']
city_latitude = df.loc[0, 'Latitude'] # city latitude value
city_longitude = df.loc[0, 'Longitude'] # city longitude value

city_name = df.loc[0, 'City'] # city name

print('Latitude and longitude values of {} are {}, {}.'.format(city_name, 
                                                              city_latitude, 
                                                               city_longitude))

Latitude and longitude values of Aachen are 50.775555, 6.083611.


In [25]:
LIMIT = 200 # limit of number of venues returned by Foursquare API
radius = 600 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    city_latitude, 
    city_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=F0CC5OWAXLXZHWAUYGZR3PT4QA4H3TH5CGWYULIY2B4BNKYJ&client_secret=2X32A5BCJQPI1C5BFS5FHSWG0KIHMNEIJDHGYV0IHXHTFYCV&v=20180605&ll=50.775555,6.083611&radius=600&limit=200'

In [26]:
results = requests.get(url).json()

In [27]:
results

{'meta': {'code': 429,
  'errorType': 'quota_exceeded',
  'errorDetail': 'Quota exceeded',
  'requestId': '5e7e0517dd0f850028d6e5e2'},
 'response': {}}

In [139]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [149]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Aachener Dom St. Marien,Church,50.774702,6.084103
1,Katschhof,Plaza,50.775611,6.083909
2,Domhof,Plaza,50.774658,6.083297
3,AKL,Falafel Restaurant,50.776967,6.083277
4,Vertical Weinbar,Wine Bar,50.776288,6.081469


In [150]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

78 venues were returned by Foursquare.


In [151]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [153]:
hilton_venues = getNearbyVenues(names=df['City'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Aachen


KeyError: 'groups'

In [144]:
print(hilton_venues.shape)
hilton_venues.head()

(10416, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Aachen,50.775555,6.083611,Aachener Dom St. Marien,50.774702,6.084103,Church
1,Aachen,50.775555,6.083611,Katschhof,50.775611,6.083909,Plaza
2,Aachen,50.775555,6.083611,Domhof,50.774658,6.083297,Plaza
3,Aachen,50.775555,6.083611,AKL,50.776967,6.083277,Falafel Restaurant
4,Aachen,50.775555,6.083611,Vertical Weinbar,50.776288,6.081469,Wine Bar


In [82]:
 # Make a copy of the Data Frame as csv file

from project_lib import Project
project = Project(None, "cc243e6e-b797-456c-945c-156363111c72","p-8a5d93279d04cef93ae83211150ff54e2cec2838")

project.save_data(file_name = "Hilton_venues.csv",data = hilton_venues.to_csv(index=False), overwrite = True)

{'file_name': 'Hilton_venues.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'courseracapstone-donotdelete-pr-iwfz5ysqserla3',
 'asset_id': 'cad434ff-f793-443f-9240-307580c99e8a'}

In [28]:
#... so you can start again working on the next day even if Foursquare's quota is reached
body = client_ca49a1412575485390e9c38bb8931ea4.get_object(Bucket='courseracapstone-donotdelete-pr-iwfz5ysqserla3',Key='Hilton_venues.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

hilton_venues = pd.read_csv(body)
hilton_venues.head()


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Aachen,50.775555,6.083611,Aachener Dom St. Marien,50.774702,6.084103,Church
1,Aachen,50.775555,6.083611,Katschhof,50.775611,6.083909,Plaza
2,Aachen,50.775555,6.083611,Domhof,50.774658,6.083297,Plaza
3,Aachen,50.775555,6.083611,AKL,50.776967,6.083277,Falafel Restaurant
4,Aachen,50.775555,6.083611,Vertical Weinbar,50.776288,6.081469,Wine Bar


##### Analyze each Neighborhood

In [29]:
# one hot encoding
hilton_onehot = pd.get_dummies(hilton_venues[['Venue Category']], prefix="", prefix_sep="")

# add City column back to dataframe
hilton_onehot['Neighborhood'] = hilton_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [hilton_onehot.columns[-1]] + list(hilton_onehot.columns[:-1])
hilton_onehot = hilton_onehot[fixed_columns]

hilton_onehot.head()

Unnamed: 0,Çöp Şiş Place,ATM,Accessories Store,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,Alsatian Restaurant,...,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


#### .. and group them by occurence of each category

In [30]:
hilton_grouped = hilton_onehot.groupby('Neighborhood').mean().reset_index()
hilton_grouped

Unnamed: 0,Neighborhood,Çöp Şiş Place,ATM,Accessories Store,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,...,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Aachen,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.010000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0
1,Aberdeen,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0
2,Adana,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0
3,Alamaty,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0
4,Amsterdam,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.010000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0
5,Ankara,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.02,0.0,0.000000,0.0
6,Antakya,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0
7,Antalya,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0
8,Antwerp,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.010000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0
9,Arundel,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.0,0.000000,0.0


Make a dataframe and process it:

In [31]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [32]:
import numpy as np

In [33]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
hilton_venues_sorted = pd.DataFrame(columns=columns)
hilton_venues_sorted['Neighborhood'] = hilton_grouped['Neighborhood']

for ind in np.arange(hilton_grouped.shape[0]):
    hilton_venues_sorted.iloc[ind, 1:] = return_most_common_venues(hilton_grouped.iloc[ind, :], num_top_venues)

hilton_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aachen,Drugstore,Café,Ice Cream Shop,Bakery,Italian Restaurant,Plaza,Bar,Coffee Shop,Snack Place,German Restaurant
1,Aberdeen,Beer Bar,Bar,Seafood Restaurant,Performing Arts Venue,Sandwich Place,Supermarket,Pizza Place,Museum,Multiplex,Movie Theater
2,Adana,Moving Target,Zoo Exhibit,Flea Market,Fabric Shop,Factory,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field
3,Alamaty,Asian Restaurant,Café,Karaoke Bar,Paper / Office Supplies Store,Turkish Restaurant,Beer Bar,Grocery Store,Gym,Sports Club,Pub
4,Amsterdam,Hotel,Bar,Coffee Shop,Marijuana Dispensary,Clothing Store,Café,French Restaurant,Cocktail Bar,Italian Restaurant,Bakery


In [34]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [37]:
# set number of clusters
kclusters = 6

hilton_grouped_clustering = hilton_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, n_init=12, random_state=0).fit(hilton_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 3], dtype=int32)

In [36]:
# Reset the dataframe for more trials
hilton_venues_sorted = pd.DataFrame(columns=columns)
hilton_venues_sorted['Neighborhood'] = hilton_grouped['Neighborhood']

for ind in np.arange(hilton_grouped.shape[0]):
    hilton_venues_sorted.iloc[ind, 1:] = return_most_common_venues(hilton_grouped.iloc[ind, :], num_top_venues)

hilton_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aachen,Drugstore,Café,Ice Cream Shop,Bakery,Italian Restaurant,Plaza,Bar,Coffee Shop,Snack Place,German Restaurant
1,Aberdeen,Beer Bar,Bar,Seafood Restaurant,Performing Arts Venue,Sandwich Place,Supermarket,Pizza Place,Museum,Multiplex,Movie Theater
2,Adana,Moving Target,Zoo Exhibit,Flea Market,Fabric Shop,Factory,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field
3,Alamaty,Asian Restaurant,Café,Karaoke Bar,Paper / Office Supplies Store,Turkish Restaurant,Beer Bar,Grocery Store,Gym,Sports Club,Pub
4,Amsterdam,Hotel,Bar,Coffee Shop,Marijuana Dispensary,Clothing Store,Café,French Restaurant,Cocktail Bar,Italian Restaurant,Bakery


In [38]:
# add clustering labels
hilton_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

hilton_merged = df

# merge hilton_grouped with df to add latitude/longitude for each neighborhood
hilton_merged = hilton_merged.join(hilton_venues_sorted.set_index('Neighborhood'), on='City')

hilton_merged.head() # check the last columns!

Unnamed: 0,City,Qty_hotels,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aachen,2,50.775555,6.083611,0.0,Drugstore,Café,Ice Cream Shop,Bakery,Italian Restaurant,Plaza,Bar,Coffee Shop,Snack Place,German Restaurant
1,Aberdeen,8,57.148243,-2.092809,0.0,Beer Bar,Bar,Seafood Restaurant,Performing Arts Venue,Sandwich Place,Supermarket,Pizza Place,Museum,Multiplex,Movie Theater
2,Adana,2,37.1438,35.498409,2.0,Moving Target,Zoo Exhibit,Flea Market,Fabric Shop,Factory,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field
3,Adiyaman,2,37.78936,38.31411,,,,,,,,,,,
4,Alamaty,2,43.238949,76.889709,0.0,Asian Restaurant,Café,Karaoke Bar,Paper / Office Supplies Store,Turkish Restaurant,Beer Bar,Grocery Store,Gym,Sports Club,Pub


In [39]:
# Localize the center of the future map in Vienna
center_lat = distinct_locations_europe.loc[214, 'Latitude']
center_long = distinct_locations_europe.loc[214, 'Longitude']

In [40]:
hilton_merged['Cluster Labels'].value_counts()

0.0    127
3.0     68
4.0      2
5.0      1
1.0      1
2.0      1
Name: Cluster Labels, dtype: int64

In [41]:
hilton_merged['Cluster Labels'].values

array([ 0.,  0.,  2., nan,  0., nan,  0.,  0.,  0.,  0.,  0.,  3.,  0.,
        0., nan,  0.,  3.,  3.,  0.,  0.,  3.,  3.,  0.,  0.,  3.,  0.,
        0.,  3.,  4.,  0.,  3.,  0.,  3.,  0.,  0.,  0.,  0.,  0.,  3.,
        0.,  3.,  3., nan,  0.,  0.,  3.,  0.,  0.,  3.,  3.,  0., nan,
        3.,  3.,  3.,  3.,  0., nan,  3.,  3.,  0.,  0.,  3.,  3.,  3.,
        0.,  0.,  3.,  0.,  0.,  1., nan, nan,  0.,  0.,  3.,  0.,  0.,
        0.,  0.,  3.,  3.,  5.,  3.,  3.,  0.,  0.,  3.,  0.,  3.,  3.,
        0.,  0.,  0.,  3.,  0., nan,  3., nan,  0.,  0., nan,  0.,  4.,
        0.,  0.,  0.,  0.,  0.,  3.,  0., nan,  0., nan,  0.,  0., nan,
        0.,  0.,  0.,  0.,  3.,  0.,  0.,  0.,  3.,  3.,  3.,  0.,  3.,
        0.,  0., nan,  0.,  0., nan,  0.,  3.,  0.,  0.,  3., nan, nan,
        0.,  0.,  3.,  0.,  3.,  0.,  0.,  3.,  0.,  0.,  0.,  0., nan,
        0., nan,  0.,  0.,  0.,  0., nan,  0.,  0.,  0.,  3.,  0.,  3.,
        3.,  0.,  3., nan, nan,  0., nan, nan,  0.,  0.,  0.,  0

In [42]:
hilton_merged['Cluster Labels']= hilton_merged['Cluster Labels'].replace(np.nan, (kclusters))

In [43]:
hilton_merged['Cluster Labels']= hilton_merged['Cluster Labels'].astype('int')

In [44]:
hilton_merged['Cluster Labels'].value_counts()

0    127
3     68
6     30
4      2
5      1
2      1
1      1
Name: Cluster Labels, dtype: int64

In [45]:
#check colors

x = np.arange(kclusters+1)
ys = [i + x + (i*x)**2 for i in range(kclusters+1)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
rainbow

['#8000ff', '#2c7ef7', '#2adddd', '#80ffb4', '#d4dd80', '#ff7e41', '#ff0000']

In [46]:
# create map
map_clusters = folium.Map(width=800,height=800, location=[center_lat, center_long], zoom_start=4)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(hilton_merged['Latitude'], hilton_merged['Longitude'], hilton_merged['City'], hilton_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [130]:
# Reset the dataframe for more trials
hilton_venues_sorted = pd.DataFrame(columns=columns)
hilton_venues_sorted['Neighborhood'] = hilton_grouped['Neighborhood']

for ind in np.arange(hilton_grouped.shape[0]):
    hilton_venues_sorted.iloc[ind, 1:] = return_most_common_venues(hilton_grouped.iloc[ind, :], num_top_venues)


In [131]:
# set number of clusters
kclusters = 20

hilton_grouped_clustering = hilton_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(hilton_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([ 4,  4,  2,  4,  4,  4,  4,  4,  4, 18], dtype=int32)

In [132]:
# add clustering labels
hilton_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

hilton_merged = df

# merge hilton_grouped with df to add latitude/longitude for each neighborhood
hilton_merged = hilton_merged.join(hilton_venues_sorted.set_index('Neighborhood'), on='City')

#check the impact
hilton_merged['Cluster Labels'].value_counts()

4.0     110
18.0     42
8.0      12
5.0       9
0.0       8
12.0      5
2.0       1
3.0       1
15.0      1
16.0      1
19.0      1
9.0       1
1.0       1
11.0      1
7.0       1
13.0      1
10.0      1
6.0       1
14.0      1
17.0      1
Name: Cluster Labels, dtype: int64

In [134]:
hilton_merged['Cluster Labels'].values

array([ 4.,  4.,  2., nan,  4., nan,  4.,  4.,  4.,  4.,  4., 18.,  4.,
        4., nan, 18.,  3., 12.,  4.,  4., 18.,  4.,  4.,  4.,  0.,  4.,
       18., 12., 15.,  4., 12.,  4., 18.,  4., 18.,  4.,  4.,  4.,  0.,
        4., 18., 18., nan, 18., 18., 18.,  4.,  4.,  4., 16.,  4., nan,
       18., 18., 18., 18.,  8., nan, 18., 12.,  4.,  4., 18.,  4.,  4.,
        4.,  4., 18.,  4.,  4., 17., nan, nan,  4.,  4.,  8.,  5.,  4.,
        4.,  4., 18.,  4.,  1.,  5.,  5.,  4.,  4., 11.,  8., 18.,  4.,
        4.,  0., 18., 18.,  4., nan, 18., nan,  4.,  4., nan,  4.,  7.,
        4.,  4., 13.,  4.,  4., 10.,  4., nan,  4., nan,  4.,  4., nan,
        4.,  4., 18.,  4., 18.,  4.,  4.,  4.,  6.,  0.,  0.,  4., 18.,
        4.,  4., nan,  4.,  4., nan,  4.,  8., 18.,  4.,  5., nan, nan,
        4.,  4., 14., 18., 18.,  4.,  4.,  8., 18., 18.,  4.,  4., nan,
        4., nan,  4.,  8., 18.,  4., nan,  4.,  4.,  4., 12.,  4.,  8.,
        4.,  4.,  9., nan, nan,  4., nan, nan,  4.,  4., 18.,  4