# Holiday Destination Recommender

## 1. Prepare Necessary Libraries

In [211]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [212]:
# retrieving lat and long
from geopy.geocoders import Nominatim 

# data science stuffs
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# visualization stuffs
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# utilities
import requests 
import time

## 2. Retrieve Data

### 2.1 Get Coordinates

Enter the cities you like to find something similar for:

In [213]:
visited = ["Seoul, Korea", "New York City, USA"]

Retrieve coordinates:

In [214]:
dfVisited = pd.DataFrame(columns=['Place','Latitude','Longitude'])

for place in visited:        
    geolocator = Nominatim(user_agent="coursera_hhu")
    location = geolocator.geocode(place)
    try:
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(place, latitude, longitude))
        dfVisited = dfVisited.append({'Place' : place , 'Latitude' : latitude, 'Longitude' : longitude} , ignore_index=True)
    except:
        print(place + " not found")
    
dfVisited 

The geograpical coordinate of Seoul, Korea are 37.5666791, 126.9782914.
The geograpical coordinate of New York City, USA are 40.7127281, -74.0060152.


Unnamed: 0,Place,Latitude,Longitude
0,"Seoul, Korea",37.566679,126.978291
1,"New York City, USA",40.712728,-74.006015


Load list of all cities in this world (found @ https://datahub.io/core/world-cities):

In [215]:
dfCities = pd.read_csv("world-cities.csv")

Randomly pick 100 cities and retrieve coordinates:

In [216]:
dfCities = dfCities.sample(100)

new = []

for index, row in dfCities.iterrows():
    new.append(row['name'] + ', ' + row['country'])
    
dfNew = pd.DataFrame(columns=['Place','Latitude','Longitude'])

for place in new:        
    geolocator = Nominatim(user_agent="coursera_hhu")
    location = geolocator.geocode(place)
    try:
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(place, latitude, longitude))
        dfNew = dfNew.append({'Place' : place , 'Latitude' : latitude, 'Longitude' : longitude} , ignore_index=True)
    except:
        print(place + " not found")
    time.sleep(1)
    
dfNew 

The geograpical coordinate of Elmalı, Turkey are 36.7255882, 29.94244233933763.
The geograpical coordinate of Caxito, Angola are -8.5827504, 13.6600788.
Staryy Oskol, Russia not found
The geograpical coordinate of Montauban, France are 44.0175835, 1.3549991.
The geograpical coordinate of Kaeng Khro, Thailand are 16.105979, 102.4417476.
The geograpical coordinate of Boysun, Uzbekistan are 38.2040894, 67.2033601.
The geograpical coordinate of San Dimas, United States are 34.1066756, -117.8067257.
The geograpical coordinate of Coishco, Peru are -9.0287109, -78.61773976604151.
The geograpical coordinate of Romblon, Philippines are 12.5, 122.25.
The geograpical coordinate of Arraial do Cabo, Brazil are -22.9662839, -42.024427.
The geograpical coordinate of Uravakonda, India are 14.8722142, 77.26206804106354.
The geograpical coordinate of Wazīrābād, Pakistan are 32.4417427, 74.1182117.
The geograpical coordinate of Isehara, Japan are 35.4023968, 139.2996106.
The geograpical coordinate of Iwa

Unnamed: 0,Place,Latitude,Longitude
0,"Elmalı, Turkey",36.725588,29.942442
1,"Caxito, Angola",-8.582750,13.660079
2,"Montauban, France",44.017584,1.354999
3,"Kaeng Khro, Thailand",16.105979,102.441748
4,"Boysun, Uzbekistan",38.204089,67.203360
5,"San Dimas, United States",34.106676,-117.806726
6,"Coishco, Peru",-9.028711,-78.617740
7,"Romblon, Philippines",12.500000,122.250000
8,"Arraial do Cabo, Brazil",-22.966284,-42.024427
9,"Uravakonda, India",14.872214,77.262068


### 2.2 Get Venues

Prepare requests to Foursquare Places API

In [217]:
CLIENT_ID = 'C30IS2UE2KULOT4PMMQ24TZQWFKP0Y4PLPKI4NNQ2X2CBZIT' # your Foursquare ID
CLIENT_SECRET = 'OYIZN0VZFPOKH4G3SFHUHL2AIKPZRSR4CPV4X5HW4F5NTN1Q' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: C30IS2UE2KULOT4PMMQ24TZQWFKP0Y4PLPKI4NNQ2X2CBZIT
CLIENT_SECRET:OYIZN0VZFPOKH4G3SFHUHL2AIKPZRSR4CPV4X5HW4F5NTN1Q


In [218]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        try:
            # make the GET request
            results = requests.get(url).json()["response"]['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
        except:
            print(name + " not found")

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Place', 
                  'Place Latitude', 
                  'Place Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Retrieve venues for visited and new places:

In [219]:
LIMIT = 100

visited_venues = getNearbyVenues(names=dfVisited['Place'],
                                   latitudes=dfVisited['Latitude'],
                                   longitudes=dfVisited['Longitude']
                                  )

new_venues = getNearbyVenues(names=dfNew['Place'],
                                   latitudes=dfNew['Latitude'],
                                   longitudes=dfNew['Longitude']
                                  )

Seoul, Korea
New York City, USA
Elmalı, Turkey
Caxito, Angola
Montauban, France
Kaeng Khro, Thailand
Boysun, Uzbekistan
San Dimas, United States
Coishco, Peru
Romblon, Philippines
Arraial do Cabo, Brazil
Uravakonda, India
Wazīrābād, Pakistan
Isehara, Japan
Iwai, Japan
Brockville, Canada
Zomba, Malawi
Xoxocotla, Mexico
Rembertów, Poland
Hirekerūr, India
Bharatpur, Nepal
Magnitogorsk, Russia
Lexington, United States
Westchester, United States
Bélabo, Cameroon
Baja, Hungary
Mainz, Germany
Kazan, Turkey
Batman, Turkey
Broomfield, United States
Savelugu, Ghana
Guaíra, Brazil
Reigate, United Kingdom
Linhares, Brazil
Rakvere, Estonia
Fort Gloster, India
Sarajevo, Bosnia and Herzegovina
Araraquara, Brazil
North Babylon, United States
Bagalkot, India
Sidlaghatta, India
Ihiala, Nigeria
Calumpit, Philippines
Djamaa, Algeria
Bishop Auckland, United Kingdom
Tulun, Russia
Kashihara, Japan
Sydney, Australia
Ōzu, Japan
Sigmaringen, Germany
Depok, Indonesia
Pompano Beach, United States
Chokwé, Mozambiq

## 3. Feature Transformation

### 3.1 One Hot Encoding of Venue Categories

In [220]:
# one hot encoding
visited_onehot = pd.get_dummies(visited_venues[['Venue Category']], prefix="", prefix_sep="")

visited_onehot.insert(0, 'Place', visited_venues['Place'] , True) 

visited_onehot

Unnamed: 0,Place,American Restaurant,Antique Shop,Art Gallery,Art Museum,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bar,...,Sushi Restaurant,Taco Place,Tea Room,Theater,Tourist Information Center,Vegetarian / Vegan Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"Seoul, Korea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [221]:
new_onehot = pd.get_dummies(new_venues[['Venue Category']], prefix="", prefix_sep="")

new_onehot.insert(0, 'Place', new_venues['Place'] , True) 

new_onehot

Unnamed: 0,Place,ATM,Acai House,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,...,Turkish Coffeehouse,Turkish Home Cooking Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Wagashi Place,Water Park,Wine Bar,Wine Shop,Winery
0,"Montauban, France",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Montauban, France",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Montauban, France",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Montauban, France",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Montauban, France",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,"Montauban, France",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"Montauban, France",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"Boysun, Uzbekistan",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,"San Dimas, United States",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"San Dimas, United States",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3.2 Calculating the Frequency of Venue Categories

In [222]:
visited_onehot = visited_onehot.loc[:,~visited_onehot.columns.duplicated()]

visited_grouped = visited_onehot.groupby('Place').mean().reset_index()
visited_grouped

Unnamed: 0,Place,American Restaurant,Antique Shop,Art Gallery,Art Museum,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bar,...,Sushi Restaurant,Taco Place,Tea Room,Theater,Tourist Information Center,Vegetarian / Vegan Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"New York City, USA",0.02,0.01,0.01,0.0,0.0,0.01,0.01,0.01,0.02,...,0.01,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.01,0.02
1,"Seoul, Korea",0.0,0.0,0.0,0.03,0.01,0.0,0.01,0.02,0.0,...,0.01,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0


In [223]:
new_onehot = new_onehot.loc[:,~new_onehot.columns.duplicated()]

new_grouped = new_onehot.groupby('Place').mean().reset_index()
new_grouped

Unnamed: 0,Place,ATM,Acai House,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,...,Turkish Coffeehouse,Turkish Home Cooking Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Wagashi Place,Water Park,Wine Bar,Wine Shop,Winery
0,"Araraquara, Brazil",0.000000,0.021277,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.021277,0.021277,0.000000,0.00,0.000000,0.000000,0.021277
1,"Aringay, Philippines",0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
2,"Arraial do Cabo, Brazil",0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
3,"Baja, Hungary",0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.052632,0.000000,0.000000
4,"Bharatpur, Nepal",0.000000,0.000000,0.000000,0.0,0.5,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
5,"Bishop Auckland, United Kingdom",0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
6,"Bloomingdale, United States",0.000000,0.000000,0.000000,0.0,0.0,0.050000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
7,"Bogotol, Russia",0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
8,"Boysun, Uzbekistan",0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
9,"Brockville, Canada",0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000


## 4. K-Means Clustering 

### 4.1 Calculate Cosine Similarity Between New and Visited Places

In [224]:
visited_grouped_numerical = visited_grouped.drop("Place", 1)
new_grouped_numerical = new_grouped.drop("Place", 1)

new_grouped_numerical = new_grouped_numerical.loc[:,visited_grouped_numerical.columns]
new_grouped_numerical = new_grouped_numerical.fillna(0.0)

dfSimilarity = pd.DataFrame(data=cosine_similarity(new_grouped_numerical, visited_grouped_numerical),columns=visited)

print(dfSimilarity)

    Seoul, Korea  New York City, USA
0       0.337350            0.161301
1       0.113228            0.082385
2       0.288906            0.235435
3       0.323575            0.235435
4       0.000000            0.000000
..           ...                 ...
69      0.000000            0.047565
70      0.130744            0.000000
71      0.254238            0.302703
72      0.320513            0.055970
73      0.175412            0.297805

[74 rows x 2 columns]


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


### 4.2 Calculate Optimal K Using Silhouette Method

In [225]:
scores = []
kmax = 10

for k in range(2, kmax+1):
  kmeans = KMeans(n_clusters = k).fit(dfSimilarity)
  labels = kmeans.labels_
  scores.append(silhouette_score(dfSimilarity, labels, metric = 'euclidean'))

scores

[0.5020454321220941,
 0.42480863696203264,
 0.40643788773737993,
 0.41563282900239734,
 0.4153932306911086,
 0.4164377589317636,
 0.4289493302931211,
 0.47224848956761084,
 0.46072110410272216]

In [226]:
opt_k = scores.index(max(scores))+2

opt_k

1

### 4.3 Run K-Means

In [227]:
# run k-means clustering
kmeans = KMeans(n_clusters=opt_k, random_state=0).fit(dfSimilarity)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [228]:
new_clustered = new_grouped.loc[:,["Place"]]
new_clustered.insert(1, 'Cluster Labels', kmeans.labels_)

new_clustered

Unnamed: 0,Place,Cluster Labels
0,"Araraquara, Brazil",0
1,"Aringay, Philippines",0
2,"Arraial do Cabo, Brazil",0
3,"Baja, Hungary",0
4,"Bharatpur, Nepal",0
5,"Bishop Auckland, United Kingdom",0
6,"Bloomingdale, United States",0
7,"Bogotol, Russia",0
8,"Boysun, Uzbekistan",0
9,"Brockville, Canada",0


## 5. Visualization

### 5.1 Prepare Data for Visualization

Merge cities data with cluster labels:

In [229]:
dfNew_clustered = pd.merge(dfNew, new_clustered, on='Place', how="inner")

dfNew_clustered.sort_values(by="Cluster Labels")

Unnamed: 0,Place,Latitude,Longitude,Cluster Labels
0,"Montauban, France",44.017584,1.354999,0
52,"La Victoria, Venezuela",10.227684,-67.324636,0
51,"Catalão, Brazil",-18.170255,-47.944708,0
50,"Hachinohe, Japan",40.512239,141.488296,0
49,"Aringay, Philippines",16.395717,120.355381,0
48,"Liberty, United States",30.085674,-94.785626,0
47,"Warnes, Bolivia",-17.510082,-63.164718,0
53,"Pecangaan, Indonesia",-6.692985,111.231073,0
46,"Torrelavega, Spain",43.348730,-4.051508,0
44,"Kempston, United Kingdom",52.119201,-0.493015,0


### 5.2 Draw Map with Clustered Cities

In [230]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=1)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dfNew_clustered['Latitude'], dfNew_clustered['Longitude'], dfNew_clustered['Place'], dfNew_clustered['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 6. Analysis of Clusters

In [231]:
dfSimilarity_clustered = dfSimilarity.copy()
dfSimilarity_clustered.insert(0, 'Cluster Labels', kmeans.labels_)

dfMeans = pd.DataFrame(columns=['Seoul, Korea', 'New York City, USA'])
dfMeans['Seoul, Korea'] = dfSimilarity_clustered.groupby(['Cluster Labels'])['Seoul, Korea'].mean()
dfMeans['New York City, USA'] = dfSimilarity_clustered.groupby(['Cluster Labels'])['New York City, USA'].mean()

dfMeans

Unnamed: 0_level_0,"Seoul, Korea","New York City, USA"
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.221021,0.160717
