In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

<h2>Viewing and Manipulating Data</h2>

In [2]:
#Input files, given relative to the directory the notebook is run from
features_file = 'DataFiles/features.txt'
hotel_file = 'DataFiles/city_hotel_features.txt'

#The hotel file is tab-separated; preview the first rows to confirm it parsed as expected
hotel = pd.read_csv(hotel_file, sep='\t')
hotel.head()

Unnamed: 0,Hotel_Name,City_Name,Features
0,Tanner's,atlanta,100 253 250 178 174 063 059 036 008 074 204 05...
1,Frijoleros,atlanta,250 062 132 174 063 197 071 142 234 243 075 20...
2,Indian Delights,atlanta,253 250 150 174 083 059 036 117 243 076 205 05...
3,Great Wall,atlanta,253 191 192 174 036 039 075 204 052 163
4,The Brickery,atlanta,100 253 086 231 250 191 192 059 036 215 005 00...


In [3]:
#Converting the features column into dummy variables: every space-separated code in
#'Features' becomes its own 0/1 column, and those columns take the place of the raw text column
hotel_dummies = hotel['Features'].str.get_dummies(sep=' ')
hotel = pd.concat([hotel.drop('Features',axis=1), hotel_dummies], axis=1)
hotel.head()

Unnamed: 0,Hotel_Name,City_Name,000,001,002,003,004,005,006,007,...,246,247,248,249,250,251,252,253,254,256
0,Tanner's,atlanta,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,Frijoleros,atlanta,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Indian Delights,atlanta,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,Great Wall,atlanta,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,The Brickery,atlanta,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0


<h2>Cleaning Data</h2>

In [4]:
#Collapse rows that repeat the same hotel name within a city into a single row.
#Taking the per-feature maximum keeps every indicator at 0/1 however many times a hotel is
#repeated; a sum would grow to 3 or more for hotels listed three or more times
hotel = hotel.groupby(['Hotel_Name','City_Name'],as_index=False).max()

#Identifying the list of duplicate hotel names. (One hotel can be in diff cities with diff features)
#duplicated() flags the second and later occurrences, so every shared name shows up at least once
duplicates_hotel_names = hotel.loc[hotel.duplicated('Hotel_Name'),'Hotel_Name'].tolist()

In [5]:
#Function responsible for changing the hotel names of duplicate hotels
def modifyDuplicateHotelNames(hotel_copy,hotel_duplicate_names):
    """Return the display name for one hotel row.

    hotel_copy: a row (anything indexable by 'Hotel_Name' and 'City_Name').
    hotel_duplicate_names: collection of names that must be disambiguated.

    A name found in hotel_duplicate_names comes back as '<name>(<city>)';
    any other name is returned exactly as stored in the row.
    """
    stored_name = hotel_copy['Hotel_Name']
    name_text = str(stored_name)

    #Names that are not shared need no suffix
    if name_text not in hotel_duplicate_names:
        return stored_name

    return name_text + '(' + str(hotel_copy['City_Name']) + ')'

In [6]:
#Rename every hotel whose name occurs in more than one city to '<name>(<city>)'.
#args is passed as a real one-element tuple: ([x]) is only the list [x] in parentheses
hotel['Hotel_Name'] = hotel.apply(modifyDuplicateHotelNames, axis=1,
                                  args=(duplicates_hotel_names,))

In [7]:
#Sanity check: list any hotel name that still occurs more than once
#(an empty frame means every name is now unique)
hotel_copy_duplicates = hotel.loc[hotel.duplicated(subset='Hotel_Name')]
hotel_copy_duplicates

Unnamed: 0,Hotel_Name,City_Name,000,001,002,003,004,005,006,007,...,246,247,248,249,250,251,252,253,254,256


<h2>Looks like we are good to go now</h2>

In [8]:
#Persist the cleaned data so later sessions can load the CSV directly instead of
#repeating the cleaning steps above.
#NOTE: to_csv writes the row index by default, so the file starts with an unnamed index column
hotel.to_csv(path_or_buf='Hotel_Cleaned.csv')

In [9]:
#This function gets the vector of the hotel name passed and converts it into a Numpy array
#It also calls the function named calculateSimilarityScore
def findSimilarHotels(hotel_df,hotel_name):
    """Recommend hotels that share the most features with `hotel_name`.

    hotel_df: DataFrame with a 'Hotel_Name' column; everything from the third
        column onwards is treated as the feature vector.
    hotel_name: value to look up in 'Hotel_Name' (first match is used).

    Returns whatever calculateSimilarityScore returns for n_recommendations=10.

    Raises IndexError when no row has the requested name.
    """
    hotel_vector = hotel_df[hotel_df.Hotel_Name == hotel_name]
    if hotel_vector.empty:
        #Same exception type the bare values[0, ...] lookup produced, but with a usable message
        raise IndexError("No hotel named %r in the data" % (hotel_name,))

    #Skip the two leading non-feature columns
    user_preferences = hotel_vector.values[0,2:]

    #Hand over a copy so scoring can never leak a 'score' column into the caller's frame
    return calculateSimilarityScore(hotel_df.copy(), user_preferences, 10)

In [10]:
#Scores one hotel against the preference vector
def getHotelScore(features, user_preferences):
    """Return the dot product of `features` and `user_preferences` (via np.dot)."""
    score = np.dot(features, user_preferences)
    return score

In [11]:
#Compares the given hotel vector with all the hotels and assigns a score to all hotels
def calculateSimilarityScore(hotel_df,user_preferences, n_recommendations):
    """Rank hotels by the dot product of their features with `user_preferences`.

    hotel_df: DataFrame with a 'Hotel_Name' column; every column from the third
        onwards is used as a feature. The frame is NOT modified.
    user_preferences: 1-d sequence with one value per feature column.
    n_recommendations: end of the slice taken from the ranking.

    Returns a DataFrame with columns ['Hotel_Name', 'score'], sorted by score
    descending, containing rows 1..n_recommendations-1 of the ranking. The
    top-ranked row is dropped on the assumption that it is the query hotel
    itself, so at most n_recommendations - 1 rows come back.
    """
    #List of all column features in the dataset
    features = hotel_df.columns[2:]

    #A vector sliced out of a mixed-dtype frame arrives as object dtype; make it numeric
    preferences = pd.to_numeric(np.asarray(user_preferences))

    #One matrix-vector product scores every hotel at once
    scores = hotel_df[features].to_numpy().dot(preferences)

    #Build the result on a fresh frame: writing 'score' into hotel_df would make the next
    #call treat the stale score as one more feature
    ranked = hotel_df[['Hotel_Name']].assign(score=scores)
    ranked = ranked.sort_values(by=['score'], ascending=False)
    return ranked[1:n_recommendations]

In [12]:
#Each score is the number of features a hotel has in common with the given hotel; the
#hotels with the highest number of common features are the ones we recommend
findSimilarHotels(hotel_df=hotel, hotel_name="Frijoleros")

Unnamed: 0,Hotel_Name,score
3645,Taco Mac,11
3874,Tortilla's,9
1355,Fellini's Pizza,9
1996,Kramerbooks Cafe,9
2199,Lawrence's Cafe,8
1112,De Palma's,8
1346,Fat Matt's Rib Shack,8
1724,Huey's,7
3313,SUNSET CAFE,7


<h2>That's it! We are done.</h2>