# Loading more information about the hotels

## Extract data from the metadata table in the sqlite database

In [1]:
# Dependencies
import sqlite3

import nltk
import numpy as np
import pandas as pd
# clear_output is imported from its public location; importing it from
# IPython.core.display is a deprecated path.
from IPython.display import clear_output
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer models used by word_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Create a connection to the database
conn = sqlite3.connect("Data/Hotels.db")

# Load the database table into a pandas dataframe.
# The try/finally releases the connection even when the query fails.
try:
    metadata = pd.read_sql_query("select * from metadata;", conn)
finally:
    conn.close()

In [3]:
# Explicit lookup of US state (and DC) names by postal abbreviation.
# The names are keyed by abbreviation rather than paired positionally with the
# sorted abbreviations found in the data, so an unexpected or missing province
# cannot shift every following state name by one.
US_STATE_NAMES = {
    "AK": "Alaska", "AL": "Alabama", "AR": "Arkansas", "AZ": "Arizona",
    "CA": "California", "CO": "Colorado", "CT": "Connecticut",
    "DC": "District of Columbia", "DE": "Delaware", "FL": "Florida",
    "GA": "Georgia", "HI": "Hawaii", "IA": "Iowa", "ID": "Idaho",
    "IL": "Illinois", "IN": "Indiana", "KS": "Kansas", "KY": "Kentucky",
    "LA": "Louisiana", "MA": "Massachusetts", "MD": "Maryland", "ME": "Maine",
    "MI": "Michigan", "MN": "Minnesota", "MO": "Missouri", "MS": "Mississippi",
    "MT": "Montana", "NC": "North Carolina", "ND": "North Dakota",
    "NE": "Nebraska", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NV": "Nevada", "NY": "New York", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania",
    "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota",
    "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VA": "Virginia",
    "VT": "Vermont", "WA": "Washington", "WI": "Wisconsin",
    "WV": "West Virginia", "WY": "Wyoming",
}

# Create a dictionary of state names and their abbreviations, restricted to
# the abbreviations that actually occur in the metadata
state_abbr = sorted(set(metadata["province"]))
state_dict = {abbr: US_STATE_NAMES[abbr] for abbr in state_abbr if abbr in US_STATE_NAMES}
state_name = list(state_dict.values())

# Add a column containing state names (provinces without a known name become NaN)
metadata["State"] = metadata["province"].map(state_dict)

# Capitalise the name, address and city column titles
metadata = metadata.rename(columns={'name': 'Name', 'address': 'Street', 'city': 'City'})

# Preview the dataframe
metadata.head()

Unnamed: 0,index,Name,categories,primaryCategories,Street,City,province,latitude,longitude,websites,State
0,0,Rancho Valencia Resort Spa,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,5921 Valencia Cir,Rancho Santa Fe,CA,32.990959,-117.186136,http://www.ranchovalencia.com,California
1,3,Aloft Arundel Mills,"Hotels,Hotels and motels,Travel agencies and b...",Accommodation & Food Services,7520 Teague Rd,Hanover,MD,39.155929,-76.716341,http://www.starwoodhotels.com/alofthotels/prop...,Maryland
2,9,Hampton Inn Suites PortlandVancouver,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,http://hamptoninn3.hilton.com/en/hotels/washin...,Washington
3,15,Hotel Phillips,"Hotels,Caterers,Hotels and motels,Hotel,Restau...",Accommodation & Food Services,106 W 12th St,Kansas City,MO,39.100119,-94.584701,http://curiocollection3.hilton.com/en/hotels/m...,Missouri
4,20,The Inn at Solvang,"Bed Breakfasts,Bed Breakfast,Hotels Motels,Hotel",Accommodation & Food Services,10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,http://www.solvang.com,Pennsylvania


## Add hotel characteristics

### Number of floors

In [4]:
# Load data from FEMA containing number of floors, discarding the columns
# that are not important for the purposes of the study
df_floors = pd.read_csv("Data/national_list.csv").drop(
    columns = ["FEMA ID", "P.O. Box", "ZIP code", "Phone", "Property Type", "Sprinklers"]
)

# Preview the dataframe
df_floors.head()

Unnamed: 0,Name,Street,City,State,Stories
0,The Orchards,222 Adams Rd,Williamstown,Massachusetts,2
1,Courtyard By Marriott Fitchburg,150 Royal Plaza Dr,Fitchburg,Massachusetts,6
2,Four Points Hotel,99 Erdman Way,Leominster,Massachusetts,7
3,Crowne Plaza Worcester,10 Lincoln Sq,Worcester,Massachusetts,9
4,Bedford Plaza Hotel - Boston,340 Great Rd,Bedford,Massachusetts,3


In [12]:
# Left-join the FEMA floor counts onto the metadata, matching on city, street and state
df3 = metadata.merge(df_floors, how = "left", on = ["City", "Street", "State"])

# Remove columns not going to be used
df3 = df3.drop(columns = ["index", "websites", "primaryCategories"])

# Keep only the first row for every (metadata hotel name, city) pair
df3 = df3.drop_duplicates(subset = ["Name_x", "City"], keep = "first")

1868

### Hotel stars

In [None]:
# Load hotel data into a dataframe and, in one chain:
#   1. remove the columns that are not needed
#   2. rename columns
#   3. filter for hotels in the USA
#   4. drop the country name, which is constant after the filter
df_hotels = (
    pd.read_csv("Data/hotels.csv")
    .drop(columns = ["price", "countryCode", "location", "url", "latitude", "longitude"])
    .rename(columns = {"hotelName": "Name", "cityName": "City", "address": "Street"})
    .loc[lambda frame: frame["countryName"] == "United States"]
    .drop(columns = ["countryName"])
)
df_hotels.head()

In [None]:
# Left-join the hotel star data onto df3 by hotel name.
# df3 has no "Name" column: its earlier merge with the FEMA data suffixed the
# two name columns to "Name_x" (metadata) and "Name_y" (FEMA). The metadata
# name is therefore exposed as "Name" for this join; df3 itself is not modified.
df4 = pd.merge(df3.rename(columns = {"Name_x": "Name"}), df_hotels, on = ["Name"], how = "left")
df4.loc[21:40]

In [None]:
# Hotel name stored in the row labelled 177
df4.loc[177, "Name"]

In [None]:
# Rows whose metadata city contains "Volcano"
df4[df4["City_x"].str.contains("Volcano")]

### Distance from airport

In [None]:
# Latitudes and longitudes of airports mapped in the USA
# Source: https://opendata.socrata.com/dataset/Airport-Codes-mapped-to-Latitude-Longitude-in-the-/rxrh-4cxm
path = "Data/Airport_Codes_Coords_USA.csv"
airports = pd.read_csv(filepath_or_buffer = path)

airports.head()

In [None]:
# Report how many airport rows were loaded
print(f"Number of airports mapped in USA: {len(airports)}")

In [None]:
# Haversine formula to calculate distance
# Source1: https://stackoverflow.com/a/41337005
# Source2: https://stackoverflow.com/a/21623206
from math import cos, asin, sqrt

def distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance, in km, between two points.

    All four arguments are coordinates expressed in degrees.
    """
    deg_to_rad = 0.017453292519943295  # pi/180; factor to convert degrees to radians
    earth_diameter_km = 12742  # 2 * R; R = 6371km (mean radius of the earth)

    lat_term = cos((lat2 - lat1) * deg_to_rad) / 2
    lon_term = cos(lat1 * deg_to_rad) * cos(lat2 * deg_to_rad) * (1 - cos((lon2 - lon1) * deg_to_rad)) / 2
    haversine = 0.5 - lat_term + lon_term

    return earth_diameter_km * asin(sqrt(haversine))

# Find the minimum distance between the hotel and the closest airport
def min_distance(lat, lon):
    """Return the distance, in km, from (lat, lon) to the closest airport.

    Airport coordinates are read from the "Latitude" and "Longitude" columns
    of the module-level `airports` dataframe. The columns are iterated
    directly instead of being looked up by integer label on every step, so
    the function does not depend on `airports` having a 0..n-1 index and
    avoids a per-element Series lookup.

    Raises ValueError (from min) when `airports` has no rows.
    """
    # NOTE(review): absolute values are taken on both the hotel and the airport
    # coordinates, presumably because the two sources do not share a sign
    # convention for longitude -- confirm against the airport file.
    hotel_lat, hotel_lon = abs(lat), abs(lon)
    return min(
        distance(hotel_lat, hotel_lon, abs(airport_lat), abs(airport_lon))
        for airport_lat, airport_lon in zip(airports["Latitude"], airports["Longitude"])
    )

In [None]:
# For each hotel coordinate, calculate the distance to the nearest airport.
# The coordinate columns are iterated directly (with enumerate for the progress
# counter), so the loop does not rely on metadata having a 0..n-1 index.
airport_distance = []
for i, (hotel_lat, hotel_lon) in enumerate(zip(metadata["latitude"], metadata["longitude"])):
    dist = min_distance(hotel_lat, hotel_lon)
    airport_distance.append(dist)

    # The loop iterates over hotels, so the progress message names hotels
    print(f"Now processing {i}th hotel.\n-----")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

In [None]:
# Attach the nearest-airport distances (in km) to metadata as a new column
metadata["airportDistance_km"] = airport_distance

# Show the first rows of the updated dataframe
metadata.head(n = 5)

### Hotel features

In [None]:
# Tokenise the categories of every hotel
cats = [word_tokenize(row) for row in metadata["categories"]]

# Lower-case every token and store the token lists in a new column
cats2 = [[word.lower() for word in cat] for cat in cats]
metadata["categories2"] = cats2

In [None]:
# Preview the dataframe with the tokenised categories
metadata.head(n = 5)

In [None]:
# Create a list of unique words to choose features from:
# flatten the per-hotel token lists, then de-duplicate
word_list = [word for row in metadata["categories2"] for word in row]

unique_terms = list(set(word_list))
unique_terms

In [None]:
# Categories/features for the hotels (manually picked from the unique terms),
# kept in alphabetical order
features = sorted([
    'resort', 'movie', 'marina', 'harbor', 'reception',
    'convention', 'health', 'hall', 'extended', 'lodges',
    'chalets', 'cemetery', 'skiing', 'theater', 'campground',
    'entertainment', 'clinics', 'cabins', 'parties', 'lodge',
    'nightclub', 'services', 'airport', 'spas', 'hotel', 'pools',
    'attractions', 'meeting', 'utility', 'condominiums', 'cable',
    'office', 'village', 'fashion', 'loft', 'chapels', 'fairgrounds',
    'boutique', 'gym', 'motel', 'bars', 'e-commerce',
    'golf', 'apartment', 'medical', 'pubs', 'cottages', 'pet', 'lakeview',
    'restaurant', 'wedding', 'fitness', 'recreation', 'receptions',
    'reservations', 'casino', 'family-friendly', 'breakfast',
    'beach', 'karaoke',
])

# Add one empty (NaN) column per feature, to be filled in later
for feat in features:
    metadata[feat] = np.nan

In [None]:
# Create a function that fills in 1s and 0s for selected categories
def Cat_encoding(category):
    """Fill the `category` column of `metadata` with "1"/"0" flags.

    A hotel gets "1" when `category` is one of the tokens in its
    "categories2" list and "0" otherwise. The whole column is assigned in a
    single statement: the previous element-wise `metadata[col][i] = ...` was a
    chained assignment, which pandas does not guarantee to write back to the
    dataframe.

    Progress is reported once per category instead of once per hotel.
    """
    metadata[str(category)] = [
        "1" if category in tokens else "0" for tokens in metadata["categories2"]
    ]

    print(f"Finished processing {len(metadata)} hotels for {category}.\n-----")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

In [None]:
# Fill every (still empty) feature column by running Cat_encoding on it
for feature_name in features:
    Cat_encoding(feature_name)

In [None]:
# Alphabetically sorted list of the unique provinces
prov_list = sorted(set(metadata["province"]))

# Add one empty (NaN) column per province, to be filled in later
for prov in prov_list:
    metadata[prov] = np.nan

In [None]:
# Create a function that fills in 1s and 0s for selected provinces
def Prov_encoding(province):
    """Fill the `province` column of `metadata` with "1"/"0" flags.

    A hotel gets "1" when its "province" value equals `province` and "0"
    otherwise. Equality is used instead of the previous substring test, so a
    code can only match itself. The whole column is assigned in a single
    statement: the previous element-wise `metadata[col][i] = ...` was a
    chained assignment, which pandas does not guarantee to write back to the
    dataframe.

    Progress is reported once per province instead of once per hotel.
    """
    metadata[str(province)] = [
        "1" if value == province else "0" for value in metadata["province"]
    ]

    print(f"Finished processing {len(metadata)} hotels for {province}.\n-----")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

In [None]:
# Fill every (still empty) province column by running Prov_encoding on it
for province_code in prov_list:
    Prov_encoding(province_code)

In [None]:
# Preview the dataframe with the encoded feature and province columns
metadata.head(n = 5)

## Load the metadata with new information into a new database table 

In [None]:
# Collect the metadata column labels into a plain list
columns = metadata.columns.tolist()

In [None]:
# Helper that casts one metadata column to strings
def to_str(column_name):
    """Replace the `column_name` column of `metadata` with its values cast to str."""
    as_text = metadata[column_name].astype(str)
    metadata[column_name] = as_text

In [None]:
# Cast the values of every metadata column to strings, one column at a time
for column in columns:
    to_str(column_name = column)

In [None]:
# Remove the raw category, website and token-list columns from metadata
metadata = metadata.drop(
    labels = ["categories", "primaryCategories", "websites", "categories2"],
    axis = "columns",
)
metadata.head()

In [None]:
# Open the sqlite database again
conn = sqlite3.connect("Data/Hotels.db")

# Write the dataframe to the "metadata2" table, replacing the table if it
# already exists and leaving out the dataframe index
metadata.to_sql(name = "metadata2", con = conn, if_exists = "replace", index = False)

In [None]:
# Read the first three rows back from the new table
pd.read_sql_query(sql = "select * from metadata2 limit 3;", con = conn)

In [None]:
# Release the database connection now that the table has been written and checked
conn.close()