# Loading more information about the hotels

## Extract data from the metadata table in the sqlite database

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import sqlite3
from IPython.core.display import clear_output

import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rochiecuevas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Create a connection to the database
conn = sqlite3.connect("Data/Hotels.db")

# Load the database table into a pandas dataframe. The finally clause
# releases the connection even when the query raises (e.g. missing table),
# so a failed run does not leave the database file open.
try:
    metadata = pd.read_sql_query("select * from metadata;", conn)
finally:
    conn.close()

In [3]:
# Look-up table of postal abbreviations -> state names.
# Every pair is spelled out so that a name stays attached to its own
# abbreviation regardless of which states are present in the data
# (pairing two separately ordered lists by position silently shifts every
# name as soon as one abbreviation is missing or an extra one appears).
STATE_NAMES = {
    "AK": "Alaska", "AL": "Alabama", "AR": "Arkansas", "AZ": "Arizona",
    "CA": "California", "CO": "Colorado", "CT": "Connecticut",
    "DC": "District of Columbia", "DE": "Delaware", "FL": "Florida",
    "GA": "Georgia", "HI": "Hawaii", "IA": "Iowa", "ID": "Idaho",
    "IL": "Illinois", "IN": "Indiana", "KS": "Kansas", "KY": "Kentucky",
    "LA": "Louisiana", "MA": "Massachusetts", "MD": "Maryland", "ME": "Maine",
    "MI": "Michigan", "MN": "Minnesota", "MO": "Missouri", "MS": "Mississippi",
    "MT": "Montana", "NC": "North Carolina", "ND": "North Dakota",
    "NE": "Nebraska", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NV": "Nevada", "NY": "New York", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania",
    "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota",
    "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VA": "Virginia",
    "VT": "Vermont", "WA": "Washington", "WI": "Wisconsin",
    "WV": "West Virginia", "WY": "Wyoming",
}

# Abbreviations that actually occur in the data (missing values skipped so
# that sorting never compares NaN with a string) and their matching names;
# an abbreviation absent from STATE_NAMES maps to None instead of a wrong name
state_abbr = sorted(metadata["province"].dropna().unique())
state_name = [STATE_NAMES.get(abbr) for abbr in state_abbr]

state_dict = dict(zip(state_abbr, state_name))

# Add a column containing state names
metadata["State"] = metadata["province"].map(state_dict)

# Rename the name, address and city columns to Name, Street and City
metadata = metadata.rename(columns={'name': 'Name', 'address': 'Street', 'city': 'City'})

# Preview the dataframe
metadata.head()

Unnamed: 0,index,Name,categories,primaryCategories,Street,City,province,latitude,longitude,websites,State
0,0,Rancho Valencia Resort Spa,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,5921 Valencia Cir,Rancho Santa Fe,CA,32.990959,-117.186136,http://www.ranchovalencia.com,California
1,3,Aloft Arundel Mills,"Hotels,Hotels and motels,Travel agencies and b...",Accommodation & Food Services,7520 Teague Rd,Hanover,MD,39.155929,-76.716341,http://www.starwoodhotels.com/alofthotels/prop...,Maryland
2,9,Hampton Inn Suites PortlandVancouver,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,http://hamptoninn3.hilton.com/en/hotels/washin...,Washington
3,15,Hotel Phillips,"Hotels,Caterers,Hotels and motels,Hotel,Restau...",Accommodation & Food Services,106 W 12th St,Kansas City,MO,39.100119,-94.584701,http://curiocollection3.hilton.com/en/hotels/m...,Missouri
4,20,The Inn at Solvang,"Bed Breakfasts,Bed Breakfast,Hotels Motels,Hotel",Accommodation & Food Services,10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,http://www.solvang.com,Pennsylvania


## Add hotel characteristics

### Number of floors

In [4]:
# FEMA columns that play no part in this study
fema_unused = ["FEMA ID", "P.O. Box", "ZIP code", "Phone", "Property Type", "Sprinklers"]

# Read the FEMA list (it carries the number of floors) and trim it in one step
df_floors = pd.read_csv("Data/national_list.csv").drop(columns = fema_unused)

# Preview the dataframe
df_floors.head()

Unnamed: 0,Name,Street,City,State,Stories
0,The Orchards,222 Adams Rd,Williamstown,Massachusetts,2
1,Courtyard By Marriott Fitchburg,150 Royal Plaza Dr,Fitchburg,Massachusetts,6
2,Four Points Hotel,99 Erdman Way,Leominster,Massachusetts,7
3,Crowne Plaza Worcester,10 Lincoln Sq,Worcester,Massachusetts,9
4,Bedford Plaza Hotel - Boston,340 Great Rd,Bedford,Massachusetts,3


In [5]:
# Attach the FEMA records to the metadata by matching city, street and state.
# The left join keeps every metadata row, with or without a FEMA match.
df3 = (
    pd.merge(metadata, df_floors, how = "left", on = ["City", "Street", "State"])
    # columns that are not going to be used
    .drop(columns = ["index", "websites", "primaryCategories"])
    # both inputs have a Name column, so the merge produced Name_x / Name_y;
    # keep the first row for every (Name_x, City) pair
    .drop_duplicates(subset = ["Name_x", "City"], keep = "first")
    .rename(columns = {"Name_x": "Name"})
)

df3.head()

Unnamed: 0,Name,categories,Street,City,province,latitude,longitude,State,Name_y,Stories
0,Rancho Valencia Resort Spa,"Hotels,Hotels and motels,Hotel and motel reser...",5921 Valencia Cir,Rancho Santa Fe,CA,32.990959,-117.186136,California,,
1,Aloft Arundel Mills,"Hotels,Hotels and motels,Travel agencies and b...",7520 Teague Rd,Hanover,MD,39.155929,-76.716341,Maryland,Aloft Arundel Mills,7.0
2,Hampton Inn Suites PortlandVancouver,"Hotels,Hotels and motels,Hotel and motel reser...",315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,Washington,Hampton Inn Suites PortlandVancouver,4.0
3,Hotel Phillips,"Hotels,Caterers,Hotels and motels,Hotel,Restau...",106 W 12th St,Kansas City,MO,39.100119,-94.584701,Missouri,Hotel Phillips,20.0
4,The Inn at Solvang,"Bed Breakfasts,Bed Breakfast,Hotels Motels,Hotel",10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,Pennsylvania,,


### Hotel stars

In [6]:
# Columns of hotels.csv that are not needed
unused_cols = ["price", "countryCode", "location", "url", "latitude", "longitude"]

# Read the hotel data, trim it and give the columns the names used elsewhere
df_hotels = (
    pd.read_csv("Data/hotels.csv")
    .drop(columns = unused_cols)
    .rename(columns = {"hotelName": "Name", "cityName": "City", "address": "Street"})
)

# Keep only the hotels in the USA; afterwards the country column is constant
in_usa = df_hotels["countryName"] == "United States"
df_hotels = df_hotels.loc[in_usa].drop(columns = ["countryName"])
df_hotels.head()

Unnamed: 0,Name,stars,City,Street
0,Kona Village Resort Kailua Kona,4.0,Kailua Kona,Queen Kaahumanu Highway
1,Aarons Cottage,2.0,Hilo,54 Keokea Loop Rd
2,Aloha Crater Lodge,3.0,Volcano,11-3966 Lanihuli Rd P.O.Box 92
3,Aloha Place,2.5,Volcano,19-3820 Old Volcano Rd
4,Artist Cottage at Volcano Garden Arts,3.5,Volcano,19-3834 Old Volcano Rd


In [7]:
# Bring the star ratings from df_hotels into df3 by matching on the hotel name.
# NOTE(review): Name is the only join key, so identically named hotels in
# different cities would be matched to each other - confirm this is intended.
df4 = (
    pd.merge(df3, df_hotels, how = "left", on = ["Name"])
    # redundant columns: the FEMA name plus the city/street copies from df_hotels
    .drop(columns = ["Name_y", "City_y", "Street_y"])
    # restore the plain names of the street and city columns
    .rename(columns = {"Street_x": "Street", "City_x": "City"})
)

# Preview the new dataframe
df4.head()

Unnamed: 0,Name,categories,Street,City,province,latitude,longitude,State,Stories,stars
0,Rancho Valencia Resort Spa,"Hotels,Hotels and motels,Hotel and motel reser...",5921 Valencia Cir,Rancho Santa Fe,CA,32.990959,-117.186136,California,,4.0
1,Aloft Arundel Mills,"Hotels,Hotels and motels,Travel agencies and b...",7520 Teague Rd,Hanover,MD,39.155929,-76.716341,Maryland,7.0,4.0
2,Hampton Inn Suites PortlandVancouver,"Hotels,Hotels and motels,Hotel and motel reser...",315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,Washington,4.0,
3,Hotel Phillips,"Hotels,Caterers,Hotels and motels,Hotel,Restau...",106 W 12th St,Kansas City,MO,39.100119,-94.584701,Missouri,20.0,4.0
4,The Inn at Solvang,"Bed Breakfasts,Bed Breakfast,Hotels Motels,Hotel",10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,Pennsylvania,,


### Distance from airport

In [8]:
# Dependencies for calculating distances on Earth (sphere) using haversine formula
# Source: https://stackoverflow.com/a/21623206
from math import cos, asin, sqrt, pi

def distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, in km (haversine formula).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance in kilometres on a sphere of radius 6371 km. NaN inputs
        give a NaN result.
    """
    p = pi/180 # factor to convert degrees to radians
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2

    # Mathematically 0 <= a <= 1, but floating-point round-off can land a hair
    # outside that range (e.g. for near-antipodal points), which would make
    # sqrt/asin raise ValueError. The comparisons are False for NaN, so a NaN
    # is left alone and still propagates to the result.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    return 12742 * asin(sqrt(a)) # Earth diameter: 12742 = 2 * R; R = 6371km (mean radius of the earth)

# Find the minimum distance between the hotel and the closest airport
def min_distance(lat, lon, airport_table=None):
    """Return the distance in km from a point to the closest airport.

    The haversine distance to every airport is evaluated in one vectorised
    numpy pass instead of one Python-level lookup per airport, and the
    coordinate columns are read by position, so the airport table does not
    need a 0..n-1 index.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the point (hotel), in decimal degrees.
    airport_table : DataFrame, optional
        Table with "Latitude" and "Longitude" columns in decimal degrees.
        Defaults to the notebook-level ``airports`` dataframe.

    Returns
    -------
    float
        Smallest distance in km (sphere of radius 6371 km). Airports with
        missing coordinates are ignored.

    Raises
    ------
    ValueError
        If the airport table has no rows.
    """
    table = airports if airport_table is None else airport_table

    # Absolute values are taken of every coordinate, exactly as before.
    # NOTE(review): presumably because the airport file stores its
    # coordinates without a sign - confirm against the data source.
    ap_lat = np.abs(np.asarray(table["Latitude"], dtype=float))
    ap_lon = np.abs(np.asarray(table["Longitude"], dtype=float))
    lat, lon = abs(lat), abs(lon)

    # Haversine term for all airports at once
    a = (0.5 - np.cos(np.radians(ap_lat - lat)) / 2
         + np.cos(np.radians(lat)) * np.cos(np.radians(ap_lat))
         * (1 - np.cos(np.radians(ap_lon - lon))) / 2)

    # Round-off can push the term marginally outside [0, 1]; clip keeps
    # sqrt/arcsin inside their domain and lets NaN pass through
    a = np.clip(a, 0.0, 1.0)

    # 12742 km = Earth diameter (2 * 6371 km)
    return float(np.nanmin(12742 * np.arcsin(np.sqrt(a))))

In [9]:
# Get latitudes and longitudes of airports mapped in the USA
# Source: https://opendata.socrata.com/dataset/Airport-Codes-mapped-to-Latitude-Longitude-in-the-/rxrh-4cxm
path = "Data/Airport_Codes_Coords_USA.csv"

# `airports` is read as a notebook-level variable by min_distance
airports = pd.read_csv(path)
airports.head()

Unnamed: 0,locationID,Latitude,Longitude
0,ADK,51.8781,176.6461
1,AKK,56.9386,154.1825
2,Z13,60.9047,161.4225
3,AKI,60.9028,161.2306
4,AUK,62.68,164.66


In [10]:
print(f"Number of airports mapped in USA: {airports.shape[0]}")

Number of airports mapped in USA: 13429


In [11]:
# For each hotel coordinate, calculate the distance to the nearest airport.
# zip walks the two coordinate columns by position, so the loop gives one
# value per row, in row order, whatever index df4 carries.
airport_distance = []
for i, (lat, lon) in enumerate(zip(df4["latitude"], df4["longitude"])):
    airport_distance.append(min_distance(lat, lon))

    # The rows being processed are hotels, so the progress line says so
    print(f"Now processing {i}th hotel.\n-----")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

Now processing 1852th airport.
-----


In [12]:
# Add airport distance in df4
# (airport_distance was filled with one value per df4 row, in row order)
df4["airportDistance_km"] = airport_distance

# Preview the dataframe
df4.head()

Unnamed: 0,Name,categories,Street,City,province,latitude,longitude,State,Stories,stars,airportDistance_km
0,Rancho Valencia Resort Spa,"Hotels,Hotels and motels,Hotel and motel reser...",5921 Valencia Cir,Rancho Santa Fe,CA,32.990959,-117.186136,California,,4.0,14.308848
1,Aloft Arundel Mills,"Hotels,Hotels and motels,Travel agencies and b...",7520 Teague Rd,Hanover,MD,39.155929,-76.716341,Maryland,7.0,4.0,4.668332
2,Hampton Inn Suites PortlandVancouver,"Hotels,Hotels and motels,Hotel and motel reser...",315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,Washington,4.0,,6.5919
3,Hotel Phillips,"Hotels,Caterers,Hotels and motels,Hotel,Restau...",106 W 12th St,Kansas City,MO,39.100119,-94.584701,Missouri,20.0,4.0,2.670645
4,The Inn at Solvang,"Bed Breakfasts,Bed Breakfast,Hotels Motels,Hotel",10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,Pennsylvania,,,3.781817


### Hotel features

In [13]:
# Split the categories text of every hotel into word tokens
cats = [word_tokenize(text) for text in df4["categories"]]

# Lower-case each token, then keep the token lists in a new column
cats2 = [[token.lower() for token in tokens] for tokens in cats]
df4["categories2"] = cats2

In [14]:
df4.head()

Unnamed: 0,Name,categories,Street,City,province,latitude,longitude,State,Stories,stars,airportDistance_km,categories2
0,Rancho Valencia Resort Spa,"Hotels,Hotels and motels,Hotel and motel reser...",5921 Valencia Cir,Rancho Santa Fe,CA,32.990959,-117.186136,California,,4.0,14.308848,"[hotels, ,, hotels, and, motels, ,, hotel, and..."
1,Aloft Arundel Mills,"Hotels,Hotels and motels,Travel agencies and b...",7520 Teague Rd,Hanover,MD,39.155929,-76.716341,Maryland,7.0,4.0,4.668332,"[hotels, ,, hotels, and, motels, ,, travel, ag..."
2,Hampton Inn Suites PortlandVancouver,"Hotels,Hotels and motels,Hotel and motel reser...",315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,Washington,4.0,,6.5919,"[hotels, ,, hotels, and, motels, ,, hotel, and..."
3,Hotel Phillips,"Hotels,Caterers,Hotels and motels,Hotel,Restau...",106 W 12th St,Kansas City,MO,39.100119,-94.584701,Missouri,20.0,4.0,2.670645,"[hotels, ,, caterers, ,, hotels, and, motels, ..."
4,The Inn at Solvang,"Bed Breakfasts,Bed Breakfast,Hotels Motels,Hotel",10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,Pennsylvania,,,3.781817,"[bed, breakfasts, ,, bed, breakfast, ,, hotels..."


In [15]:
# Create a list of unique words to choose features from:
# first flatten the per-hotel token lists into one long list ...
word_list = [token for tokens in df4["categories2"] for token in tokens]

# ... then collapse it to the distinct terms (the order is arbitrary)
unique_terms = list(set(word_list))
unique_terms

['alternative',
 'caterers',
 'courses',
 'resort',
 'health',
 'angeles',
 'bull',
 'fairgrounds',
 'cottages',
 'guest',
 'hall',
 'harvard',
 'trade',
 'resorts',
 'valley',
 'lodges',
 'wedding',
 'night',
 'lounge',
 'pet',
 'dale',
 'perimeter',
 'fishing',
 'mgmt.',
 'moultonborough',
 'weekend',
 'fitness',
 'arrangers',
 'building',
 'garden',
 'swimming',
 'nightclub',
 'hotel',
 'lounges',
 'planning',
 'side',
 'consultants',
 'studio',
 'centers',
 'uptown-galleria',
 'services',
 'northwest',
 'stay',
 'attractions',
 'southside',
 'durham',
 'casino',
 'contractors',
 'lodge',
 'house',
 'hills',
 'pool',
 'fashion',
 '-',
 'marina',
 'homes',
 'family-friendly',
 'b',
 'north',
 'arbor',
 'los',
 '(',
 'parking',
 'chapels',
 'ak',
 'transport',
 'motels',
 'service',
 'nightlife',
 'day',
 'ranches',
 'breakfasts',
 'budget',
 'facilities',
 'rentals',
 'gyms',
 'chelsea',
 'dealers',
 'holding',
 'executive',
 'airports',
 'beer',
 'packages',
 'fuels',
 'campgrounds'

In [16]:
# Hotel categories/features hand-picked from the unique terms,
# kept in alphabetical order
features = sorted([
    'resort', 'movie', 'marina', 'harbor', 'reception',
    'convention', 'health', 'hall', 'extended', 'lodges',
    'chalets', 'cemetery', 'skiing', 'theater', 'campground',
    'entertainment', 'clinics', 'cabins', 'parties', 'lodge',
    'nightclub', 'services', 'airport', 'spas', 'hotel', 'pools',
    'attractions', 'meeting', 'utility', 'condominiums', 'cable',
    'office', 'village', 'fashion', 'loft', 'chapels', 'fairgrounds',
    'boutique', 'gym', 'motel', 'bars', 'e-commerce',
    'golf', 'apartment', 'medical', 'pubs', 'cottages', 'pet', 'lakeview',
    'restaurant', 'wedding', 'fitness', 'recreation', 'receptions',
    'reservations', 'casino', 'family-friendly', 'breakfast',
    'beach', 'karaoke',
])

# One new column per feature, holding missing values until it is encoded
for feature_name in features:
    df4[feature_name] = np.nan

In [17]:
# Create a function that fills in 1s and 0s for selected categories
def Cat_encoding(category):
    """Fill the indicator column of one category in the notebook-level df4.

    The column named ``str(category)`` is set to 1.0 for every hotel whose
    ``categories2`` token list contains ``category`` and to 0.0 otherwise
    (the same 1.0 / 0.0 values that the notebook's recorded output shows).

    The whole column is written with a single assignment. The previous
    row-by-row ``df4[col][i] = ...`` form is chained indexing, which pandas
    does not guarantee to write through to df4, and it also relied on df4
    having a 0..n-1 index.

    Parameters
    ----------
    category : str
        Token to look for; also the name of the column that is written.

    Returns
    -------
    None
    """
    has_category = df4["categories2"].map(lambda tokens: category in tokens)
    df4[str(category)] = has_category.astype(float)

    # Progress is reported once per category instead of once per row; the
    # message is the one the row loop ended on, so the notebook output left
    # on screen is unchanged. Nothing is printed for an empty dataframe.
    if len(df4) > 0:
        print(f"Now processing {len(df4) - 1}th hotel for {category}.\n-----")
        clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

In [18]:
# Encode the features one at a time so that every feature column gets filled
for feature in features:
    Cat_encoding(feature)

Now processing 1852th hotel for wedding.
-----


In [19]:
# Distinct provinces found in df4, in alphabetical order
prov_list = sorted(set(df4["province"]))

# One new column per province, holding missing values until it is encoded
for prov_code in prov_list:
    df4[prov_code] = np.nan

In [20]:
# Create a function that fills in 1s and 0s for each province
def Prov_encoding(province):
    """Fill the indicator column of one province in the notebook-level df4.

    The column named ``str(province)`` is set to 1.0 for every hotel whose
    ``province`` value equals ``province`` and to 0.0 otherwise (the same
    1.0 / 0.0 values that the notebook's recorded output shows).

    Two things differ from the earlier row-by-row version:

    * the comparison is an equality test; ``province in <string>`` is a
      substring test, which is only equivalent while no code is contained
      in another one, and it raises on non-string values;
    * the whole column is written with a single assignment rather than the
      chained ``df4[col][i] = ...`` form, which pandas does not guarantee
      to write through to df4.

    Parameters
    ----------
    province : str
        Province code to encode; also the name of the column that is written.

    Returns
    -------
    None
    """
    is_province = df4["province"] == province
    df4[str(province)] = is_province.astype(float)

    # Progress is reported once per province instead of once per row; the
    # message is the one the row loop ended on, so the notebook output left
    # on screen is unchanged. Nothing is printed for an empty dataframe.
    if len(df4) > 0:
        print(f"Now processing {len(df4) - 1}th hotel for {province}.\n-----")
        clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

In [21]:
# Encode the provinces one at a time so that every province column gets filled
for prov_code in prov_list:
    Prov_encoding(prov_code)

Now processing 1852th hotel for WY.
-----


In [22]:
df4.head()

Unnamed: 0,Name,categories,Street,City,province,latitude,longitude,State,Stories,stars,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,Rancho Valencia Resort Spa,"Hotels,Hotels and motels,Hotel and motel reser...",5921 Valencia Cir,Rancho Santa Fe,CA,32.990959,-117.186136,California,,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aloft Arundel Mills,"Hotels,Hotels and motels,Travel agencies and b...",7520 Teague Rd,Hanover,MD,39.155929,-76.716341,Maryland,7.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hampton Inn Suites PortlandVancouver,"Hotels,Hotels and motels,Hotel and motel reser...",315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,Washington,4.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Hotel Phillips,"Hotels,Caterers,Hotels and motels,Hotel,Restau...",106 W 12th St,Kansas City,MO,39.100119,-94.584701,Missouri,20.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,The Inn at Solvang,"Bed Breakfasts,Bed Breakfast,Hotels Motels,Hotel",10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,Pennsylvania,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load the df4 dataframe with new information into a new database table 

In [23]:
# Collect the column labels of df4 in a plain Python list
columns = df4.columns.tolist()

In [24]:
# Create a function that converts the values into strings
def to_str(column_name):
    """Replace one column of the notebook-level df4 with its string form.

    The column ``column_name`` is converted with ``astype(str)`` and written
    back to df4 under the same name; nothing is returned.
    """
    df4[column_name] = df4[column_name].astype(str)

In [25]:
# Convert the values of every listed column into strings, one column at a time
for column_label in columns:
    to_str(column_label)

In [27]:
# Remove the raw and the tokenised category columns from df4
df4 = df4.drop(["categories", "categories2"], axis = 1)
df4.head()

Unnamed: 0,Name,Street,City,province,latitude,longitude,State,Stories,stars,airportDistance_km,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,Rancho Valencia Resort Spa,5921 Valencia Cir,Rancho Santa Fe,CA,32.990959000000004,-117.186136,California,,4.0,14.30884805537358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aloft Arundel Mills,7520 Teague Rd,Hanover,MD,39.155929,-76.716341,Maryland,7.0,4.0,4.668331572785505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hampton Inn Suites PortlandVancouver,315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,Washington,4.0,,6.591900084053486,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Hotel Phillips,106 W 12th St,Kansas City,MO,39.100119,-94.584701,Missouri,20.0,4.0,2.6706451419692976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,The Inn at Solvang,10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,Pennsylvania,,,3.781816947263244,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Create a connection to the database
# (it is not closed in this cell: the following cells query through it and
# then close it)
conn = sqlite3.connect("Data/Hotels.db")

# Save the dataframe as a sqlite database table
# if_exists = "replace": an existing metadata2 table is replaced
# index = False: the dataframe index is not written as a column
df4.to_sql("metadata2", conn, if_exists = "replace", index = False)

In [32]:
# Preview metadata
pd.read_sql_query("select * from metadata2 limit 3;", conn)

Unnamed: 0,Name,Street,City,province,latitude,longitude,State,Stories,stars,airportDistance_km,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,Rancho Valencia Resort Spa,5921 Valencia Cir,Rancho Santa Fe,CA,32.990959000000004,-117.186136,California,,4.0,14.30884805537358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aloft Arundel Mills,7520 Teague Rd,Hanover,MD,39.155929,-76.716341,Maryland,7.0,4.0,4.668331572785505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hampton Inn Suites PortlandVancouver,315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,Washington,4.0,,6.591900084053486,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
# Close the connection
conn.close()