# Loading more information about the hotels

## Extract data from the metadata table in the sqlite database

In [None]:
# Dependencies
import pandas as pd
import numpy as np
import sqlite3
from IPython.core.display import clear_output

In [None]:
# Create a connection to the database
conn = sqlite3.connect("Data/Hotels.db")

# Load the database table into a pandas dataframe.
# try/finally guarantees the connection is released even when the query
# fails (e.g. the table is missing), instead of leaking an open handle.
try:
    metadata = pd.read_sql_query("select * from metadata;", conn)
finally:
    conn.close()

# Preview the dataframe
metadata.head()

## Add hotel characteristics

### Distance from airport

In [None]:
# Airport coordinates (latitude / longitude) for airports mapped in the USA
# Source: https://opendata.socrata.com/dataset/Airport-Codes-mapped-to-Latitude-Longitude-in-the-/rxrh-4cxm
path = "Data/Airport_Codes_Coords_USA.csv"
airports = pd.read_csv(filepath_or_buffer=path)

# Preview the first rows of the airport table
airports.head()

In [None]:
# Report how many rows (one per airport) were loaded from the CSV
print(f"Number of airports mapped in USA: {airports.shape[0]}")

In [None]:
# Haversine formula to calculate distance
# Source1: https://stackoverflow.com/a/41337005
# Source2: https://stackoverflow.com/a/21623206
from math import cos, asin, sqrt

def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance in km between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance in kilometres, based on a mean Earth radius of 6371 km.
        NaN inputs propagate to a NaN result.
    """
    p = 0.017453292519943295 # pi/180 (degrees -> radians)
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise
    # "math domain error". Plain comparisons are used (rather than min/max)
    # so that a NaN value is left untouched and still propagates.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    return 12742 * asin(sqrt(a)) # Earth diameter: 12742 = 2 * R; R = 6371km (mean radius of the earth)

# Find the minimum distance between the hotel and the closest airport
def min_distance(lat, lon):
    """Return the distance in km from (lat, lon) to the nearest airport.

    Reads the module-level `airports` dataframe ("Latitude" and "Longitude"
    columns) and evaluates `distance` against every row.

    NOTE(review): the absolute value of every coordinate is taken before
    measuring, presumably because the airport file and the hotel table use
    different sign conventions for longitude -- confirm against the data.
    As a consequence, points that differ only by hemisphere are treated as
    the same location.

    Parameters
    ----------
    lat, lon : float
        Hotel latitude and longitude, in decimal degrees.

    Returns
    -------
    float
        Smallest value returned by `distance` over all airports.

    Raises
    ------
    ValueError
        If `airports` has no rows (min of an empty sequence).
    """
    hotel_lat, hotel_lon = abs(lat), abs(lon)
    # Walk the column values positionally instead of `airports[col][i]`:
    # that form is a label lookup per row, which is slow and fails whenever
    # the dataframe index is not exactly 0..n-1 (e.g. after filtering).
    return min(
        distance(hotel_lat, hotel_lon, abs(airport_lat), abs(airport_lon))
        for airport_lat, airport_lon in zip(airports["Latitude"], airports["Longitude"])
    )

In [None]:
# For each hotel coordinate, calculate the distance to the nearest airport
airport_distance = []
# Iterate over the coordinate values positionally (rather than looking up
# metadata["latitude"][i] by label) so the loop does not depend on the
# dataframe having a default 0..n-1 index.
for i, (lat, lon) in enumerate(zip(metadata["latitude"], metadata["longitude"])):
    dist = min_distance(lat, lon)
    airport_distance.append(dist)

    # `i` counts hotels (rows of `metadata`), so the message says "hotel"
    print(f"Now processing {i}th hotel.\n-----")
    clear_output(wait = True) # to replace output with new one (instead of printing many outputs)

In [None]:
# Add airport distance in metadata.
# `airport_distance` was built by walking the rows of `metadata` in order,
# so its values line up with the dataframe rows one-to-one.
metadata["airportDistance_km"] = airport_distance

# Preview the dataframe
metadata.head()

### Hotel features

In [None]:
# Category labels; each one becomes its own column on `metadata`.
# sorted() returns them in alphabetical order.
new_col = sorted([
    "Motel", "Cottage", "Cabin", "Hotel", "Caterer", "Resort", "Restaurant", "Bed", "Spa", "Banquet",
    "Concierge Service", "Golf Course", "Cable Internet", "Pool", "Water Parks", "Family-Friendly", "Casino",
    "Beach", "Luxury Hotels", "Business Hotels", "Conference Room", "Event Space", "Convention",
    "Boutique Hotels", "Clinics", "Inn", "Concert Hall", "E-Commerce", "Extended Stay", "Fairgrounds",
    "Harbor", "Marina", "Lounge", "Medical", "Movie", "Pet Friendly", "Ski", "Timeshare", "Yacht clubs",
    "Apartment",
])

# Start every category column out as missing (NaN); the values are filled in
# by the encoding step further down.
for col in new_col:
    metadata[col] = np.nan

In [None]:
# Create a function that fills in 1s and 0s for selected categories
def Cat_encoding(category):
    """Flag which hotels mention `category` in their primary categories.

    Writes a column named ``str(category)`` on the module-level `metadata`
    dataframe: 1 where the text occurs (case-sensitive, literal substring
    match) in that row's "primaryCategories" value, 0 otherwise. Rows whose
    "primaryCategories" value is missing are given 0.

    NOTE(review): assumes "primaryCategories" holds text values -- confirm
    against the database schema.

    Parameters
    ----------
    category : str
        Text to search for; also the name of the column that is written.

    Returns
    -------
    None
        `metadata` is modified in place.
    """
    column = str(category)
    # One vectorised pass plus a single whole-column assignment. The
    # element-wise form `metadata[column][i] = value` is chained assignment:
    # pandas warns about it and, with Copy-on-Write enabled, it does not
    # write back to the dataframe at all.
    # regex=False keeps this a literal substring test, like the `in` operator.
    found = metadata["primaryCategories"].str.contains(column, regex=False, na=False)
    # Store as float (0.0 / 1.0), matching the dtype of columns that were
    # pre-created as NaN before being filled.
    metadata[column] = found.astype(float)

In [None]:
# Call Cat_encoding once per category in new_col to fill the category columns
for x in new_col:
    Cat_encoding(x) 

In [None]:
# Preview the dataframe with the category columns filled in
metadata.head()