# What hotel features influenced reviews?

## Extract data

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import sqlite3

In [2]:
# Connect to the database.
# The file is opened through a URI with mode=rw so that a wrong path or
# working directory raises sqlite3.OperationalError right here. A plain
# sqlite3.connect("Data/Hotels.db") would instead silently create an empty
# database file and only fail later with "no such table".
conn = sqlite3.connect("file:Data/Hotels.db?mode=rw", uri = True)

In [3]:
# Read both tables in full over the open connection; unwanted columns are
# removed afterwards in pandas rather than in SQL.
meta = pd.read_sql_query(sql = "select * from metadata2;", con = conn)  # metadata2 table
rate = pd.read_sql_query(sql = "select * from ratings;", con = conn)    # ratings table

In [4]:
# Close the connection: the two tables have already been read into the
# `meta` and `rate` DataFrames, so the database handle can be released.
conn.close()

In [5]:
# Show the first five rows of the metadata table as the cell output
meta.head(n = 5)

Unnamed: 0,Name,Street,City,province,latitude,longitude,State,Stories,stars,airportDistance_km,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,Rancho Valencia Resort Spa,5921 Valencia Cir,Rancho Santa Fe,CA,32.990959000000004,-117.186136,California,,4.0,14.30884805537358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aloft Arundel Mills,7520 Teague Rd,Hanover,MD,39.155929,-76.716341,Maryland,7.0,4.0,4.668331572785505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hampton Inn Suites PortlandVancouver,315 SE Olympia Dr,Vancouver,WA,45.619212,-122.525196,Washington,4.0,,6.591900084053486,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Hotel Phillips,106 W 12th St,Kansas City,MO,39.100119,-94.584701,Missouri,20.0,4.0,2.6706451419692976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,The Inn at Solvang,10611 Standing Stone Rd,Huntingdon,PA,40.527478,-77.969763,Pennsylvania,,,3.781816947263244,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Tidy the ratings table in a single pass:
#   1. discard the index, reviewer-location, free-text and URL columns,
#   2. rename "name" to "Name" so it matches the key used by the metadata table,
#   3. parse the review date strings into datetime values.
rate = (
    rate
    .drop(columns = ["index", "reviews_userCity", "reviews_userProvince",
                     "reviews_text", "reviews_title", "reviews_sourceURLs"])
    .rename(columns = {"name": "Name"})
    .assign(reviews_date = lambda frame: pd.to_datetime(frame["reviews_date"]))
)

# Preview ratings table
rate.head()

Unnamed: 0,Name,reviews_date,reviews_rating
0,Rancho Valencia Resort Spa,2013-11-14,5.0
1,Rancho Valencia Resort Spa,2014-07-06,5.0
2,Rancho Valencia Resort Spa,2015-01-02,5.0
3,Aloft Arundel Mills,2016-05-15,2.0
4,Aloft Arundel Mills,2016-07-09,5.0


## Transform data
Merge the ratings and metadata tables and remove incomplete rows and unused columns, in preparation for the correlation analyses.

In [8]:
# Merge the two dataframes (pd.merge defaults to an inner join, so only
# names present in both tables are kept)
df = pd.merge(rate, meta, on = ["Name"])

df = (
    df
    # Remove columns that have been converted to dummy variables, not needed
    # in analyses. This is done BEFORE dropping nulls so that a missing value
    # in one of these unused columns does not discard an otherwise complete
    # review.
    .drop(columns = ["Name", "Street", "City", "province",
                     "latitude", "longitude", "State"])
    # Convert the string "nan" to np.nan so dropna() recognises it as missing
    .replace("nan", np.nan)
    # Remove the rows with null values in any of the remaining columns
    .dropna()
)

# Preview the data
df.head()

Unnamed: 0,reviews_date,reviews_rating,Stories,stars,airportDistance_km,airport,apartment,attractions,bars,beach,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
3,2016-05-15,2.0,7,4.0,4.668331572785505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2016-07-09,5.0,7,4.0,4.668331572785505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2016-06-11,5.0,7,4.0,4.668331572785505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2016-04-30,5.0,7,4.0,4.668331572785505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2016-06-24,5.0,7,4.0,4.668331572785505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Visualisation analysis