# Regression Analyses

In [1]:
# Dependencies
import pandas as pd
import sqlite3

In [2]:
# Connect to a sqlite database
conn = sqlite3.connect("Data/Hotels.db")

# Get the data from alldata table
alldata = pd.read_sql_query("select * from alldata;", conn)
conn.close()

# Preview the dataframe
alldata.head()

Unnamed: 0,Name,reviews_rating,airportDistance_km,airport,apartment,attractions,bars,beach,boutique,breakfast,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Rancho Valencia Resort Spa,5.0,14.30884805537358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,Rancho Valencia Resort Spa,5.0,14.30884805537358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
2,Rancho Valencia Resort Spa,5.0,14.30884805537358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3,Aloft Arundel Mills,2.0,4.668331572785505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,Aloft Arundel Mills,5.0,4.668331572785505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Convert hotel features and locations from string to float
col_headers = list(alldata.columns)[2:]

alldata[col_headers] = alldata[col_headers].apply(pd.to_numeric, errors = "coerce", axis = 1)

# Convert ratings, hotel features, locations, years from float to int
col_headers2 = list(alldata.columns)[1:]

alldata[col_headers2] = alldata[col_headers2].astype(int)

In [4]:
df = alldata.groupby("Name").median()

In [5]:
df.head()

Unnamed: 0_level_0,reviews_rating,airportDistance_km,airport,apartment,attractions,bars,beach,boutique,breakfast,cabins,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1906 Lodge At Coronado Beach,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
250 Main Hotel,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
AC Hotel Chicago Downtown,4.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
AC Hotel Miami Beach,5.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
AC Hotel by Marriott Boston Downtown,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Scale the data

In [6]:
# Dependencies
import mord
from sklearn.preprocessing import StandardScaler

In [7]:
# Define the response (y) and the explanatory (X) variables
X = df.drop(columns = ["reviews_rating"], axis = 1)
y = df["reviews_rating"]

# Standardise the explanatory variables
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)

# Convert review rating into whole number
y = y.astype(int)

print(X_standard.shape, y.shape)

(1670, 124) (1670,)


## Random Forest

In [8]:
# Dependencies
from sklearn.ensemble import RandomForestRegressor

In [9]:
# Create a random forest regressor (for continuous explanatory variables, like scaled values)
rf = RandomForestRegressor(n_estimators = 500)
rf_reg = rf.fit(X_standard, y)

# Get the coefficient of determination (R^2) of the random forest prediction
rf_reg.score(X_standard, y)

0.7609482640290511

In [10]:
# Variable importance
impt = rf_reg.feature_importances_
impt_var = sorted(zip(impt, list(X.columns)), reverse = True)[0:10]
impt_var

[(0.2886009928895506, 'airportDistance_km'),
 (0.038692331337632516, 'reservations'),
 (0.0369509053205664, '2016'),
 (0.029404370342577285, '2014'),
 (0.028614491238164782, 'CA'),
 (0.025616321303478275, '2015'),
 (0.021399453209301332, 'family-friendly'),
 (0.020204840798728984, '2013'),
 (0.020066079459542908, 'FL'),
 (0.018432942737231048, 'TX')]

## Split the data into training and test sets

In [11]:
# Dependencies
from sklearn.model_selection import train_test_split

In [12]:
# Create a list of the top 10 important variables
top10 = [var[1] for var in impt_var]

In [13]:
# Create a list of the top 10 important variables
top10 = [var[1] for var in impt_var]

# Choose variables with high importance
X2 = X[top10]

X2_standard = scaler.fit_transform(X2)

print(X2_standard.shape, y.shape)

(1670, 10) (1670,)


In [14]:
# Divide the data into training and testing data
Xstd_train, Xstd_test, y_train, y_test = train_test_split(X2_standard, y, random_state = 42, stratify = y)

print(Xstd_train.shape, Xstd_test.shape)

(1252, 10) (418, 10)


## Regression Models

In [15]:
models = ["multinomial logistic regression", "ridge regression", "ordinal logistic regression (IT)", 
          "ordinal logistic regression (AT)", "lasso"]

mean_abs_error = []
mean_acc_test = []
mean_acc_train = []
R2 = []

In [16]:
# Dependencies
import mord
from sklearn.linear_model import LogisticRegression, RidgeCV, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### Model 1: Multinomial logistic regression

In [17]:
# Create a multinomial logistic regression model
classifier1 = LogisticRegression(solver = "lbfgs", multi_class = "multinomial")

# Fit the model to the data
classifier1.fit(Xstd_train, y_train)

# Make predictions
pred = classifier1.predict(Xstd_test)

# Evaluate the model
score_logistic = mean_absolute_error(y_test, pred)
mean_abs_error.append(score_logistic)

# Mean accuracy scores
mean_acc_test.append(classifier1.score(Xstd_test, y_test))
mean_acc_train.append(classifier1.score(Xstd_train, y_train))

# Coefficient of determination (R^2)
r2 = r2_score(y_test, pred)
R2.append(r2)

### Model 2: Ridge regression

In [18]:
# Create a ridge regression with three possible alphas
regression_cv = RidgeCV(alphas = [0.1, 1.0, 10.0])

# Fit the ridge regression model
model_cv = regression_cv.fit(Xstd_train, y_train)

# View the best alpha value
model_cv.alpha_

10.0

In [19]:
# Use the best alpha value in the ridge regression model
classifier2 = Ridge(alpha = 10.0)

# Fit the model to the data
classifier2.fit(Xstd_train, y_train)

# Make predictions
pred2 = classifier2.predict(Xstd_test)

# Evaluate the model
score_ridge = mean_absolute_error(y_test, pred2)
mean_abs_error.append(score_ridge)

# Mean accuracy scores
mean_acc_test.append(classifier2.score(Xstd_test, y_test))
mean_acc_train.append(classifier2.score(Xstd_train, y_train))

# Coefficient of determination (R^2)
r2 = r2_score(y_test, pred2)
R2.append(r2)

### Model 3: Ordinal logistic regression

In [20]:
# Create an ordinal logistic regression model (immediate-threshold)
classifier3 = mord.LogisticIT(alpha = 1.0)

# Fit the model to the data
classifier3.fit(Xstd_train, y_train)

# Make predictions
pred3 = classifier3.predict(Xstd_test)

# Evaluate the model
score_logisticIT = mean_absolute_error(y_test, pred3)
mean_abs_error.append(score_logisticIT)

# Mean accuracy scores
mean_acc_test.append(classifier3.score(Xstd_test, y_test))
mean_acc_train.append(classifier3.score(Xstd_train, y_train))

# Coefficient of determination (R^2)
r2 = r2_score(y_test, pred3)
R2.append(r2)

### Model 4: Ordinal logistic regression

In [21]:
# Create an ordinal logistic regression model (all-threshold)
classifier4 = mord.LogisticAT(alpha = 1.0)

# Fit the model to the data
classifier4.fit(Xstd_train, y_train)

# Make predictions
pred4 = classifier4.predict(Xstd_test)

# Evaluate the model
score_logisticAT = mean_absolute_error(y_test, pred4)
mean_abs_error.append(score_logisticAT)

# Mean accuracy scores
mean_acc_test.append(classifier4.score(Xstd_test, y_test))
mean_acc_train.append(classifier4.score(Xstd_train, y_train))

# Coefficient of determination (R^2)
r2 = r2_score(y_test, pred4)
R2.append(r2)

### Model 5: Lasso Regression

In [22]:
# Use the best alpha value in the lasso regression model
classifier5 = Lasso(alpha = 1.0)

# Fit the model to the data
classifier5.fit(Xstd_train, y_train)

# Make predictions
pred5 = classifier2.predict(Xstd_test)

# Evaluate the model
score_lasso = mean_absolute_error(y_test, pred5)
mean_abs_error.append(score_lasso)

# Mean accuracy scores
mean_acc_test.append(classifier5.score(Xstd_test, y_test))
mean_acc_train.append(classifier5.score(Xstd_train, y_train))

# Coefficient of determination (R^2)
r2 = r2_score(y_test, pred5)
R2.append(r2)

In [23]:
# Model accuracy
df_acc = pd.DataFrame({"Model": models, "MAE": mean_abs_error, "Train Mean Accuracy": mean_acc_train, 
                       "Test Mean Accuracy": mean_acc_test, "R^2": R2})

df_acc.head()

Unnamed: 0,Model,MAE,Train Mean Accuracy,Test Mean Accuracy,R^2
0,multinomial logistic regression,0.947368,0.412939,0.380383,-0.500998
1,ridge regression,0.85113,0.066106,0.035554,0.035554
2,ordinal logistic regression (IT),0.916268,0.403355,0.394737,-0.428549
3,ordinal logistic regression (AT),0.837321,-0.806709,-0.837321,-0.055157
4,lasso,0.85113,0.0,-1.7e-05,0.035554


In [24]:
# Model predictions
df_pred = pd.DataFrame({"Actual": y_test, "MLR": pred, "RidgeReg": pred2, "OLR_AT": pred3, 
                        "OLR_IT": pred4, "Lasso": pred5})

df_pred.head()

Unnamed: 0_level_0,Actual,MLR,RidgeReg,OLR_AT,OLR_IT,Lasso
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Best Western Orange Plaza,4,5,3.98578,5,4,3.98578
"The Roosevelt New Orleans, A Waldorf Astoria Hotel",4,4,3.978756,5,4,3.978756
America's Best Value Inn,1,5,3.916102,5,4,3.916102
Stonebrook Lodge,4,5,4.275194,5,5,4.275194
Hampton Inn Suites West Des Moines/SW Mall Area,5,5,4.248342,5,5,4.248342
