# Regression Analyses

In [1]:
# Dependencies
import pandas as pd
import sqlite3

In [2]:
# Connect to a sqlite database
conn = sqlite3.connect("Data/Hotels.db")

# Get the data from alldata table
alldata = pd.read_sql_query("select * from alldata;", conn)
conn.close()

# Preview the dataframe
alldata.head()

Unnamed: 0,reviews_date,reviews_rating,year,province,Stories,stars,airportDistance_km,airport,apartment,attractions,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,2016-05-15 00:00:00,2,2016,MD,7,4,4.668332,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2016-07-09 00:00:00,5,2016,MD,7,4,4.668332,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2016-06-11 00:00:00,5,2016,MD,7,4,4.668332,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2016-04-30 00:00:00,5,2016,MD,7,4,4.668332,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2016-06-24 00:00:00,5,2016,MD,7,4,4.668332,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Convert airport distance to int
alldata["airportDistance_km"] = alldata["airportDistance_km"].astype(int)
alldata["year"] = alldata["year"].astype(int)
alldata.dtypes

reviews_date          object
reviews_rating         int64
year                   int64
province              object
Stories                int64
stars                  int64
airportDistance_km     int64
airport                int64
apartment              int64
attractions            int64
bars                   int64
beach                  int64
boutique               int64
breakfast              int64
cabins                 int64
cable                  int64
campground             int64
casino                 int64
cemetery               int64
chalets                int64
chapels                int64
clinics                int64
condominiums           int64
convention             int64
cottages               int64
e-commerce             int64
entertainment          int64
extended               int64
fairgrounds            int64
family-friendly        int64
                       ...  
MA                     int64
MD                     int64
ME                     int64
MI            

In [4]:
alldata.head()

Unnamed: 0,reviews_date,reviews_rating,year,province,Stories,stars,airportDistance_km,airport,apartment,attractions,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,2016-05-15 00:00:00,2,2016,MD,7,4,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2016-07-09 00:00:00,5,2016,MD,7,4,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2016-06-11 00:00:00,5,2016,MD,7,4,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2016-04-30 00:00:00,5,2016,MD,7,4,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2016-06-24 00:00:00,5,2016,MD,7,4,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Logistic Regression Model

In [5]:
# Dependencies
import mord
from sklearn import linear_model, metrics, preprocessing, model_selection

In [6]:
# Define the response (y) and the explanatory (X) variables
X = alldata.drop(columns = ["reviews_date", "reviews_rating", "province"], axis = 1)
y = alldata["reviews_rating"]

# Standardise the explanatory variables
scaler = preprocessing.StandardScaler()
X_standard = scaler.fit_transform(X)

print(X.shape, y.shape)

(6572, 110) (6572,)


In [7]:
# Divide the data into training and testing data
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state = 42, stratify = y)

print(X_train.shape, X_test.shape)

(4929, 110) (1643, 110)


In [8]:
models = ["multinomial logistic regression", "ridge regression", 
          "ordinal logistic regression (IT)", "ordinal logistic regression (AT)"]

mean_abs_error = []

### Model 1: Multinomial logistic regression

In [9]:
# Create a multinomial logistic regression model
classifier1 = linear_model.LogisticRegression(
                solver = "lbfgs",
                multi_class = "multinomial")

# Fit the model to the data
classifier1.fit(X_train, y_train)

# Make predictions
pred = classifier1.predict(X_test)

# Evaluate the model
score_logistic = metrics.mean_absolute_error(y_test, pred)
mean_abs_error.append(score_logistic)

### Model 2: Ridge regression

In [10]:
# Create a ridge regression with three possible alphas
regression_cv = linear_model.RidgeCV(alphas = [0.1, 1.0, 10.0])

# Fit the ridge regression model
model_cv = regression_cv.fit(X_train, y_train)

# View the best alpha value
model_cv.alpha_

1.0

In [11]:
# Use the best alpha value in the ridge regression model
classifier2 = linear_model.Ridge(alpha = 1.0)

# Fit the model to the data
classifier2.fit(X_train, y_train)

# Make predictions
pred2 = classifier2.predict(X_test)

# Evaluate the model
score_ridge = metrics.mean_absolute_error(y_test, pred2)
mean_abs_error.append(score_ridge)

### Model 3: Ordinal logistic regression

In [12]:
# Create an ordinal logistic regression model (immediate-threshold)
classifier3 = mord.LogisticIT(alpha = 1.0)

# Fit the model to the data
classifier3.fit(X_train, y_train)

# Make predictions
pred3 = classifier3.predict(X_test)

# Evaluate the model
score_logisticIT = metrics.mean_absolute_error(y_test, pred3)
mean_abs_error.append(score_logisticIT)

### Model 4: Ordinal logistic regression

In [13]:
# Create an ordinal logistic regression model (all-threshold)
classifier4 = mord.LogisticAT(alpha = 1.0)

# Fit the model to the data
classifier4.fit(X_train, y_train)

# Make predictions
pred4 = classifier4.predict(X_test)

# Evaluate the model
score_logisticAT = metrics.mean_absolute_error(y_test, pred4)
mean_abs_error.append(score_logisticAT)

In [14]:
# Compare mean absolute errors
print(models)
print(mean_abs_error)

['multinomial logistic regression', 'ridge regression', 'ordinal logistic regression (IT)', 'ordinal logistic regression (AT)']
[1.1424223980523434, 0.9809335766653533, 1.0827754108338405, 0.9500912964090079]


In [15]:
models_comp = dict(zip(models, mean_abs_error))
pd.DataFrame(list(models_comp.items()), columns = ["models", "mean_abs_error"])

Unnamed: 0,models,mean_abs_error
0,multinomial logistic regression,1.142422
1,ridge regression,0.980934
2,ordinal logistic regression (IT),1.082775
3,ordinal logistic regression (AT),0.950091
