In [1]:
import pandas as pd
import os
import csv
import numpy as np

Pre-processing:

https://www.kaggle.com/datasets/vijeetnigam26/expedia-hotel

In [2]:
# test contains real data, train contains train data

In [3]:
# Load the training split from train.csv in the current working directory.
# The handle is opened in a `with` block so it is closed as soon as pandas
# has finished parsing, instead of staying open for the life of the kernel.
with open(os.path.join(os.getcwd(), "train.csv"), "r") as g:
    train = pd.read_csv(g)
# train.head()

In [4]:
# Load the test split from test.csv in the current working directory.
# The handle is opened in a `with` block so it is closed as soon as pandas
# has finished parsing, instead of staying open for the life of the kernel.
with open(os.path.join(os.getcwd(), "test.csv"), "r") as f:
    test = pd.read_csv(f)
# test.head()

Dealing with missing values; dropping all rows that do not contain our target variable - comp1_rate:

In [5]:
# Keep independent copies of both frames as loaded; `train` and `test`
# themselves are modified by the cells that follow.
train_original, test_original = train.copy(), test.copy()

In [6]:
# Remove, in place, every row whose target `comp1_rate` is missing.
for frame in (test, train):
    frame.dropna(subset=["comp1_rate"], inplace=True)

Target Variable:
    comp1_rate
        - does Expedia offer lower rates than its competitors
        
Explanatory Variables:
    srch_saturday_night_bool
             - +1 if the stay includes a Saturday night, starts from Thursday, and has a length of stay less than or equal to 4 nights (i.e. a weekend stay); otherwise 0
         
    prop_location_score1:
        - A (first) score outlining the desirability of a hotel’s location
        
    prop_brand_bool:
        - +1 if the hotel is part of a major hotel chain; 0 if it is an independent hotel   
        
        
    
    

In [7]:
# train[["comp1_rate", "srch_saturday_night_bool", "prop_location_score1", "prop_brand_bool"]]

In [10]:
# Number of training rows per distinct value of the target column.
train["comp1_rate"].value_counts()

 1.0    132077
 0.0     84558
-1.0     19171
Name: comp1_rate, dtype: int64

In [13]:
# Number of test rows per distinct value of the target column.
test["comp1_rate"].value_counts()

 1.0    88654
 0.0    54011
-1.0    12399
Name: comp1_rate, dtype: int64

Transforming the "comp1_rate" column into a binary classification of whether Expedia offers a better price or not:

In [14]:
def adjustRate(row):
    """Collapse the `comp1_rate` value of one row into a binary flag.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row["comp1_rate"]``.

    Returns
    -------
    int
        1 when ``row["comp1_rate"]`` equals 1.0; 0 for every other value.
    """
    return 1 if row["comp1_rate"] == 1.0 else 0

In [16]:
# Recode the target to binary: 1 where comp1_rate == 1.0, otherwise 0.
# The column-wise comparison yields the same int64 values as calling
# adjustRate on every row, without one Python-level call per row.
train["comp1_rate"] = (train["comp1_rate"] == 1.0).astype("int64")
test["comp1_rate"] = (test["comp1_rate"] == 1.0).astype("int64")

In [17]:
# Class counts of the training target after the binary recoding.
train["comp1_rate"].value_counts()

1    132077
0    103729
Name: comp1_rate, dtype: int64

In [18]:
# Class counts of the test target after the binary recoding.
test["comp1_rate"].value_counts()

1    88654
0    66410
Name: comp1_rate, dtype: int64

Creating a new column with binary classification as to whether a location is considered "good":

In [19]:
def aboveOrBelowTrain(row, threshold=None):
    """Flag whether a row's location score is at or above a threshold.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row["prop_location_score1"]``.
    threshold : float, optional
        Cut-off to compare against. When omitted, the median of
        ``train["prop_location_score1"]`` is computed on each call. When
        applying this function row by row, pass a precomputed median
        (``train.apply(aboveOrBelowTrain, axis=1, threshold=m)``) so the
        median of the whole column is not recomputed for every row.

    Returns
    -------
    int
        1 if the score is >= the threshold, otherwise 0 (a NaN score gives 0).
    """
    if threshold is None:
        threshold = train["prop_location_score1"].median()
    return 1 if row["prop_location_score1"] >= threshold else 0

In [20]:
def aboveOrBelowTest(row, threshold=None):
    """Flag whether a row's location score is at or above a threshold.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row["prop_location_score1"]``.
    threshold : float, optional
        Cut-off to compare against. When omitted, the median of
        ``test["prop_location_score1"]`` is computed on each call. Pass an
        explicit value to avoid recomputing the column median for every row,
        or to threshold test rows with a cut-off derived from other data
        (for example the training median).

    Returns
    -------
    int
        1 if the score is >= the threshold, otherwise 0 (a NaN score gives 0).
    """
    if threshold is None:
        threshold = test["prop_location_score1"].median()
    return 1 if row["prop_location_score1"] >= threshold else 0

In [21]:
# 1 where the location score is at or above the training median, else 0.
# The median is computed once and the whole column is compared at once;
# a row-wise apply would recompute the column median for every single row.
location_median = train["prop_location_score1"].median()
train["isGoodLocation"] = (train["prop_location_score1"] >= location_median).astype("int64")

In [22]:
# Threshold the test scores with the *training* median so isGoodLocation
# has the same definition in both frames; a cut-off taken from the test set
# itself would give the feature a different meaning at prediction time.
# The median is computed once and the comparison is vectorized.
test["isGoodLocation"] = (
    test["prop_location_score1"] >= train["prop_location_score1"].median()
).astype("int64")

Model Building - Logistic Regression:

In [23]:
# Assemble the design matrices and label vectors as NumPy arrays.
predictors = ["srch_saturday_night_bool", "prop_location_score1", "prop_brand_bool", "isGoodLocation"]
X_train = train[predictors].values
X_test = test[predictors].values
y_train = train['comp1_rate'].values
# Labels of the test frame, so predictions on X_test can be scored.
y_test = test['comp1_rate'].values
# Only the last expression of a cell is echoed; a bare `X_train[:5]` placed
# before it would display nothing, so only the label preview is kept.
y_train[:5]

array([0, 1, 1, 0, 1])

In [63]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with every hyperparameter left at its default.
model = LogisticRegression()

In [64]:
# Fit on the training matrix/labels; fit() returns the estimator, which is
# what the cell displays.
model.fit(X_train, y_train)

LogisticRegression()

In [66]:
# Predicted class label for each row of the test design matrix.
y_predict = model.predict(X_test)

In [67]:
# Preview the first ten predicted labels.
y_predict[:10]

array([0., 0., 1., 0., 0., 0., 1., 1., 1., 1.])

In [69]:
# Only the last expression of a cell is echoed, so the intercept is what is
# displayed here; `model.coef_` on its own line produces no output.
# NOTE(review): the saved outputs show three intercepts and a 3-row coef_,
# i.e. a three-class fit, although comp1_rate is recoded to 0/1 before
# fitting -- the saved outputs look stale; re-run from a fresh kernel.
model.coef_
model.intercept_

array([-0.88207346,  0.43931554,  0.44275793])

In [73]:
# Exponentiated coefficients: the multiplicative change in the odds for a
# one-unit increase in each predictor.
np.exp(model.coef_)


array([[1.01619323, 0.99964727, 0.68480751, 0.85126533],
       [1.00836223, 0.92716642, 1.8643696 , 1.00080368],
       [0.97590408, 1.0789356 , 0.78324832, 1.17377854]])

In [74]:
# Column order of X_train / X_test, for reading the coefficient arrays.
predictors

['srch_saturday_night_bool',
 'prop_location_score1',
 'prop_brand_bool',
 'isGoodLocation']

Cross-validation:
* cv = number of folds
* Cs = a list of floats, or an int, giving the inverse of the regularization strength; smaller values specify stronger regularization
* scoring = a string or a scorer (see sklearn.metrics); the default scoring option is ‘accuracy’

In [75]:
from sklearn.linear_model import LogisticRegressionCV

# Candidate inverse-regularization strengths: 10**-4, 10**-3, ..., 10**3.
exponents = np.arange(-4, 4)
listC = list(np.power(10.0, exponents))

# 10-fold cross-validated logistic regression scored by accuracy.
model_cv = LogisticRegressionCV(
    Cs=listC,
    cv=10,
    scoring="accuracy",
)
model_cv.fit(X_train, y_train)

LogisticRegressionCV(Cs=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                     cv=10, scoring='accuracy')

In [76]:
# Not echoed: only the last expression of the cell is displayed.
# NOTE(review): per the scikit-learn docs, scores_ is a dict keyed by class
# label whose values hold one score per fold and per candidate C -- confirm
# against the installed version.
model_cv.scores_

# Mean over the whole array stored under class 1, i.e. averaged across all
# folds AND all candidate C values -- not the score of the selected C alone.
np.average(model_cv.scores_[1])

0.5907737781732888

In [77]:
# Score of the fitted model on the same data it was trained on; this is an
# in-sample figure, not a held-out estimate.
model_cv.score(X_train,y_train)

0.5925633783703553

In [78]:
# Repeat the 10-fold search over the 8 values of C held in listC
# (10**-4 ... 10**3); `scoring` is left at its default here.
model_cv = LogisticRegressionCV(cv=10, Cs=listC)
model_cv.fit(X_train, y_train)
# check which value of C was selected
print(model_cv.C_)


[0.001 0.001 0.001]


In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: both split criteria, maximum depths 1-10 and
# minimum leaf sizes 1-10 (2 * 10 * 10 = 200 combinations).
parametergrid = {
    "criterion": ("gini", "entropy"),
    "max_depth": tuple(range(1, 11)),
    "min_samples_leaf": tuple(range(1, 11)),
}

# random_state is fixed so the fitted trees are replicable.
clf = GridSearchCV(DecisionTreeClassifier(random_state=0), parametergrid)

# Exhaustive search over the grid; this may take a while.
clf.fit(X_train, y_train)

# Report, in order: the best estimator, its parameters, and the score the
# search obtained for it.
for result in (clf.best_estimator_, clf.best_params_, clf.best_score_):
    print(result)

DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4,
                       random_state=0)
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4}
0.6271850478713653
