In [1]:
### imports
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelBinarizer, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier

### Preprocessing

In [2]:
hotel_df = pd.read_csv('Hotel Reservations.csv')
hotel_df.drop(['Booking_ID'], inplace=True, axis=1)

In [3]:
categorical_vars = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'market_segment_type']
numerical_vars = ['no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights', 'lead_time', 'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room', 'no_of_special_requests']
time_vars = ['arrival_year', 'arrival_month', 'arrival_date']

excluded_vars = ['arrival_year']

included_categorical_vars = [var for var in categorical_vars if var not in excluded_vars]
included_numerical_vars = [var for var in numerical_vars if var not in excluded_vars]
included_time_vars = [var for var in time_vars if var not in excluded_vars]

X = hotel_df.drop(['booking_status'] + excluded_vars, axis=1)
y = hotel_df['booking_status']

y_bin = LabelBinarizer().fit_transform(y)

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.2, random_state=92)

### Neural Network

In [4]:
### load model
model_nn = tf.keras.models.load_model('nn_final.h5', compile=False)
model_nn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), 
                loss=tf.keras.losses.binary_crossentropy, 
                metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

In [5]:
### load feature transformer
scaler = pickle.load(open('scaler.pkl', 'rb'))

In [6]:
def nn_predict(x):
    '''
    Takes an input, x, and uses the neural network to predict if the hotel
    reservation is cancelled.
    
    ------------
    Input
    ------------
    The input must be an (n x 16) numpy array or an (n x 16) subset of a named DataFrame.

    If the input is a numpy array, then order of the variables must be:

    'type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'market_segment_type', 'no_of_adults', 'no_of_children', 
    'no_of_weekend_nights', 'no_of_week_nights', 'lead_time', 'repeated_guest', 'no_of_previous_cancellations', 
    'no_of_previous_bookings_not_canceled', 'avg_price_per_room', 'no_of_special_requests', 'arrival_month', 'arrival_date'

    ------------
    Output
    ------------
    The output is either 0 or 1.
    0: 'Cancelled'
    1: 'Not_Cancelled'
    
    '''
    ### transform input
    x_scaled = scaler.transform(x)

    ### make prediction
    threshold = 0.5
    pred = model_nn.predict(x_scaled)
    pred = np.where(pred >= threshold, 1, 0)

    return pred

In [7]:
### determine predicted values
nn_train_pred = nn_predict(X_train)
nn_test_pred = nn_predict(X_test)

### accuracy
nn_train_acc = accuracy_score(y_train, nn_train_pred)
nn_test_acc = accuracy_score(y_test, nn_test_pred)
print(f"Training Accuracy:  {100*nn_train_acc:.2f}%")
print(f"Testing Accuracy:  {100*nn_test_acc:.2f}%")
print(classification_report(nn_train_pred, y_train))

Training Accuracy:  88.58%
Testing Accuracy:  87.17%
              precision    recall  f1-score   support

           0       0.73      0.90      0.81      7734
           1       0.96      0.88      0.92     21286

    accuracy                           0.89     29020
   macro avg       0.85      0.89      0.86     29020
weighted avg       0.90      0.89      0.89     29020



### Naive Bayes

In [8]:
###############################
### Naive Bayes Categorical ###
###############################

oe = OrdinalEncoder()
X_nb_cat_scaled = oe.fit_transform(X[included_categorical_vars])
clf = CategoricalNB()
clf.fit(X_nb_cat_scaled, y_bin)


def NB_cat(x):
    x_scaled = oe.transform(x[included_categorical_vars])
    pred = clf.predict(x_scaled)
    pred = pred.reshape(-1, 1)
    return pred

### determine predicted values
nb_cat_train_pred = NB_cat(X_train)
nb_cat_test_pred = NB_cat(X_test)

### accuracy
nb_cat_train_acc = accuracy_score(y_train, nb_cat_train_pred)
nb_cat_test_acc = accuracy_score(y_test, nb_cat_test_pred)
print(f"Training Accuracy:  {100*nb_cat_train_acc:.2f}%")
print(f"Testing Accuracy:  {100*nb_cat_test_acc:.2f}%")
print(classification_report(nb_cat_train_pred, y_train))

Training Accuracy:  67.15%
Testing Accuracy:  67.33%
              precision    recall  f1-score   support

           0       0.01      0.46      0.02       208
           1       0.99      0.67      0.80     28812

    accuracy                           0.67     29020
   macro avg       0.50      0.56      0.41     29020
weighted avg       0.99      0.67      0.80     29020



  y = column_or_1d(y, warn=True)


In [9]:
#############################
### Naive Bayes Numerical ###
#############################

scaler_nb_num = StandardScaler()
X_nb_num_scaled = scaler_nb_num.fit_transform(X[included_numerical_vars])
clf_num = GaussianNB()
clf_num.fit(X_nb_num_scaled, y_bin)

def NB_num(x):
    x_scaled = scaler_nb_num.transform(x[included_numerical_vars])
    pred = clf_num.predict(x_scaled)
    pred = pred.reshape(-1, 1)
    return pred

### determine predicted values
nb_num_train_pred = NB_num(X_train)
nb_num_test_pred = NB_num(X_test)

### accuracy
nb_num_train_acc = accuracy_score(y_train, nb_num_train_pred)
nb_num_test_acc = accuracy_score(y_test, nb_num_test_pred)
print(f"Training Accuracy:  {100*nb_num_train_acc:.2f}%")
print(f"Testing Accuracy:  {100*nb_num_test_acc:.2f}%")
print(classification_report(nb_num_train_pred, y_train))

Training Accuracy:  37.95%
Testing Accuracy:  38.00%
              precision    recall  f1-score   support

           0       1.00      0.35      0.51     27500
           1       0.08      0.99      0.14      1520

    accuracy                           0.38     29020
   macro avg       0.54      0.67      0.33     29020
weighted avg       0.95      0.38      0.49     29020



  y = column_or_1d(y, warn=True)


### Decision Tree

In [10]:
dt = DecisionTreeClassifier()
X_dt_scaled = scaler.transform(X)
dt.fit(X_dt_scaled, y_bin)

def DecisionTree(x):
    x_scaled = scaler.transform(x)
    pred = dt.predict(x_scaled)
    pred = pred.reshape(-1, 1)
    return pred

### determine predicted values
dt_train_pred = DecisionTree(X_train)
dt_test_pred = DecisionTree(X_test)
### accuracy

dt_train_acc = accuracy_score(y_train, dt_train_pred)
dt_test_acc = accuracy_score(y_test, dt_test_pred)
print(f"Training Accuracy:  {100*dt_train_acc:.2f}%")
print(f"Testing Accuracy:  {100*dt_test_acc:.2f}%")
print(classification_report(dt_train_pred, y_train))

Training Accuracy:  99.35%
Testing Accuracy:  99.46%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      9531
           1       0.99      1.00      1.00     19489

    accuracy                           0.99     29020
   macro avg       0.99      0.99      0.99     29020
weighted avg       0.99      0.99      0.99     29020



### Ensemble

In [13]:
def ensemble(x):
    nn_pred = nn_predict(x)
    dt_pred = DecisionTree(x)
    nb_cat_pred = NB_cat(x)
    nb_num_pred = NB_num(x)

    print("Using weights (nn, dt, nb_cat, nb_num):", nn_test_acc, dt_test_acc, nb_cat_test_acc, nb_num_test_acc)
    mean_pred = np.average([nn_pred, dt_pred, nb_cat_pred, nb_num_pred], axis=0, weights=[nn_test_acc, dt_test_acc, nb_cat_test_acc, nb_num_test_acc])

    final_pred = np.rint(mean_pred)

    return final_pred

In [14]:
### determine predicted values
ensemble_train_pred = ensemble(X_train)
ensemble_test_pred = ensemble(X_test)
### accuracy
print(f"Training Accuracy:  {100*accuracy_score(y_train, ensemble_train_pred):.2f}%")
print(f"Testing Accuracy:  {100*accuracy_score(y_test, ensemble_test_pred):.2f}%")

Using weights (nn, dt, nb_cat, nb_num): 0.8716747070985528 0.9946243969676085 0.673328738800827 0.380013783597519
Using weights (nn, dt, nb_cat, nb_num): 0.8716747070985528 0.9946243969676085 0.673328738800827 0.380013783597519
Training Accuracy:  91.16%
Testing Accuracy:  90.56%
