In [None]:
# common:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Number of rows removed by cleaning (raw row count minus cleaned row count).
# NOTE(review): `full_data` and `full_data_cln` are only assigned in later
# cells, so on a fresh kernel (Restart & Run All) this cell raises NameError;
# it has to run after the "Data Cleaning" cells.
full_data.shape[0] - full_data_cln.shape[0]

# Data Cleaning

In [None]:
# Load the raw bookings table from the CSV located next to the notebook
# and preview its first rows.
file_path = "./hotel_bookings.csv"
full_data = pd.read_csv(filepath_or_buffer=file_path)
full_data.head(5)

In [None]:
# Count of missing (NaN) entries in every column of the raw table.
full_data.isna().sum()

In [None]:
## replace missing values:
# Per-column fill values for NaNs. The keys must match the column names
# exactly: fillna() silently ignores keys that are not columns (the previous
# "children:" key, with a stray colon, therefore left `children` unfilled).
replace_dict = {"children": 0.0, "country": "Unknown", "agent": 0, "company": 0}
full_data_cln = full_data.fillna(replace_dict)

# "meal" contains values "Undefined", which is equal to SC.
# Assign the result back to the column: calling replace(inplace=True) on a
# column selection is not guaranteed to write through to the frame.
full_data_cln["meal"] = full_data_cln["meal"].replace("Undefined", "SC")

# Some rows contain entries with 0 adults, 0 children and 0 babies.
# Drop entries with no guests. The rows are removed by index *label*, so the
# result is correct even when the index is not a default 0..n-1 range.
guest_total = (full_data_cln["adults"]
               + full_data_cln["children"]
               + full_data_cln["babies"])
zero_guests = list(full_data_cln.loc[guest_total == 0].index)
full_data_cln = full_data_cln.drop(index=zero_guests)

In [None]:
# Shape of the cleaned table. The author recorded 119210 samples here;
# re-check this figure after re-running the cleaning cell.
full_data_cln.shape

# Cancellation Prediction

In [None]:
# Correlation of every numeric column with the target. The frame is
# restricted to numeric/bool columns first because DataFrame.corr() raises on
# string columns in recent pandas versions (numeric_only defaults to False).
cancel_corr = (
    full_data_cln
    .select_dtypes(include=["number", "bool"])
    .corr()["is_canceled"]
)
# Rank columns by the strength (absolute value) of that correlation.
corr = cancel_corr.abs().sort_values(ascending=False)

In [None]:
# Correlation strengths of the feature columns, strongest first. The target is
# removed by label instead of by position, so this does not rely on
# "is_canceled" happening to sort first.
ys = corr.drop("is_canceled")
# Take the bar labels from the series itself so they always line up with the
# values; a hand-typed list silently mislabels bars if the ranking changes.
xs = ys.index.tolist()

In [None]:
# Bar chart of the absolute correlation with the target, one bar per column.
fig, ax = plt.subplots()
ax.bar(xs, ys)
# Slant the column names so the tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel("column name")
ax.set_ylabel("correlation")
ax.set_title("Correlation with Predicted Class")
plt.show()

In [None]:
# Tally of bookings per value of the target column.
cancel_counts = full_data_cln.loc[:, 'is_canceled'].value_counts()
cancel_counts

In [None]:
# naive accuracy
# value_counts() orders its result by frequency, not by class label, so pick
# the classes explicitly to keep each bar under the right tick label.
# Assumes label 0 = not cancelled and 1 = cancelled, as the tick labels imply.
class_counts = cancel_counts.reindex([0, 1], fill_value=0)
plt.bar(["not cancelled", "cancelled"], class_counts)
plt.ylabel("counts")
plt.title("Class Distribution for Cancellation Prediction")
plt.show()
# Accuracy of a classifier that always predicts the majority class.
cancel_counts.max() / cancel_counts.sum()

In [None]:
## choose the 10 most useful features based on correlation
# NOTE(review): this list is hand-picked; confirm it still matches the
# current correlation ranking.

# all features are numerical
full_feature_list = [
    "lead_time",
    "total_of_special_requests",
    "required_car_parking_spaces",
    "booking_changes",
    "previous_cancellations",
    "is_repeated_guest",
    "agent",
    "adults",
    "previous_bookings_not_canceled",
    "days_in_waiting_list",
]

# Target vector, then the feature matrix. Selecting only the listed columns
# already leaves the target out of X.
y = full_data_cln["is_canceled"]
X = full_data_cln[full_feature_list]

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Number of cross-validation folds shared by the model cells below: with 5
# folds, each split trains on 80% of the rows and validates on the other 20%.
num_folds = 5 # 5 fold cross validation = 80% train, 20% validation

In [None]:
def train(model, model_name, X, y, num_folds):
    """Cross-validate ``model`` on standardised features and print its accuracy.

    The estimator is placed behind a ``StandardScaler`` in a pipeline and
    scored with ``num_folds``-fold cross-validation on accuracy. The mean and
    standard deviation of the fold scores are printed; nothing is returned.

    Args:
        model: Estimator to evaluate.
        model_name: Label used in the printed summary line.
        X: Feature matrix.
        y: Target vector.
        num_folds: Value passed as ``cv`` to ``cross_val_score``.
    """
    pipeline = make_pipeline(StandardScaler(), model)
    fold_scores = cross_val_score(
        pipeline, X, y, scoring='accuracy', cv=num_folds, n_jobs=-1
    )
    mean_acc, std_acc = np.mean(fold_scores), np.std(fold_scores)
    print(f"{model_name} performance: {mean_acc:.2f} +- {std_acc:.2f}")

In [None]:
# Baseline 1: Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
model_name = "Gaussian Naive Bayes"
model = GaussianNB()

# cross-validate and print the accuracy summary
train(model=model, model_name=model_name, X=X, y=y, num_folds=num_folds)

In [None]:
# Baseline 2: Logistic Regression
from sklearn.linear_model import LogisticRegression
model_name = "Logistic Regression"
model = LogisticRegression()

# cross-validate and print the accuracy summary
train(model=model, model_name=model_name, X=X, y=y, num_folds=num_folds)

In [None]:
# Baseline 3: XGBoost
from xgboost import XGBClassifier
model_name = "XGBoost"
model = XGBClassifier()

# cross-validate and print the accuracy summary
train(model=model, model_name=model_name, X=X, y=y, num_folds=num_folds)

In [None]:
# init model - simple neural network
import torch
from torch import nn
import torch.nn.functional as F

class SimpleNet(nn.Module):
    """Two-hidden-layer feed-forward binary classifier.

    Each hidden layer is Linear -> ReLU -> BatchNorm1d; dropout (p=0.1) is
    applied after the second hidden layer, and a final Linear + sigmoid
    produces one probability per sample.

    Args:
        input_dim: Number of input features.
        hid_dim: Width of both hidden layers.
    """

    def __init__(self, input_dim, hid_dim):
        super().__init__()

        # Linear layers are created first and in this order so that parameter
        # initialisation and parameter ordering stay unchanged.
        self.layer_1 = nn.Linear(input_dim, hid_dim)
        self.layer_2 = nn.Linear(hid_dim, hid_dim)
        self.layer_out = nn.Linear(hid_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(hid_dim)
        self.batchnorm2 = nn.BatchNorm1d(hid_dim)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` probabilities."""
        hidden = self.batchnorm1(self.relu(self.layer_1(x)))
        hidden = self.batchnorm2(self.relu(self.layer_2(hidden)))
        logits = self.layer_out(self.dropout(hidden))
        return torch.sigmoid(logits)

from sklearn.metrics import confusion_matrix, classification_report
import ipdb
    
def evaluate(model, dataloader):
    """Print and return the accuracy of a binary classifier over ``dataloader``.

    The function does not switch the model to evaluation mode; call
    ``model.eval()`` beforehand if the model has dropout/batch-norm layers.

    Args:
        model: Callable mapping a float batch to one probability per sample
            (e.g. a network whose forward pass ends in a sigmoid).
        dataloader: Iterable of ``(inputs, labels)`` batches with 0/1 labels.

    Returns:
        Fraction of correctly classified samples, in ``[0, 1]``.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.float(), labels.float()
            outputs = model(inputs)
            # The model emits a single probability per sample, so the class is
            # obtained by thresholding at 0.5. An argmax over the size-1 class
            # dimension would always return 0 and predict "not cancelled" for
            # every sample.
            predicted = (outputs.reshape(-1) >= 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels.reshape(-1)).sum().item()

    if total == 0:
        raise ValueError("dataloader yielded no samples")

    print('Accuracy of the network on test set: %d %%' % (
        100 * correct / total))
    return correct / total

In [None]:
from sklearn.model_selection import KFold

def train_ann(X, y, num_folds, lr = 0.01, hid_dim = 64, loss_func = nn.BCELoss(), run_cv=False, num_epoch=None):
    """Train and evaluate a ``SimpleNet`` on K-fold splits of ``(X, y)``.

    For every fold the features are standardised with statistics fitted on the
    training split only, a fresh network is trained with SGD (momentum 0.9),
    and its accuracy on the training and test splits is printed via
    ``evaluate``.

    Args:
        X: Feature table exposing ``to_numpy()`` and ``shape`` (e.g. a DataFrame).
        y: Binary target exposing ``to_numpy()`` (e.g. a Series of 0/1 values).
        num_folds: Number of K-fold splits.
        lr: SGD learning rate.
        hid_dim: Width of the network's hidden layers.
        loss_func: Loss applied to the network's sigmoid outputs.
        run_cv: If False, only the first fold is trained; if True, all folds.
        num_epoch: Number of training epochs. When None, the notebook-level
            ``num_epoch`` variable is used, which keeps existing calls working.
    """
    if num_epoch is None:
        # Backward compatibility: earlier versions read this from a global.
        num_epoch = globals()["num_epoch"]

    batch_size = 512
    kfold = KFold(n_splits=num_folds, shuffle=True)

    # Convert once; the fold loop only indexes into these arrays.
    X_all = X.to_numpy()
    y_all = y.to_numpy()

    # K-fold cross-validation loop
    for fold, (train_ids, test_ids) in enumerate(kfold.split(X)):
        # Print
        print(f'FOLD {fold}')

        # init data
        X_train, y_train = X_all[train_ids], y_all[train_ids]
        X_test, y_test = X_all[test_ids], y_all[test_ids]

        # normalize data (scaler fitted on the training split only)
        scaler = StandardScaler().fit(X_train)
        X_train_transformed = scaler.transform(X_train)
        X_test_transformed = scaler.transform(X_test)

        # Wrap the arrays as tensor datasets instead of building Python lists
        # of (row, label) pairs one element at a time.
        train_data = torch.utils.data.TensorDataset(
            torch.as_tensor(X_train_transformed).float(),
            torch.as_tensor(y_train).float())
        test_data = torch.utils.data.TensorDataset(
            torch.as_tensor(X_test_transformed).float(),
            torch.as_tensor(y_test).float())

        trainloader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)
        # Batch order does not affect accuracy, so the test split is not shuffled.
        testloader = torch.utils.data.DataLoader(test_data, shuffle=False, batch_size=batch_size)

        # init model
        simple_ann = SimpleNet(X.shape[1], hid_dim)

        # init training
        optimizer = torch.optim.SGD(simple_ann.parameters(), lr=lr, momentum=0.9)
        criterion = loss_func

        # training
        for epoch in range(num_epoch):
            running_loss = 0.0
            for i, (inputs, labels) in enumerate(trainloader):
                inputs, labels = inputs.float(), labels.float()
                labels = labels.unsqueeze(-1)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = simple_ann(inputs)
                loss = criterion(outputs, labels)

                if i%200 == 199:
                    # Threshold the single sigmoid output; an argmax over the
                    # size-1 class dimension would always predict class 0.
                    predicted = (outputs.detach() >= 0.5).float()
                    num_correct = (predicted == labels).sum().item()
                    # Divide by the actual batch size (the last batch may be smaller).
                    print(loss.item(), num_correct / labels.size(0))
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()

            if epoch % 10 == 9:
                print('[epoch %d] loss: %.3f' %
                      (epoch + 1, running_loss / len(trainloader)))

        # evaluate() prints the same "test set" wording for both calls: the
        # first line is the training split, the second the held-out split.
        simple_ann.eval()
        evaluate(simple_ann, trainloader)
        evaluate(simple_ann, testloader)
        simple_ann.train()
        if not run_cv:
            break

In [None]:
# Hyper-parameter values to sweep for the neural network.
lr_list = [0.1, 0.03, 0.01, 0.003, 0.001]
hid_dim_list = [256, 64, 16]
loss_list = [nn.BCELoss()]

# Number of training epochs; train_ann reads this notebook-level variable.
num_epoch = 50

# Every (loss, hidden width, learning rate) combination, with the learning
# rate varying fastest.
grid = [(loss, hid_dim, lr)
        for loss in loss_list
        for hid_dim in hid_dim_list
        for lr in lr_list]

for loss, hid_dim, lr in grid:
    print(lr, hid_dim, loss)
    train_ann(X, y, num_folds=5, lr=lr, hid_dim=hid_dim, loss_func=loss, run_cv=False)