In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Load the raw bookings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(r"../input/hotel-dataset/hotels.csv")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Missing-value count per column of the raw frame.
print("# of NaN in each columns:", df.isnull().sum(), sep='\n')

In [None]:
# Work on a copy: the cells below transform `data`, while `df` keeps the raw values.
data = df.copy()

## Cancellations by repeated guests

In [None]:
sns.set(style = "darkgrid")
# No explicit figure is created: the title is set on pyplot's current axes,
# and countplot (called without `ax`) draws on those same axes.
plt.title("canceled or not", fontdict = {'fontsize':20})
# Booking counts per is_canceled value, split by is_repeated_guest.
ax = sns.countplot(x = "is_canceled", hue="is_repeated_guest", data=data)

## Boxplot Distribution of Nights Spent at Hotels by Market Segment and Hotel Type

In [None]:
# Distribution of stays_in_week_nights per market segment, one box per hotel value.
plt.figure(figsize = (15,10))
# Trailing ';' suppresses the Axes repr in the cell output.
sns.boxplot(x = "market_segment", y ="stays_in_week_nights", data=data, hue="hotel", palette="Set1");

In [None]:
# Same plot as the week-night cell, for stays_in_weekend_nights.
plt.figure(figsize = (15,10))
# Trailing ';' suppresses the Axes repr in the cell output.
sns.boxplot(x = "market_segment", y ="stays_in_weekend_nights", data=data, hue="hotel", palette="Set1");

## Countplot Distribution of Market Segments

In [None]:
# Booking counts per market segment, split by deposit_type.
plt.figure(figsize= (13, 10))
sns.set(style = "darkgrid")
# The title is applied to the current axes of the figure created above; countplot then draws on them.
plt.title("Count Distribution of Segment by Deposit Type", fontdict={'fontsize':20})
ax = sns.countplot(x = "market_segment", hue="deposit_type", data=data)

In [None]:
# Booking counts per market segment, split by is_canceled.
plt.figure(figsize=(18,10))
sns.set(style="darkgrid")
# The title is applied to the current axes of the figure created above; countplot then draws on them.
plt.title("Countplot Distribution of Segment by cancellation", fontdict={'fontsize':20})
ax = sns.countplot(x = "market_segment", hue = 'is_canceled', data=data)

In [None]:
# Kernel density of lead_time, one curve per is_canceled value, x-axis limited to 0-500.
# `fill=True` replaces kdeplot's deprecated `shade=True` keyword, which newer
# seaborn releases warn about and plan to reject.
(sns.FacetGrid(data, hue = 'is_canceled',
              height = 6,
              xlim = (0, 500)).map(sns.kdeplot, 'lead_time', fill=True).add_legend())

In [None]:
# Booking counts per arrival month, split by hotel.
# arrival_date_month still holds the raw values here; it is mapped to numbers later, in the preprocessing cells.
plt.figure(figsize = (20,6))
sns.set(style="darkgrid")
plt.title("Total Customer = Monthly", fontdict={'fontsize':20})
ax = sns.countplot(x = "arrival_date_month", hue= "hotel", data=data)

In [None]:
# Bar height is seaborn's default estimate (the mean) of is_canceled per arrival month.
plt.figure(figsize = (20, 6))
sns.barplot(x = 'arrival_date_month', y='is_canceled', data=data)

In [None]:
# Same per-month estimate of is_canceled as the previous cell, split by hotel.
plt.figure(figsize=(20,6))
# Trailing ';' suppresses the Axes repr in the cell output.
sns.barplot(x = 'arrival_date_month', y = 'is_canceled', hue = 'hotel', data=data);

## Preprocessing

In [None]:
# Missing-value count per column; note this inspects the raw frame `df`, not the working copy `data`.
print('# of NaN in each columns:', df.isnull().sum(), sep='\n')

In [None]:
def perc_mv(x, y):
    """Return the missing values of ``y`` as a percentage of ``len(x)``.

    Parameters
    ----------
    x : sized object whose length is the denominator (the whole frame in this notebook).
    y : pandas object exposing ``isnull()`` (a single column in this notebook).

    Returns
    -------
    The number of nulls in ``y`` divided by ``len(x)``, times 100.
    """
    missing_count = y.isnull().sum()
    total_rows = len(x)
    return missing_count / total_rows * 100

# Percentage of missing values in the company, agent and country columns of the raw frame.
print('Missing value ratios:\nCompany: {}\nAgent: {}\nCountry: {}'.format(perc_mv(df, df['company']),
                                                                         perc_mv(df, df['agent']),
                                                                         perc_mv(df, df['country'])))

In [None]:
# Number of distinct non-null agent values.
data['agent'].nunique()

In [None]:
# Number of distinct non-null company values.
data['company'].nunique()

In [None]:
# Remove the company column from the working frame.
data = data.drop(columns='company')

In [None]:
# Treat a missing children value as 0 children.
data['children'] = data['children'].fillna(0)

In [None]:
# Column dtypes of the working frame, to see which columns still need encoding.
data.dtypes

In [None]:
# hotel and arrival_date_month are encoded by hand; the remaining categorical
# columns are handled later with get_dummies or LabelEncoder.
# Series.map leaves NaN for any value that is not a key of the mapping.
data['hotel'] = data['hotel'].map({'Resort Hotel': 0, 'City Hotel': 1})
data['arrival_date_month'] = data['arrival_date_month'].map(
    {month_name: month_number
     for month_number, month_name in enumerate(
         ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December'],
         start=1)})

In [None]:
def family(data):
    """Return 1 when a booking row has at least one adult plus a child or a baby, else 0.

    ``data`` is a single row (any mapping supporting ``data['adults']``,
    ``data['children']`` and ``data['babies']``).
    """
    has_adult = data['adults'] > 0
    has_minor = data['children'] > 0 or data['babies'] > 0
    return 1 if has_adult and has_minor else 0

def deposit(data):
    """Return 0 when the row's deposit_type is 'No Deposit' or 'Refundable', otherwise 1.

    ``data`` is a single row (any mapping supporting ``data['deposit_type']``).
    """
    deposit_kind = data['deposit_type']
    return 0 if deposit_kind in ('No Deposit', 'Refundable') else 1

In [None]:
def feature(data):
    """Add the engineered columns to ``data`` in place and return the same frame.

    Columns added, in this order:
      - is_family:      row-wise result of ``family``
      - total_customer: adults + children + babies
      - deposit_given:  row-wise result of ``deposit``
      - total_nights:   stays_in_weekend_nights + stays_in_week_nights
    """
    family_flag = data.apply(family, axis=1)
    data['is_family'] = family_flag

    party_size = data['adults'].add(data['children']).add(data['babies'])
    data['total_customer'] = party_size

    deposit_flag = data.apply(deposit, axis=1)
    data['deposit_given'] = deposit_flag

    nights = data['stays_in_weekend_nights'].add(data['stays_in_week_nights'])
    data['total_nights'] = nights
    return data

# feature() adds its columns to the frame it receives and returns that same frame.
data = feature(data)

In [None]:
# adults, babies, children and deposit_type are now summarised by the engineered
# features (total_customer, is_family, deposit_given), so the originals are dropped;
# reservation_status_date is dropped along with them.
# The two stays_in_* columns are kept for now: it is not yet decided whether they
# or total_nights carry more information.

data = data.drop(columns = ['adults', 'babies', 'children', 'deposit_type', 'reservation_status_date'])

After correlation we will decide what to do about country, agent and total_nights.

## Correlation

In [None]:
# Columns remaining after feature engineering.
data.columns

In [None]:
# Separate copy used only for the correlation study, so encoding it does not touch `data`.
cor_data = data.copy()

In [None]:
# One encoder instance, refit for each column it is applied to in later cells.
le = LabelEncoder()

In [None]:
# Integer-encode the categorical columns of the correlation copy only; this frame
# is not used for predicting cancellation. The shared encoder is refit per column,
# so after the loop it holds the classes of the last column (market_segment).
for categorical_column in ('meal', 'distribution_channel', 'reserved_room_type',
                           'assigned_room_type', 'agent', 'customer_type',
                           'reservation_status', 'market_segment'):
    cor_data[categorical_column] = le.fit_transform(cor_data[categorical_column])

In [None]:
# Pairwise correlations between the numeric columns.
# `country` is still a string column at this point; it is excluded explicitly
# because pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr() and raises instead.
cor_data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation of every numeric column with the target, sorted ascending.
# Non-numeric columns (`country` is still a string here) are excluded explicitly,
# since DataFrame.corr() raises on them in pandas >= 2.0.
cor_data.select_dtypes(include=['number', 'bool']).corr()['is_canceled'].sort_values()

In [None]:
# Correlation of every numeric column with stays_in_week_nights, sorted ascending.
# Non-numeric columns (`country` is still a string here) are excluded explicitly,
# since DataFrame.corr() raises on them in pandas >= 2.0.
cor_data.select_dtypes(include=['number', 'bool']).corr()['stays_in_week_nights'].sort_values()

In [None]:
# Remove the columns discarded after the correlation check. `columns=` already
# names the axis, so no separate `axis` argument is needed.
columns_to_discard = ['total_nights', 'arrival_date_week_number', 'stays_in_weekend_nights',
                      'arrival_date_month', 'agent']
cor_data = cor_data.drop(columns=columns_to_discard)

In [None]:
# Delete the rows whose country is missing.
# dropna(subset=...) selects rows by value. The previous version fed index
# *labels* into `cor_data.index[...]`, which indexes by *position* and is only
# correct while the frame still has the default 0..n-1 index.
cor_data = cor_data.dropna(subset=['country'])
# Remaining missing values per column.
cor_data.isnull().sum()

In [None]:
# Apply the same clean-up to the modelling frame: drop rows without a country,
# then drop the columns discarded after the correlation check.
# dropna(subset=...) selects rows by value. The previous version fed index
# *labels* into `data.index[...]`, which indexes by *position* and is only
# correct while the frame still has the default 0..n-1 index.
data = data.dropna(subset=['country'])
data = data.drop(columns=['arrival_date_week_number', 'stays_in_weekend_nights', 'arrival_date_month', 'agent'])

In [None]:
# Columns that go into the modelling frame.
data.columns

In [None]:
# Copy for encoding, so `data` keeps the un-encoded categorical columns.
df1 = data.copy()

In [None]:
# One-hot encode the listed categorical columns; each is replaced by one
# `<column>_<value>` indicator column per distinct value.
df1 = pd.get_dummies(data=df1, columns = ['meal', 'market_segment', 'distribution_channel',
                                          'reserved_room_type', 'assigned_room_type','customer_type',
                                           'reservation_status'])

In [None]:
# country is integer-encoded with the shared LabelEncoder instead of one-hot encoded;
# this refits `le`, replacing whatever classes it held before.
df1['country'] = le.fit_transform(df1['country'])

In [None]:
# Preview the fully encoded frame.
df1.head()

# Decision Tree Model (reservation_status included)

In [None]:
# Target and features taken from df1, which still contains the reservation_status_* dummies.
# NOTE(review): those dummies presumably encode the booking outcome itself, which
# would make this first model a leakage demonstration — confirm.
y = df1['is_canceled']
X = df1.drop(['is_canceled'], axis=1)

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 42)

In [None]:
# Decision tree limited to depth 12; no random_state is set on the estimator.
cart = DecisionTreeClassifier(max_depth = 12)
# fit() returns the estimator itself, so cart_model and cart are the same object.
cart_model = cart.fit(X_train, y_train)

In [None]:
# Hard class predictions on the held-out set.
y_pred = cart_model.predict(X_test)

In [None]:
print('Decision Tree Model')
# Accuracy and the confusion matrix use the hard predictions in y_pred.
# ROC AUC is computed from the positive-class probabilities instead: passing the
# 0/1 predictions collapses the ROC curve to a single threshold and misstates the AUC.
print('Accuracy Score: {}\n\nConfusion Matrix:\n {}\n\nAUC score: {}'.
      format(accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred),
             roc_auc_score(y_test, cart_model.predict_proba(X_test)[:, 1])))

In [None]:
# Feature importances of the fitted tree, scaled to percent; the 20 largest are
# drawn as a horizontal bar chart.
pd.DataFrame(data = cart_model.feature_importances_*100,
             columns = ["Importances"],
             index = X_train.columns).sort_values("Importances", ascending = False)[:20].plot(kind="barh", color="r")

# Labels the axes that the DataFrame.plot call above just created.
plt.xlabel("Feature Importance (%)")

### Final arrangement before comparing the models

In [None]:
# Remove the three reservation_status dummy columns before the model comparison.
# `columns=` already names the axis, so no separate `axis` argument is needed.
df2 = df1.drop(columns=['reservation_status_Canceled',
                        'reservation_status_Check-Out',
                        'reservation_status_No-Show'])

In [None]:
# Rebuild target/features from df2; this overwrites the X, y and split variables
# of the earlier "reservation_status included" experiment.
y = df2['is_canceled']
X = df2.drop(['is_canceled'], axis=1)

# Same 70/30 split and seed as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.30, random_state = 42)

In [None]:
def model(algorithm, X_train, X_test, y_train, y_test):
    """Fit ``algorithm`` on the training split and print its test accuracy and confusion matrix.

    Side effects: assigns two module-level globals that later cells read —
    ``y_prob`` (second column of ``predict_proba`` on ``X_test``) and
    ``y_pred`` (``predict`` on ``X_test``). Returns None.

    ``algorithm`` must provide ``fit``, ``predict`` and ``predict_proba``.
    """
    global y_prob, y_pred
    fitted = algorithm.fit(X_train, y_train)
    y_prob = algorithm.predict_proba(X_test)[:, 1]
    y_pred = fitted.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print('Accuracy Score: {}\n\nConfusion Matrix:\n {}'.format(accuracy, matrix))
    
def ROC(y_test, y_prob):
    """Draw the ROC curve of ``y_prob`` against ``y_test`` on a new 10x10 figure.

    The area under the curve is shown in the legend, and a dashed diagonal is
    added as a reference line. Returns None.
    """
    fpr, tpr, _thresholds = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(10, 10))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='blue', label='AUC = %0.2f' % roc_auc)
    # The legend is created before the diagonal is drawn, as in the original call order.
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.axis('tight')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

## Model and ROC curve Comparison

### Logistic Regression Model

In [None]:
print('Model: Logistic Regression\n')
# model() fits on the train split, prints the metrics and stores y_prob / y_pred as globals.
model(LogisticRegression(solver='liblinear'), X_train, X_test, y_train, y_test)

In [None]:
# 8-fold cross-validated accuracy on the full X, y (not only the training split),
# using a fresh estimator rather than the one fitted by model().
LogR = LogisticRegression(solver = "liblinear")
cv_scores = cross_val_score(LogR, X, y, cv = 8, scoring = 'accuracy')
print('Mean Score of CV: ', cv_scores.mean())

In [None]:
# ROC curve for the probabilities left in the global y_prob by the latest model() call.
ROC(y_test, y_prob)

## Gaussian Naive Bayes Model

In [None]:
print('Model: Guassian Naive Bayes\n')
# model() fits on the train split, prints the metrics and stores y_prob / y_pred as globals.
model(GaussianNB(), X_train, X_test, y_train, y_test)

In [None]:
# 8-fold cross-validated accuracy on the full X, y, using a fresh estimator.
NB = GaussianNB()
cv_scores = cross_val_score(NB, X, y, cv=8, scoring = 'accuracy')
print('Mean Score of CV: ', cv_scores.mean())

In [None]:
# ROC curve for the probabilities left in the global y_prob by the latest model() call.
ROC(y_test, y_prob)

# Support Vector Classification

In [None]:
print('Model: SVC\n')

def model1(algorithm, X_train, X_test, y_train, y_test):
    """Fit ``algorithm`` and print its test accuracy and confusion matrix.

    Variant of ``model`` for estimators used without ``predict_proba``: only the
    global ``y_pred`` is assigned, and the global ``y_prob`` is left untouched.
    Returns None.
    """
    global y_pred
    fitted = algorithm.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print('Accuracy Score: {}\n\nConfusion Matrix:\n {}'.format(accuracy, matrix))

In [None]:
# Linear-kernel SVC, run through model1() because the estimator is built without
# probability=True; only hard predictions are evaluated and y_prob is not updated.
model1(SVC(kernel = 'linear'), X_train, X_test, y_train, y_test)

## Decision Tree Model (reservation_status excluded)

In [None]:
print('Model: Decision Tree\n')
# Same depth-12 tree as the first experiment, now trained without the reservation_status dummies.
model(DecisionTreeClassifier(max_depth = 12), X_train, X_test, y_train, y_test)

In [None]:
# 8-fold cross-validated accuracy on the full X, y, using a fresh estimator.
DTC = DecisionTreeClassifier(max_depth = 12)
cv_scores = cross_val_score(DTC, X, y, cv=8, scoring='accuracy')
print('Mean Score of CV: ', cv_scores.mean())

In [None]:
# ROC curve for the probabilities left in the global y_prob by the latest model() call.
ROC(y_test, y_prob)

In [None]:
print("Random Forest")
# Use model() rather than model1(): model() also stores the positive-class
# probabilities in the global y_prob, which the ROC cell for this model reads.
# With model1() that plot silently reused the previous model's probabilities.
model(RandomForestClassifier(), X_train, X_test, y_train, y_test)

In [None]:
# 8-fold cross-validated accuracy on the full X, y, using a fresh estimator.
RFC = RandomForestClassifier()
# Store the result under cv_scores, the name the print reads. It was previously
# assigned to `cv_score`, so the print reported the mean of an earlier model's scores.
cv_scores = cross_val_score(RFC, X, y, cv = 8, scoring='accuracy')
print('Mean Score of CV: ', cv_scores.mean())

In [None]:
# ROC curve from the global y_prob, which only model() assigns (model1() does not).
ROC(y_test, y_prob)

# Random Forest Model Tuning

In [None]:
# Search grid: 2 x 3 x 2 = 12 parameter combinations.
rf_parameters = {"max_depth":[10,13],
                 "n_estimators": [10, 100, 500],
                 "min_samples_split": [2, 5]}

In [None]:
# Unfitted base estimator for the grid search; the name rf_model is reassigned
# later in the notebook to a fitted forest.
rf_model = RandomForestClassifier()

In [None]:
# Exhaustive search over rf_parameters with 10-fold CV on the training split,
# using all available cores (n_jobs=-1) and verbose progress output.
rfc_cv_model = GridSearchCV(rf_model,
                           rf_parameters,
                           cv=10,
                           n_jobs = -1,
                           verbose = 2)
rfc_cv_model.fit(X_train, y_train)

In [None]:
# Report the parameter combination selected by the grid search.
print('Best parameters: {}'.format(rfc_cv_model.best_params_))

In [None]:
# Random forest with hand-entered hyper-parameters.
# NOTE(review): presumably these are the values reported by the grid search
# above — they are typed in, not read from best_params_; confirm after a re-run.
rfc_tuned = RandomForestClassifier(
    n_estimators=500,
    max_depth=13,
    min_samples_split=2,
)
print('Model: Random Forest Tuned\n')
model(rfc_tuned, X_train, X_test, y_train, y_test)

# XG Boost

In [None]:
print('Model: XGBoost\n')
# XGBClassifier with its default settings; model() stores y_prob / y_pred as globals.
model(XGBClassifier(), X_train, X_test, y_train, y_test)

In [None]:
# 8-fold cross-validated accuracy on the full X, y, using a fresh estimator.
XGB = XGBClassifier()
# Store the result under cv_scores, the name the print reads. It was previously
# assigned to `cv_score`, so the print reported the mean of an earlier model's scores.
cv_scores = cross_val_score(XGB, X, y, cv=8, scoring='accuracy')
print('Mean Score of CV: ', cv_scores.mean())

In [None]:
# ROC curve for the probabilities left in the global y_prob by the latest model() call.
ROC(y_test, y_prob)

# Neural Network Model

In [None]:
# Standardise the features for the neural network. The scaler learns its
# statistics from the training split only and reuses them on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
print('Model: Neural Network\n')
# MLPClassifier with default settings, trained and evaluated on the standardised features.
model(MLPClassifier(), X_train_scaled, X_test_scaled, y_train, y_test)

In [None]:
# ROC curve for the probabilities left in the global y_prob by the latest model() call.
ROC(y_test, y_prob)

In [None]:
# Fit a default random forest on the unscaled training split to inspect feature importances.
randomf = RandomForestClassifier()
# Reassigns rf_model (previously the unfitted grid-search base estimator) to this fitted forest.
rf_model = randomf.fit(X_train, y_train)

# Importances scaled to percent; the 15 largest are drawn as a horizontal bar chart.
pd.DataFrame(data= rf_model.feature_importances_*100,
             columns = ['Importances'],
             index = X_train.columns).sort_values("Importances", ascending = False)[:15].plot(kind = 'barh', color="r")
plt.xlabel("Feature Importance (%)")

In [None]:
# Summary table of accuracy and ROC AUC per model.
# NOTE(review): the scores are entered by hand, not read from the fitted models,
# and no cell in this notebook trains a "Neural Network Tuned" model — re-check
# every number after a re-run.
# Models for which no ROC curve was produced (Support Vector, Random Forest Tuned)
# get NaN rather than 0: an AUC of 0 would read as a measured, worst-possible
# score instead of "not measured".
table = pd.DataFrame({"Model": ["Decision Tree (reservation status included)", "Logistic Regression",
                                "Naive Bayes", "Support Vector", "Decision Tree", "Random Forest",
                                "Random Forest Tuned", "XGBoost", "Neural Network", "Neural Network Tuned"],
                     "Accuracy Scores": [1, 0.804, 0.582, 0.794, 0.846,
                                         0.883, 0.851, 0.869, 0.848, 0.859],
                     "ROC | Auc": [1, 0.88, 0.78, np.nan,
                                   0.92, 0.95, np.nan, 0.94,
                                   0.93, 0.94]})


table["Model"] = table["Model"].astype("category")
table["Accuracy Scores"] = table["Accuracy Scores"].astype("float32")
table["ROC | Auc"] = table["ROC | Auc"].astype("float32")

# One row per model, best accuracy first.
pd.pivot_table(table, index = ["Model"]).sort_values(by = 'Accuracy Scores', ascending=False)