In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session. Note that this is a
# blanket filter: it hides all warning categories, not only cosmetic ones.
warnings.filterwarnings('ignore')

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Main sample table: a plain comma-separated file.
data = pd.read_csv("datasets/data_sample.csv")

# Second table: a tab-separated file read straight out of a zip archive.
dest = pd.read_csv(
    "datasets/dest.zip",
    sep = "\t",
    compression = "zip",
)

In [None]:
# (rows, columns) of each input table, one per line.
print(data.shape, dest.shape, sep = "\n")

In [None]:
# Inner-join the two tables. No `on=` is given, so pandas joins on every
# column name the two frames have in common.
merged = data.merge(dest)

# Size of the joined table.
merged.shape

In [None]:
# Column names of the joined table as a plain Python list.
list(merged.columns)

In [None]:
# Remove the display row cap so the per-column report is not truncated.
pd.options.display.max_rows = None

# Count the missing entries in each column of the joined table and print them.
missing_values = merged.isnull().sum()
print(missing_values)

In [None]:
# Drop every row that contains at least one missing value.
# The result is rebound to `merged` instead of using `inplace=True`, so the
# step reads as an explicit transformation rather than a hidden mutation.
merged = merged.dropna()

In [None]:
# Remove the display row cap so the per-column report is not truncated.
pd.options.display.max_rows = None

# Re-count missing entries per column (run after the rows with gaps were dropped).
missing_values = merged.isnull().sum()
print(missing_values)

In [None]:
# (rows, columns) of the joined table at this point in the notebook.
merged.shape

In [None]:
# Keep the first 32 columns of the joined table, selected by position.
# NOTE(review): the cut-off 32 is positional — confirm it still matches the
# intended columns if either input file changes.
# `.copy()` makes the working frame independent of `merged`, so the column
# assignments done on `final_data` later do not operate on a slice.
final_data = merged.iloc[:, 0:32].copy()

In [None]:
# Preview the first rows of the working frame.
final_data.head()

In [None]:
# Column labels of the working frame.
final_data.columns

In [None]:
# Print the dtype of every column in the working frame.
print(final_data.dtypes)

In [None]:
plt.figure(figsize = (24, 12))

# Correlate numeric/boolean columns only. Non-numeric columns (for example
# `date_time` and the `*_band` columns) are still in the frame at this point,
# and recent pandas versions raise on them instead of silently skipping them.
corr = final_data.select_dtypes(include = ['number', 'bool']).corr()
sns.heatmap(corr, annot = True, linewidths = 1)
plt.show()

In [None]:
# Absolute correlation of each numeric/boolean column with the target
# `is_booking`, strongest first. Non-numeric columns are excluded up front
# because recent pandas versions raise on them in `corr()`.
correlation = (
    final_data
    .select_dtypes(include = ['number', 'bool'])
    .corr()["is_booking"]
    .abs()
    .sort_values(ascending = False)
)
correlation

In [None]:
# Preview the first rows of the working frame.
final_data.head()

In [None]:
# Parse the timestamp column once.
search_time = pd.to_datetime(final_data['date_time'])

# Build the new frame in a single chained expression: append the calendar
# parts as numeric features, then drop the raw timestamp together with the
# other columns excluded from modelling. Using `assign` avoids assigning
# columns in place on a frame that was sliced out of `merged`.
final_data = (
    final_data
    .assign(
        year = search_time.dt.year,
        month = search_time.dt.month,
        day = search_time.dt.day,
    )
    .drop(columns = ['date_time', 'Unnamed: 0', 'user_id', 'site_name', 'user_location_country',
                     'user_location_region', 'user_location_city', 'srch_ci',
                     'srch_co', 'hotel_country', 'srch_destination_name'])
)

In [None]:
# Preview the frame after the date features were added and columns dropped.
final_data.head()

In [None]:
# Names of the columns whose dtype is `object` (the non-numeric ones).
cat_cols = [name for name, kind in final_data.dtypes.items() if kind == 'O']
cat_cols

In [None]:
# Sub-frame holding only the object-dtype columns, plus a preview of it.
cat_df = final_data.loc[:, cat_cols]
cat_df.head()

In [None]:
# Show the distinct values found in each object-dtype column.
for name, values in cat_df.items():
    print(f"{name}: \n{values.unique()}\n")

In [None]:
# Integer-encode the three band columns into new `<column>_encoded` columns.
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`, so
# every mapping stays available afterwards (refitting one shared encoder
# would keep only the mapping of the last column).
# NOTE(review): LabelEncoder numbers the labels in sorted order, which is not
# necessarily the natural order of the bands — confirm this is acceptable.
label_encoders = {}
for band_col in ['distance_band', 'hist_price_band', 'popularity_band']:
    encoder = LabelEncoder()
    final_data[f'{band_col}_encoded'] = encoder.fit_transform(final_data[band_col])
    label_encoders[band_col] = encoder

# Backward-compatible alias: `le` refers to the encoder fitted on the last
# column, matching the state the single shared encoder used to end in.
le = label_encoders['popularity_band']


In [None]:
# Preview the frame with the `*_encoded` columns added.
final_data.head()

In [None]:
# Column labels before the original object-dtype columns are removed.
final_data.columns

In [None]:
# Remove the original object-dtype columns now that encoded versions exist.
# `columns=` already names the axis, so no separate `axis` argument is passed.
final_data = final_data.drop(columns = cat_cols)

In [None]:
# Column labels of the frame that goes into modelling.
final_data.columns

In [None]:
# Target: the `is_booking` column. Features: every other column.
y = final_data['is_booking']
X = final_data.drop(columns = ['is_booking'])

In [None]:
# Shapes of the feature matrix and the target vector.
X.shape, y.shape

In [None]:


# Split into 70% training and 30% test data.
# `random_state` fixes the shuffle so the reported metrics are reproducible
# across runs; `stratify=y` keeps the proportion of each `is_booking` class
# the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.30, random_state = 42, stratify = y
)

In [None]:
# Preview the training features.
X_train.head()

In [None]:
# Preview the test features.
X_test.head()

In [None]:
# Preview the training and test targets side by side.
y_train.head(), y_test.head()

In [None]:
# Logistic regression on standardized features.
# The scaler sits inside the pipeline, so it is fitted on the training data
# only and re-applied automatically at prediction time. `max_iter` is raised
# above the default of 100 to give the solver room to converge.
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter = 1000))
lr.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_lr = lr.predict(X_test)

acc_lr = accuracy_score(y_test, y_pred_lr)
conf = confusion_matrix(y_test, y_pred_lr)
clf_report = classification_report(y_test, y_pred_lr)

print(f"Accuracy Score of Logistic Regression is : {acc_lr}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# k-nearest neighbours on standardized features.
# KNN compares rows by distance, so features are put on a common scale first;
# the scaler lives in the pipeline and is fitted on the training data only.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_knn = knn.predict(X_test)

acc_knn = accuracy_score(y_test, y_pred_knn)
conf = confusion_matrix(y_test, y_pred_knn)
clf_report = classification_report(y_test, y_pred_knn)

print(f"Accuracy Score of KNN is : {acc_knn}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# Decision tree with default settings; `random_state` makes the fitted tree
# (and therefore the reported metrics) reproducible across runs.
dtc = DecisionTreeClassifier(random_state = 42)
dtc.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_dtc = dtc.predict(X_test)

acc_dtc = accuracy_score(y_test, y_pred_dtc)
conf = confusion_matrix(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)

print(f"Accuracy Score of Decision Tree is : {acc_dtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# Random forest with default settings; `random_state` fixes the bootstrap
# samples and feature draws so the metrics are reproducible across runs.
rd_clf = RandomForestClassifier(random_state = 42)
rd_clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_rd_clf = rd_clf.predict(X_test)

acc_rd_clf = accuracy_score(y_test, y_pred_rd_clf)
conf = confusion_matrix(y_test, y_pred_rd_clf)
clf_report = classification_report(y_test, y_pred_rd_clf)

print(f"Accuracy Score of Random Forest is : {acc_rd_clf}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# AdaBoost using the decision tree `dtc` as its base learner.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both names. `random_state` makes the boosting
# run reproducible.
ada = AdaBoostClassifier(dtc, random_state = 42)
ada.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_ada = ada.predict(X_test)

acc_ada = accuracy_score(y_test, y_pred_ada)
conf = confusion_matrix(y_test, y_pred_ada)
clf_report = classification_report(y_test, y_pred_ada)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_ada}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# Gradient boosting with default settings; `random_state` makes the fitted
# ensemble (and the reported metrics) reproducible across runs.
gb = GradientBoostingClassifier(random_state = 42)
gb.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_gb = gb.predict(X_test)

acc_gb = accuracy_score(y_test, y_pred_gb)
conf = confusion_matrix(y_test, y_pred_gb)
clf_report = classification_report(y_test, y_pred_gb)

print(f"Accuracy Score of Gradient Boosting Classifier is : {acc_gb}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# XGBoost tree booster with explicitly chosen hyper-parameters.
xgb = XGBClassifier(
    n_estimators = 180,
    max_depth = 5,
    learning_rate = 0.1,
    booster = 'gbtree',
)
xgb.fit(X_train, y_train)

# Predict the held-out test split and compute the three summary metrics.
y_pred_xgb = xgb.predict(X_test)
clf_report = classification_report(y_test, y_pred_xgb)
conf = confusion_matrix(y_test, y_pred_xgb)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

print(
    f"Accuracy Score of XGB Classifier is : {acc_xgb}",
    f"Confusion Matrix : \n{conf}",
    f"Classification Report : \n{clf_report}",
    sep = "\n",
)

In [None]:
# CatBoost limited to 100 boosting iterations.
cat = CatBoostClassifier(iterations = 100)
cat.fit(X_train, y_train)

# Predict the held-out test split and compute the three summary metrics.
y_pred_cat = cat.predict(X_test)
clf_report = classification_report(y_test, y_pred_cat)
conf = confusion_matrix(y_test, y_pred_cat)
acc_cat = accuracy_score(y_test, y_pred_cat)

print(
    f"Accuracy Score of Cat Boost Classifier is : {acc_cat}",
    f"Confusion Matrix : \n{conf}",
    f"Classification Report : \n{clf_report}",
    sep = "\n",
)

In [None]:
# Extra-trees ensemble with default settings; `random_state` fixes the random
# split choices so the metrics are reproducible across runs.
etc = ExtraTreesClassifier(random_state = 42)
etc.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_etc = etc.predict(X_test)

acc_etc = accuracy_score(y_test, y_pred_etc)
conf = confusion_matrix(y_test, y_pred_etc)
clf_report = classification_report(y_test, y_pred_etc)

print(f"Accuracy Score of Extra Trees Classifier is : {acc_etc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# LightGBM with the learning rate set to 1.
# NOTE(review): a learning rate of 1 is unusually large for boosting —
# confirm it is intentional.
lgbm = LGBMClassifier(learning_rate = 1)
lgbm.fit(X_train, y_train)

# Predict the held-out test split and compute the three summary metrics.
y_pred_lgbm = lgbm.predict(X_test)
clf_report = classification_report(y_test, y_pred_lgbm)
conf = confusion_matrix(y_test, y_pred_lgbm)
acc_lgbm = accuracy_score(y_test, y_pred_lgbm)

print(
    f"Accuracy Score of LGBM Classifier is : {acc_lgbm}",
    f"Confusion Matrix : \n{conf}",
    f"Classification Report : \n{clf_report}",
    sep = "\n",
)

In [None]:
# (label, estimator) pairs for the ensemble: every model built above.
classifiers = [
    ('Gradient Boosting Classifier', gb),
    ('Cat Boost Classifier', cat),
    ('XGboost', xgb),
    ('Decision Tree', dtc),
    ('Extra Tree', etc),
    ('Light Gradient', lgbm),
    ('Random Forest', rd_clf),
    ('Ada Boost', ada),
    ('Logistic', lr),
    ('Knn', knn),
]

# Voting ensemble over all ten models, left in its default voting mode.
vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train)

# Predict the held-out test split and compute the three summary metrics.
y_pred_vc = vc.predict(X_test)
clf_report = classification_report(y_test, y_pred_vc)
conf = confusion_matrix(y_test, y_pred_vc)
acc_vtc = accuracy_score(y_test, y_pred_vc)

print(
    f"Accuracy Score of Voting Classifier is : {acc_vtc}",
    f"Confusion Matrix : \n{conf}",
    f"Classification Report : \n{clf_report}",
    sep = "\n",
)

In [None]:
# One row per model: display name and its test-set accuracy from this run.
models = pd.DataFrame(
    [
        ('Logistic Regression', acc_lr),
        ('KNN', acc_knn),
        ('Decision Tree Classifier', acc_dtc),
        ('Random Forest Classifier', acc_rd_clf),
        ('Ada Boost Classifier', acc_ada),
        ('Gradient Boosting Classifier', acc_gb),
        ('XgBoost', acc_xgb),
        ('Cat Boost', acc_cat),
        ('Extra Trees Classifier', acc_etc),
        ('LGBM', acc_lgbm),
        ('Voting Classifier', acc_vtc),
    ],
    columns = ['Model', 'Score'],
)

# Show the table ranked from best to worst accuracy.
models.sort_values(by = 'Score', ascending = False)

In [None]:
# Horizontal bar chart comparing the models' test accuracies.
# The chart reads the `models` summary frame built from this run's accuracy
# values instead of a hand-typed list of scores, so the figure always matches
# the numbers reported above. `models` is no longer rebound to a plain list
# here, and `plt` comes from the import cell at the top of the notebook.
ranked = models.sort_values(by='Score', ascending=False)
models_sorted = ranked['Model'].tolist()
scores_sorted = ranked['Score'].tolist()

# One colour per bar.
colors = ['darkred', 'indianred', 'salmon', 'coral', 'khaki', 'mediumaquamarine', 'lightskyblue',
          'steelblue', 'slateblue', 'violet', 'crimson']

fig, ax = plt.subplots(figsize=(10, 6))

# Create horizontal bar chart
ax.barh(models_sorted, scores_sorted, color=colors[:len(models_sorted)])

# Set chart title and axis labels
ax.set_title('Models Comparison', fontsize=18)
ax.set_xlabel('Score', fontsize=14)
ax.set_ylabel('Model', fontsize=14)

# Set y-axis tick font size
ax.tick_params(axis='y', labelsize=12)

# Invert y-axis to show highest score at the top
ax.invert_yaxis()

# Write the percentage just to the right of each bar
for i, v in enumerate(scores_sorted):
    ax.text(v + 0.005, i, f'{v*100:.2f}%', color='black', fontsize=12, va='center')

# Zoom the x-axis onto the range of the scores, leaving room on the right
# for the percentage labels.
x_lower = max(0.0, min(scores_sorted) - 0.03)
x_upper = max(scores_sorted) + 0.04
ax.set_xlim([x_lower, x_upper])
plt.show()