In [1]:
!pip install --upgrade tensorflow



In [2]:
import numpy as np
import pandas as pd

In [3]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

In [4]:
import keras
print(keras.__version__)

3.8.0


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# loading the dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DL/7.project1/will_not_travel_again/data/train.csv', engine='pyarrow')
data

Unnamed: 0,user,user_location_country,user_location_region,user_location_city,destination_distance,search_date,is_mobile,is_package,channel,search_count,...,n_adults,n_children,n_rooms,destination,destination_type,hotel_continent,hotel_country,hotel_market,hotel_category,is_booking
0,461899,3,50,5703,,2013-01-07 00:00:02,0,0,9,1,...,2,1,1,669,3,2,50,212,41,0
1,13796,66,174,21177,5713.6206,2013-01-07 00:00:06,0,0,9,3,...,1,0,1,8821,1,6,17,30,58,0
2,1128575,205,155,14703,795.7298,2013-01-07 00:00:06,0,0,9,1,...,1,0,1,25064,6,2,50,1230,91,0
3,1080476,69,761,41949,,2013-01-07 00:00:17,0,1,9,1,...,2,0,1,7635,3,2,50,675,10,0
4,1080476,69,761,41949,,2013-01-07 00:00:23,0,1,9,1,...,2,0,1,7635,3,2,50,675,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7622995,672784,66,174,9821,2440.0044,2014-11-30 23:59:18,0,0,9,1,...,2,0,1,12666,5,4,131,167,82,0
7622996,202712,0,444,2681,,2014-11-30 23:59:30,1,0,9,1,...,2,0,1,20456,1,3,104,60,46,0
7622997,733416,66,174,42538,236.7748,2014-11-30 23:59:38,0,0,9,1,...,2,0,1,8250,1,2,50,628,1,0
7622998,1007109,119,0,9622,,2014-11-30 23:59:38,1,1,9,1,...,1,0,1,8822,1,3,130,91,62,0


In [8]:
# splitting the dataset
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size = 0.2)

In [9]:
# Removing the 'user' column from train and test datasets
del train['user']
del test['user']

In [10]:
# Checking for missing values in the train dataset
print('training dataset missing values', train.isna().sum())
print('test dataset missing values', test.isna().sum())

training dataset missing values user_location_country          0
user_location_region           0
user_location_city             0
destination_distance     2180504
search_date                    0
is_mobile                      0
is_package                     0
channel                        0
search_count                   0
checkIn_date               11430
checkOut_date              11430
n_adults                       0
n_children                     0
n_rooms                        0
destination                    0
destination_type               0
hotel_continent                0
hotel_country                  0
hotel_market                   0
hotel_category                 0
is_booking                     0
dtype: int64
test dataset missing values user_location_country         0
user_location_region          0
user_location_city            0
destination_distance     546703
search_date                   0
is_mobile                     0
is_package                    0
channel   

In [11]:
# checking for rows with missing check-in or check-out dates in the train set
null_check = train[train['checkIn_date'].isnull() | train['checkOut_date'].isnull()]
print(null_check['is_booking'].value_counts(normalize=True))

is_booking
0    1.0
Name: proportion, dtype: float64


In [12]:
train = train.dropna(subset=['checkIn_date', 'checkOut_date'])

In [13]:
train.isna().mean()['destination_distance']

0.3574210157106081

In [14]:
# filling missing destination_distance values with unseen data like 0
train['destination_distance'] = train['destination_distance'].fillna(0)
test['destination_distance'] =  test['destination_distance'].fillna(0)

In [15]:
print('training dataset missing values', train.isna().sum())
print('test dataset missing values', test.isna().sum())

training dataset missing values user_location_country    0
user_location_region     0
user_location_city       0
destination_distance     0
search_date              0
is_mobile                0
is_package               0
channel                  0
search_count             0
checkIn_date             0
checkOut_date            0
n_adults                 0
n_children               0
n_rooms                  0
destination              0
destination_type         0
hotel_continent          0
hotel_country            0
hotel_market             0
hotel_category           0
is_booking               0
dtype: int64
test dataset missing values user_location_country       0
user_location_region        0
user_location_city          0
destination_distance        0
search_date                 0
is_mobile                   0
is_package                  0
channel                     0
search_count                0
checkIn_date             2862
checkOut_date            2862
n_adults                    0


In [16]:
# converting date columns to datetime format for train and test sets
train['search_date'] = pd.to_datetime(train['search_date'])
train['checkIn_date'] = pd.to_datetime(train['checkIn_date'])
train['checkOut_date'] = pd.to_datetime(train['checkOut_date'])

test['search_date'] = pd.to_datetime(test['search_date'])
test['checkIn_date'] = pd.to_datetime(test['checkIn_date'])
test['checkOut_date'] = pd.to_datetime(test['checkOut_date'])

In [17]:
# calculating duration of stay and days between search and check-in for train and test sets
duration = train['checkOut_date'] - train['checkIn_date']
train['duration'] = duration.dt.days
duration = test['checkOut_date'] - test['checkIn_date']
test['duration'] = duration.dt.days

days_between = train['checkIn_date'] - train['search_date']
train['days_between'] = days_between.dt.days
days_between = test['checkIn_date'] - test['search_date']
test['days_between'] = days_between.dt.days

In [18]:
# extracting date-related features from search and check-in dates for train and test sets
train['search_date_hour'] = train['search_date'].dt.hour
train['search_date_dayofweek'] = train['search_date'].dt.dayofweek
train['checkIn_date_dayofweek'] = train['checkIn_date'].dt.dayofweek
train['search_date_month'] = train['search_date'].dt.month
train['checkIn_date_month'] = train['checkIn_date'].dt.month

test['search_date_hour'] = test['search_date'].dt.hour
test['search_date_dayofweek'] = test['search_date'].dt.dayofweek
test['checkIn_date_dayofweek'] = test['checkIn_date'].dt.dayofweek
test['search_date_month'] = test['search_date'].dt.month
test['checkIn_date_month'] = test['checkIn_date'].dt.month

In [19]:
del train['search_date']
del train['checkIn_date']
del train['checkOut_date']

del test['search_date']
del test['checkIn_date']
del test['checkOut_date']

In [20]:
is_booked = train[train['is_booking'] == 1]
not_booked = train[train['is_booking'] == 0]

In [21]:
train

Unnamed: 0,user_location_country,user_location_region,user_location_city,destination_distance,is_mobile,is_package,channel,search_count,n_adults,n_children,...,hotel_market,hotel_category,is_booking,duration,days_between,search_date_hour,search_date_dayofweek,checkIn_date_dayofweek,search_date_month,checkIn_date_month
4504796,66,348,41309,3602.1955,1,0,0,1,2,0,...,27,86,0,5,158,14,3,1,2,7
1984325,69,811,16287,0.0000,0,1,9,2,1,0,...,368,48,0,4,41,12,5,5,7,8
7097908,66,174,54089,394.7190,0,1,9,3,3,0,...,628,1,0,2,6,10,1,1,10,10
3672845,66,293,53078,175.4371,0,0,9,2,2,2,...,647,48,0,3,24,20,4,1,12,12
1949883,231,49,8351,0.0000,1,1,9,1,2,0,...,69,82,0,8,64,10,2,4,7,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6514183,70,50,26680,0.0000,0,1,2,1,2,0,...,83,62,0,2,32,8,4,2,8,9
3379068,57,342,5021,0.0000,0,1,9,4,1,0,...,701,25,0,2,73,9,6,3,11,1
1559409,66,467,42586,952.8549,0,0,9,3,1,2,...,503,20,0,1,21,19,1,2,5,6
7087115,77,824,22772,3123.2324,0,1,5,4,2,0,...,185,31,0,7,53,10,0,5,10,12


In [22]:
# comparing search hour distribution for booked and not booked data
import plotly.graph_objects as go

not_booked_hours = not_booked['search_date_hour'].value_counts(normalize=True).sort_index()
is_booked_hours = is_booked['search_date_hour'].value_counts(normalize=True).sort_index()

trace_not_booked = go.Bar(y=not_booked_hours.values, name='Not Booked')
trace_is_booked = go.Bar(y=is_booked_hours.values, name='Booked')

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis=dict(title='Search Hour', tickangle=45, automargin=True),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('/content/drive/MyDrive/Colab Notebooks/DL/7.project1/will_not_travel_again/notebook/search_hour.json')

In [39]:
# categorizing search time into bins and displaying distribution for booked vs not booked data
not_booked.loc[:, 'search_time_bin'] = not_booked['search_date_hour'].apply(lambda x: 1 if 5 <= x <= 15 else 0)
is_booked.loc[:, 'search_time_bin'] = is_booked['search_date_hour'].apply(lambda x: 1 if 5 <= x <= 15 else 0)

print("Not Booked Search Time Bin Distribution:")
print(not_booked['search_time_bin'].value_counts(normalize=True))

print("Is Booked Search Time Bin Distribution:")
print(is_booked['search_time_bin'].value_counts(normalize=True))

Not Booked Search Time Bin Distribution:
search_time_bin
1    0.520531
0    0.479469
Name: proportion, dtype: float64
Is Booked Search Time Bin Distribution:
search_time_bin
1    0.567044
0    0.432956
Name: proportion, dtype: float64


In [24]:
# visualizing the day-of-week booking distribution with bar plot for booked vs not booked data
booked_counts = is_booked['checkIn_date_dayofweek'].value_counts(normalize=True).sort_index()
not_booked_counts = not_booked['checkIn_date_dayofweek'].value_counts(normalize=True).sort_index()

trace_not_booked = go.Bar(y = not_booked_counts, name='Not Booked')
trace_is_booked = go.Bar(y = booked_counts, name='Booked')

ticktext = ['دوشنبه', 'سه‌شنبه', 'چهارشنبه', 'پنج‌شنبه', 'جمعه', 'شنبه', 'یکشنبه']

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis=dict(title='Day of Week', tickangle=45, automargin=True,
               tickvals = [0,1,2,3,4,5,6], ticktext= ticktext
 ),
    yaxis=dict(title='Frequency'),
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('/content/drive/MyDrive/Colab Notebooks/DL/7.project1/will_not_travel_again/notebook/checkIn_day.json')

In [25]:
# visualizing the month-wise booking distribution with bar plot for booked vs not booked data
booked_counts = is_booked['checkIn_date_month'].value_counts(normalize=True).sort_index()
not_booked_counts = not_booked['checkIn_date_month'].value_counts(normalize=True).sort_index()

trace_not_booked = go.Bar(y = not_booked_counts, name='Not Booked')
trace_is_booked = go.Bar(y = booked_counts, name='Booked')

data = [trace_is_booked, trace_not_booked]

ticktext = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

layout = go.Layout(
    xaxis=dict(title='Month', tickangle=45, automargin=True,
             ticktext = ticktext ,tickvals = np.arange(0,12)),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('/content/drive/MyDrive/Colab Notebooks/DL/7.project1/will_not_travel_again/notebook/checkIn_date_month.json')

In [38]:
# mapping months to seasons and displaying season distribution for booked and not booked data
def month_to_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

not_booked.loc[:, 'season'] = not_booked['search_date_month'].apply(month_to_season)
is_booked.loc[:, 'season'] = is_booked['search_date_month'].apply(month_to_season)

print("Not Booked Season Distribution:")
print(not_booked['season'].value_counts(normalize=True))

print("\nIs Booked Season Distribution:")
print(is_booked['season'].value_counts(normalize=True))

Not Booked Season Distribution:
season
Summer    0.266666
Spring    0.265571
Fall      0.262055
Winter    0.205708
Name: proportion, dtype: float64

Is Booked Season Distribution:
season
Fall      0.271526
Spring    0.263881
Summer    0.251958
Winter    0.212635
Name: proportion, dtype: float64


In [27]:
# visualizing the frequency of 'days_between' for booked and not booked entries
booked_counts = is_booked['days_between'].value_counts(normalize=True).sort_index()
not_booked_counts = not_booked['days_between'].value_counts(normalize=True).sort_index()

trace_not_booked = go.Scatter(y = not_booked_counts , name='Not Booked', opacity = 0.5)
trace_is_booked = go.Scatter(y = booked_counts , name='Booked')

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis=dict(title='Days between search and checking time', tickangle=45, automargin=True),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('/content/drive/MyDrive/Colab Notebooks/DL/7.project1/will_not_travel_again/notebook/days_between.json')

In [28]:
# visualizing the frequency of 'duration' for booked and not booked entries
booked_counts = is_booked['duration'].value_counts(normalize=True).sort_index()
not_booked_counts = not_booked['duration'].value_counts(normalize=True).sort_index()

trace_not_booked = go.Scatter(y = not_booked_counts , name='Not Booked', opacity = 0.5)
trace_is_booked = go.Scatter(y = booked_counts , name='Booked')

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis=dict(title='Length of Stay', tickangle=45, automargin=True),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('/content/drive/MyDrive/Colab Notebooks/DL/7.project1/will_not_travel_again/notebook/duration_frequency.json')

In [29]:
# creating 'is_abroad' feature to check if the user location is different from hotel country
train['is_abroad'] = (train['user_location_country'] != train['hotel_country']).astype(int)
test['is_abroad'] = (test['user_location_country'] != test['hotel_country']).astype(int)

In [30]:
del train['user_location_country']
del train['hotel_country']

del test['user_location_country']
del test['hotel_country']

In [31]:
# applying one-hot encoding to categorical columns and ensuring consistency between train and test datasets
dummy_columns = ['channel']

train = pd.get_dummies(train, columns = dummy_columns)
test = pd.get_dummies(test, columns = dummy_columns)

test['channel_10'] = 0

In [32]:
# balancing the dataset by dropping samples from the majority class
train = train.drop(index = train[train['is_booking'] == False].sample(frac =.905).index)
train = train.astype(np.float32)
test = test.astype(np.float32)

In [33]:
# splitting the dataset into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(train.drop(columns = ['is_booking'])
                                                      ,train['is_booking'], test_size = 0.05)

In [34]:
X_test = test.drop(columns = ['is_booking'])
y_test = test['is_booking']

In [35]:
# building a neural network model with batch normalization and dropout
model = keras.models.Sequential()
model.add(keras.layers.Input(shape=(X_train.shape[1],)))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(100, activation='relu'))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(50, activation='relu'))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [36]:
# compiling the model
model.compile(optimizer = keras.optimizers.Adam(),
              loss = keras.losses.BinaryCrossentropy(),
              metrics = ['auc'])

In [40]:
# training the model
epochs = 100
BATCH_SIZE = 4096

history = model.fit(X_train, y_train,
                    batch_size = BATCH_SIZE,
                    epochs = epochs,
                    validation_data = (X_valid, y_valid))

Epoch 1/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - auc: 0.7522 - loss: 0.5678 - val_auc: 0.7627 - val_loss: 0.5600
Epoch 2/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - auc: 0.7586 - loss: 0.5623 - val_auc: 0.7643 - val_loss: 0.5585
Epoch 3/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 39ms/step - auc: 0.7601 - loss: 0.5609 - val_auc: 0.7654 - val_loss: 0.5574
Epoch 4/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - auc: 0.7629 - loss: 0.5585 - val_auc: 0.7664 - val_loss: 0.5564
Epoch 5/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - auc: 0.7638 - loss: 0.5577 - val_auc: 0.7669 - val_loss: 0.5560
Epoch 6/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 34ms/step - auc: 0.7641 - loss: 0.5569 - val_auc: 0.7673 - val_loss: 0.5553
Epoch 7/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9

In [41]:
# predicting on the train and test sets
y_train_pred = (model.predict(X_train, batch_size = 128) > 0.5).astype(int)
y_test_pred = (model.predict(X_test, batch_size = 128) > 0.5).astype(int)

[1m7851/7851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step
[1m11911/11911[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2ms/step


In [42]:
# model evaluation
from sklearn.metrics import roc_auc_score

score_train = roc_auc_score(y_train, y_train_pred)
score_test = roc_auc_score(y_test, y_test_pred)

print(f"Model ROC AUC score on training set: {round(score_train, 3)}%")
print(f"Model ROC AUC on test set: {round(score_test, 3)}%")

Model ROC AUC score on training set: 0.705%
Model ROC AUC on test set: 0.699%
