In [35]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OrdinalEncoder    
# from sklearn.svm import SVC

In [36]:
# Read the data from the CSV file
df = pd.read_csv('hotel_bookings.csv')
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [37]:
print(df.shape)
df.info()

(119390, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  mea

In [38]:
# Checking datatypes
df.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [39]:
df.columns.tolist()

['hotel',
 'is_canceled',
 'lead_time',
 'arrival_date_year',
 'arrival_date_month',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'reserved_room_type',
 'assigned_room_type',
 'booking_changes',
 'deposit_type',
 'agent',
 'company',
 'days_in_waiting_list',
 'customer_type',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests',
 'reservation_status',
 'reservation_status_date']

## Preprocessing

In [40]:
# Selecting columns with missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values

children         4
country        488
agent        16340
company     112593
dtype: int64

In [41]:
# Calculate the percentage of missing values per column
missing_values = missing_values * 100 / len(df)
missing_values

children     0.003350
country      0.408744
agent       13.686238
company     94.306893
dtype: float64

In [42]:
# Drop the columns agent , company
df.drop(['agent','company'], axis=1, inplace=True)

In [43]:
df.isnull().sum()

hotel                               0
is_canceled                         0
lead_time                           0
arrival_date_year                   0
arrival_date_month                  0
arrival_date_week_number            0
arrival_date_day_of_month           0
stays_in_weekend_nights             0
stays_in_week_nights                0
adults                              0
children                            4
babies                              0
meal                                0
country                           488
market_segment                      0
distribution_channel                0
is_repeated_guest                   0
previous_cancellations              0
previous_bookings_not_canceled      0
reserved_room_type                  0
assigned_room_type                  0
booking_changes                     0
deposit_type                        0
days_in_waiting_list                0
customer_type                       0
adr                                 0
required_car

In [45]:
# Count the no. of unique values in each column
unique_values = df.nunique()
unique_values

hotel                                2
is_canceled                          2
lead_time                          479
arrival_date_year                    3
arrival_date_month                  12
arrival_date_week_number            53
arrival_date_day_of_month           31
stays_in_weekend_nights             17
stays_in_week_nights                35
adults                              14
children                             5
babies                               5
meal                                 5
country                            177
market_segment                       8
distribution_channel                 5
is_repeated_guest                    2
previous_cancellations              15
previous_bookings_not_canceled      73
reserved_room_type                  10
assigned_room_type                  12
booking_changes                     21
deposit_type                         3
days_in_waiting_list               128
customer_type                        4
adr                      

In [10]:
# Drop the rows with missing values in the column 'country'
df = df.dropna(subset=['country'])

In [11]:
# Replace missing values in the column 'children' with 0
df['children'] = df['children'].fillna(0)

In [12]:
# Drop the rows if the values = 0 in the column 'adult' and 'children' and 'babies'
df = df.drop(df[(df.adults+df.children+df.babies)==0].index)

In [13]:
# Count the columns with missing values
if df.isnull().sum().sum() == 0:
    print('No missing values')

No missing values


In [14]:
df.shape

(118732, 30)

In [15]:
df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])
df['reservation_status_date']

0        2015-07-01
1        2015-07-01
2        2015-07-02
3        2015-07-02
4        2015-07-03
            ...    
119385   2017-09-06
119386   2017-09-07
119387   2017-09-07
119388   2017-09-07
119389   2017-09-07
Name: reservation_status_date, Length: 118732, dtype: datetime64[ns]

In [16]:
# Merge the columns 'arrival_date_year', 'arrival_date_month' and 'arrival_date_day_of_month' into a single column 'arrival_date'
df['arrival_date'] = pd.to_datetime(df.arrival_date_year.astype(str) + '-' + df.arrival_date_month.astype(str) + '-' + df.arrival_date_day_of_month.astype(str))
df.drop(['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month'], axis=1, inplace=True)
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date
0,Resort Hotel,0,342,27,0,0,2,0.0,0,BB,...,3,No Deposit,0,Transient,0.00,0,0,Check-Out,2015-07-01,2015-07-01
1,Resort Hotel,0,737,27,0,0,2,0.0,0,BB,...,4,No Deposit,0,Transient,0.00,0,0,Check-Out,2015-07-01,2015-07-01
2,Resort Hotel,0,7,27,0,1,1,0.0,0,BB,...,0,No Deposit,0,Transient,75.00,0,0,Check-Out,2015-07-02,2015-07-01
3,Resort Hotel,0,13,27,0,1,1,0.0,0,BB,...,0,No Deposit,0,Transient,75.00,0,0,Check-Out,2015-07-02,2015-07-01
4,Resort Hotel,0,14,27,0,2,2,0.0,0,BB,...,0,No Deposit,0,Transient,98.00,0,1,Check-Out,2015-07-03,2015-07-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,35,2,5,2,0.0,0,BB,...,0,No Deposit,0,Transient,96.14,0,0,Check-Out,2017-09-06,2017-08-30
119386,City Hotel,0,102,35,2,5,3,0.0,0,BB,...,0,No Deposit,0,Transient,225.43,0,2,Check-Out,2017-09-07,2017-08-31
119387,City Hotel,0,34,35,2,5,2,0.0,0,BB,...,0,No Deposit,0,Transient,157.71,0,4,Check-Out,2017-09-07,2017-08-31
119388,City Hotel,0,109,35,2,5,2,0.0,0,BB,...,0,No Deposit,0,Transient,104.40,0,0,Check-Out,2017-09-07,2017-08-31


In [17]:
# Move the column 'arrival_date' to the 4th position
cols = df.columns.tolist()
cols = cols[:3] + cols[-1:] + cols[3:-1]
df = df[cols]
df


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015-07-01,27,0,0,2,0.0,0,...,C,3,No Deposit,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015-07-01,27,0,0,2,0.0,0,...,C,4,No Deposit,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015-07-01,27,0,1,1,0.0,0,...,C,0,No Deposit,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015-07-01,27,0,1,1,0.0,0,...,A,0,No Deposit,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015-07-01,27,0,2,2,0.0,0,...,A,0,No Deposit,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017-08-30,35,2,5,2,0.0,0,...,A,0,No Deposit,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017-08-31,35,2,5,3,0.0,0,...,E,0,No Deposit,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017-08-31,35,2,5,2,0.0,0,...,D,0,No Deposit,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017-08-31,35,2,5,2,0.0,0,...,A,0,No Deposit,0,Transient,104.40,0,0,Check-Out,2017-09-07


In [18]:
# Merge the rows of 'required_car_parking_spaces' into two categories: 0 and 1
print(df['required_car_parking_spaces'].value_counts())
df.loc[df['required_car_parking_spaces'] > 0, 'required_car_parking_spaces'] = 1
df['required_car_parking_spaces'].value_counts()

required_car_parking_spaces
0    111429
1      7270
2        28
3         3
8         2
Name: count, dtype: int64


required_car_parking_spaces
0    111429
1      7303
Name: count, dtype: int64

In [19]:
df.shape

(118732, 28)

In [20]:
# Save the cleaned data frame to a CSV file
df.to_csv('hotel_bookings_cleaned.csv', index=False)

In [21]:
# Select the 'is_canceled' column as y_data
y_data = df['is_canceled']
df.drop(['is_canceled'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['is_canceled'], axis=1, inplace=True)


In [22]:
# Split the data into two data frames: one for the numerical columns and one for the categorical columns
df_num = df.select_dtypes(include = ['float64', 'int64'])
df_cat = df.select_dtypes(include = ['object'])

In [23]:
df_num.shape, df_cat.shape

((118732, 15), (118732, 10))

In [24]:
# Show the numerical columns
df_num.columns.tolist()

['lead_time',
 'arrival_date_week_number',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'booking_changes',
 'days_in_waiting_list',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests']

In [25]:
# Show the categorical columns
categorical_features = df_cat.columns.tolist()
categorical_features

['hotel',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'assigned_room_type',
 'deposit_type',
 'customer_type',
 'reservation_status']

In [26]:
# Show the unique values in each categorical column
df_cat.nunique()

hotel                     2
meal                      5
country                 177
market_segment            8
distribution_channel      5
reserved_room_type        9
assigned_room_type       11
deposit_type              3
customer_type             4
reservation_status        3
dtype: int64

In [27]:
# Sort the indices of df_num to start from 0 to n
df_num = df_num.reset_index(drop=True)
df_num

Unnamed: 0,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,342,27,0,0,2,0.0,0,0,0,0,3,0,0.00,0,0
1,737,27,0,0,2,0.0,0,0,0,0,4,0,0.00,0,0
2,7,27,0,1,1,0.0,0,0,0,0,0,0,75.00,0,0
3,13,27,0,1,1,0.0,0,0,0,0,0,0,75.00,0,0
4,14,27,0,2,2,0.0,0,0,0,0,0,0,98.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118727,23,35,2,5,2,0.0,0,0,0,0,0,0,96.14,0,0
118728,102,35,2,5,3,0.0,0,0,0,0,0,0,225.43,0,2
118729,34,35,2,5,2,0.0,0,0,0,0,0,0,157.71,0,4
118730,109,35,2,5,2,0.0,0,0,0,0,0,0,104.40,0,0


In [28]:
# Convert the categorical columns to numerical columns using the OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
x_data_encoded = ordinal_encoder.fit_transform(df_cat)

In [29]:
df_cat_encoded = pd.DataFrame(x_data_encoded, columns = df_cat.columns)
df_cat_encoded.isna().sum()

hotel                   0
meal                    0
country                 0
market_segment          0
distribution_channel    0
reserved_room_type      0
assigned_room_type      0
deposit_type            0
customer_type           0
reservation_status      0
dtype: int64

In [30]:
# Merge the numerical and categorical columns into a single data frame
x_data = pd.concat([df_num, df_cat_encoded], axis=1) 
x_data

Unnamed: 0,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,...,hotel,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status
0,342,27,0,0,2,0.0,0,0,0,0,...,1.0,0.0,135.0,3.0,1.0,2.0,2.0,0.0,2.0,1.0
1,737,27,0,0,2,0.0,0,0,0,0,...,1.0,0.0,135.0,3.0,1.0,2.0,2.0,0.0,2.0,1.0
2,7,27,0,1,1,0.0,0,0,0,0,...,1.0,0.0,59.0,3.0,1.0,0.0,2.0,0.0,2.0,1.0
3,13,27,0,1,1,0.0,0,0,0,0,...,1.0,0.0,59.0,2.0,0.0,0.0,0.0,0.0,2.0,1.0
4,14,27,0,2,2,0.0,0,0,0,0,...,1.0,0.0,59.0,6.0,3.0,0.0,0.0,0.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118727,23,35,2,5,2,0.0,0,0,0,0,...,0.0,0.0,15.0,5.0,3.0,0.0,0.0,0.0,2.0,1.0
118728,102,35,2,5,3,0.0,0,0,0,0,...,0.0,0.0,56.0,6.0,3.0,4.0,4.0,0.0,2.0,1.0
118729,34,35,2,5,2,0.0,0,0,0,0,...,0.0,0.0,43.0,6.0,3.0,3.0,3.0,0.0,2.0,1.0
118730,109,35,2,5,2,0.0,0,0,0,0,...,0.0,0.0,59.0,6.0,3.0,0.0,0.0,0.0,2.0,1.0


## Feature Selection using SFS (QDA)

In [31]:
# Perform feature selection using Sequential Feature Selector on QDA
qda = QuadraticDiscriminantAnalysis()
sfs = SequentialFeatureSelector(qda, direction='forward', n_features_to_select=7)     # add a new feature as long as 0.01 or more improvement
sfs.fit(x_data, y_data)
x_data_s = sfs.transform(x_data)
sfs.get_feature_names_out()

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  return -0.5 * (norm2 + u) + np.log(self.priors_)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  return -0.5 * (norm2 + u) + np.log(self.priors_)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  return -0.5 * (norm2 + u) + np.log(self.priors_)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  return -0.5 * (norm2 + u) + np.log(self.priors_)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  return -0.5 * (norm2 + u) + np.log(self.priors_)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  return -0.5 * (norm2 + u) + np.log(self.priors_)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
 

array(['previous_cancellations', 'previous_bookings_not_canceled',
       'total_of_special_requests', 'hotel', 'meal', 'assigned_room_type',
       'deposit_type'], dtype=object)

In [32]:
# Evaluate the accuracy with and without feature selection
qda.fit(x_data, y_data)
print(f"Score without feature selection: {qda.score(x_data, y_data)}") 
qda.fit(x_data_s, y_data)
print(f"Score with feature selection: {qda.score(x_data_s, y_data)}")

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))


Score without feature selection: 0.628221540949365
Score with feature selection: 0.7665246100461544


  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


## Feature Selection using SFS (SVM)

In [33]:
# # Perform feature selection using Sequential Feature Selector on SVM
# svm = SVC()
# sfs = SequentialFeatureSelector(svm, direction='forward', tol=0.01)     # add a new feature as long as 0.01 or more improvement
# sfs.fit(x_data, y_data)
# x_data_s = sfs.transform(x_data)
# sfs.get_feature_names_out()

## Feature Selection using Chi-Square

In [34]:
# Create a function to calculate the chi-square test
def chi_square_test(df, col1, col2):
    contingency_table = pd.crosstab(df[col1], df[col2])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    return p

# Perform the chi-square test for each categorical column with the target column 'is_canceled'
p_values = {}
for col in df_cat.columns:
    p = chi_square_test(df, col, 'is_canceled')
    p_values[col] = p

# Reject the null hypothesis if the p-value is less than 0.05
significant_features = [k for k, v in p_values.items() if v < 0.05]
significant_features


KeyError: 'is_canceled'

## KNN using MapReduce

In [None]:
# Implement KNN algorithm from scratch to predict the 'is_canceled' column using MapReduce
def mapper(row):
    return row

