In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OrdinalEncoder    
# from sklearn.svm import SVC

In [None]:
# Read the data from the CSV file
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('hotel_bookings.csv')
# Preview the first rows to sanity-check the load
df.head()

In [None]:
# Print (rows, columns), then the per-column dtype and non-null summary
print(df.shape)
df.info()

In [None]:
# Checking datatypes (returned as a Series indexed by column name)
df.dtypes

In [None]:
# List every column name as a plain Python list
df.columns.tolist()

## Preprocessing

In [None]:
# Count nulls per column and keep only the columns that actually contain some
null_counts = df.isnull().sum()
has_nulls = null_counts > 0
missing_values = null_counts[has_nulls]
missing_values

In [None]:
# Calculate the percentage of missing values per column.
# Recomputed from df rather than rescaling the existing `missing_values`, so
# re-running this cell gives the same percentages instead of multiplying them again.
null_counts = df.isnull().sum()
missing_values = null_counts[null_counts > 0] * 100 / len(df)
missing_values

In [None]:
# Remove the 'agent' and 'company' columns
# NOTE(review): presumably dropped because of their share of missing values — confirm
sparse_columns = ['agent', 'company']
df = df.drop(columns=sparse_columns)

In [None]:
# Per-column null counts after removing 'agent' and 'company'
df.isnull().sum()

In [None]:
# Drop the rows with missing values in the column 'country'
# Only 'country' is checked here; nulls in other columns are handled separately.
df = df.dropna(subset=['country'])

In [None]:
# Treat a missing 'children' value as "no children"
children_filled = df['children'].fillna(0)
# assign() overwrites the existing column in place of its old position
df = df.assign(children=children_filled)

In [None]:
# Remove bookings with no guests at all (adults + children + babies == 0)
guest_total = df['adults'] + df['children'] + df['babies']
df = df[guest_total != 0]

In [None]:
# Confirm that no missing values remain anywhere in the frame
# (prints nothing if some are still present)
total_missing = df.isnull().sum().sum()
if total_missing == 0:
    print('No missing values')

In [None]:
# (rows, columns) after the missing-value and zero-guest clean-up
df.shape

In [None]:
# Parse 'reservation_status_date' into a datetime column
status_dates = pd.to_datetime(df['reservation_status_date'])
df = df.assign(reservation_status_date=status_dates)
df['reservation_status_date']

In [None]:
# Combine the year / month / day-of-month columns into one datetime column 'arrival_date'
date_parts = ['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']
# Build "<year>-<month>-<day>" strings and let pandas infer the format
date_text = (
    df['arrival_date_year'].astype(str)
    + '-' + df['arrival_date_month'].astype(str)
    + '-' + df['arrival_date_day_of_month'].astype(str)
)
# The new column is appended last; the three source columns are no longer needed
df = df.assign(arrival_date=pd.to_datetime(date_text)).drop(columns=date_parts)
df

In [None]:
# Move the last column ('arrival_date') so that it becomes the 4th column
*leading_cols, last_col = df.columns.tolist()
cols = leading_cols[:3] + [last_col] + leading_cols[3:]
df = df[cols]
df


In [None]:
# Collapse 'required_car_parking_spaces' into two categories: 0 and 1
parking = df['required_car_parking_spaces']
print(parking.value_counts())
# mask() replaces every value > 0 with 1 and leaves all other values untouched
df = df.assign(required_car_parking_spaces=parking.mask(parking > 0, 1))
df['required_car_parking_spaces'].value_counts()

In [None]:
# (rows, columns) of the cleaned frame
df.shape

In [None]:
# Save the cleaned data frame to a CSV file
# Written before the target is split off, so the file still contains 'is_canceled'.
# index=False keeps the (non-contiguous) row labels out of the file.
df.to_csv('hotel_bookings_cleaned.csv', index=False)

In [None]:
# Separate the target: y_data holds 'is_canceled', df keeps only the features
target_column = 'is_canceled'
y_data = df[target_column]
df = df.drop(columns=[target_column])

In [None]:
# Split the features into a numerical frame and a categorical (object-dtype) frame.
# Columns of any other dtype (e.g. the datetime columns created earlier) end up in neither.
numeric_dtypes = ['float64', 'int64']
df_num = df.select_dtypes(include=numeric_dtypes)
df_cat = df.select_dtypes(include=['object'])

In [None]:
# Shapes of the numerical and categorical frames, in that order
df_num.shape, df_cat.shape

In [None]:
# Show the names of the numerical columns
df_num.columns.tolist()

In [None]:
# Keep the categorical column names in a list and display it
categorical_features = list(df_cat.columns)
categorical_features

In [None]:
# Show the number of distinct values in each categorical column
df_cat.nunique()

In [None]:
# Relabel df_num's index as 0..n-1 (row order is unchanged; this is a reset, not a sort).
# The encoded categorical frame is later built from a bare array and therefore gets a
# default 0..n-1 index; the column-wise concat aligns on index labels, so df_num must match.
df_num = df_num.reset_index(drop=True)
df_num

In [None]:
# Convert the categorical columns to numerical columns using the OrdinalEncoder
# The encoder is fit on df_cat and the result is a plain array (column names are lost here).
# NOTE(review): the numeric codes impose an order chosen by the encoder, not by the
# domain — confirm this is acceptable for the downstream model.
ordinal_encoder = OrdinalEncoder()
x_data_encoded = ordinal_encoder.fit_transform(df_cat)

In [None]:
# Wrap the encoded array in a DataFrame, restoring the original column names.
# No index is passed, so the frame gets a default 0..n-1 index.
df_cat_encoded = pd.DataFrame(x_data_encoded, columns = df_cat.columns)
# Sanity check: per-column NaN count of the encoded frame
df_cat_encoded.isna().sum()

In [None]:
# Merge the numerical and categorical columns into a single data frame
# axis=1 places the frames side by side and aligns rows on index labels, so both
# inputs must carry the same index; mismatched labels would produce NaN-filled rows.
x_data = pd.concat([df_num, df_cat_encoded], axis=1) 
x_data

## Feature Selection using SFS (QDA)

In [None]:
# Perform feature selection using Sequential Feature Selector on QDA
# NOTE(review): check that none of the encoded columns restates the target
# (e.g. a booking-status column) — such leakage would dominate the selection.
qda = QuadraticDiscriminantAnalysis()
sfs = SequentialFeatureSelector(qda, direction='forward', n_features_to_select=7)     # forward selection that keeps exactly 7 features (no tol-based stopping is configured)
sfs.fit(x_data, y_data)
# Reduce x_data to the selected columns and show which ones were chosen
x_data_s = sfs.transform(x_data)
sfs.get_feature_names_out()

In [None]:
# Compare QDA accuracy on the full feature set against the SFS-selected subset.
# NOTE(review): each score is computed on the same rows the model was fit on,
# so these are in-sample accuracies rather than held-out estimates.
for label, features in (('without', x_data), ('with', x_data_s)):
    qda.fit(features, y_data)
    print(f"Score {label} feature selection: {qda.score(features, y_data)}")

## Feature Selection using SFS (SVM)

In [None]:
# # Perform feature selection using Sequential Feature Selector on SVM
# svm = SVC()
# sfs = SequentialFeatureSelector(svm, direction='forward', tol=0.01)     # add a new feature as long as 0.01 or more improvement
# sfs.fit(x_data, y_data)
# x_data_s = sfs.transform(x_data)
# sfs.get_feature_names_out()

## Feature Selection using Chi-Square

In [None]:
# Create a function to calculate the chi-square test
def chi_square_test(df, col1, col2):
    """Return the p-value of a chi-square independence test between two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    col1, col2 : str
        Names of the two columns to cross-tabulate.

    Returns
    -------
    float
        p-value reported by ``scipy.stats.chi2_contingency`` for the
        contingency table of ``col1`` against ``col2``.
    """
    observed = pd.crosstab(df[col1], df[col2])
    # The result unpacks as (statistic, p-value, dof, expected); only the p-value is used
    return chi2_contingency(observed)[1]

# Perform the chi-square test for each categorical column with the target column 'is_canceled'.
# 'is_canceled' was removed from df when y_data was extracted, so looking it up in df
# raises KeyError. Re-attach the target to the categorical frame instead; df_cat and
# y_data share the same row labels, so assign() lines them up row by row.
df_chi = df_cat.assign(is_canceled=y_data)
p_values = {
    col: chi_square_test(df_chi, col, 'is_canceled')
    for col in df_cat.columns
}

# Reject the null hypothesis if the p-value is less than 0.05
significant_features = [k for k, v in p_values.items() if v < 0.05]
significant_features
