In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
df = pd.read_csv('datasets/hotel_booking.csv')
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78290 entries, 0 to 78289
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     78290 non-null  int64  
 1   hotel                           78290 non-null  object 
 2   lead_time                       78290 non-null  int64  
 3   arrival_date_month              78290 non-null  object 
 4   stays_in_weekend_nights         78290 non-null  int64  
 5   stays_in_week_nights            78290 non-null  int64  
 6   adults                          78290 non-null  int64  
 7   children                        78287 non-null  float64
 8   babies                          78290 non-null  int64  
 9   meal                            78290 non-null  object 
 10  country                         78290 non-null  object 
 11  previous_cancellations          78290 non-null  int64  
 12  previous_bookings_not_canceled  

Unnamed: 0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,78290.0,78290.0,78290.0,78290.0,78290.0,78287.0,78290.0,78290.0,78290.0,78290.0,78290.0,78290.0,78290.0,78290.0
mean,0.405812,109.260135,0.882156,2.43791,1.838958,0.089312,0.008673,0.108698,0.174646,0.202286,2.86315,98.154606,0.065104,0.510001
std,0.491052,113.690202,0.986962,1.864397,0.615962,0.369761,0.104877,0.885138,1.738073,0.596063,19.670393,51.826966,0.248054,0.768116
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.38,0.0,0.0
25%,0.0,17.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,0.0,0.0
50%,0.0,71.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0
75%,1.0,169.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,1.0
max,1.0,737.0,16.0,41.0,55.0,10.0,10.0,26.0,72.0,20.0,391.0,5400.0,3.0,5.0


In [5]:
#one hot encoding
categorical_cols = ['hotel', 'arrival_date_month', 'meal', 'country', 'reserved_room_type', 'deposit_type', 'customer_type']
ohe = OneHotEncoder(drop='first', sparse_output=False)
df_encoded = pd.DataFrame(ohe.fit_transform(df[categorical_cols]))
df_encoded.columns = ohe.get_feature_names_out(categorical_cols)
df.drop(columns=categorical_cols, inplace=True)
df = pd.concat([df, df_encoded], axis=1)

In [6]:
#dropping unnecessary fields
df = df.drop(columns=['name', 'email', 'phone-number'])

In [7]:
#handling missing values
df['children'].fillna(0, inplace=True)

In [8]:
#augment features
df['total_stay'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

df['total_guests'] = df['adults'] + df['children'] + df['babies']

df['total_previous_bookings'] = df['previous_cancellations'] + df['previous_bookings_not_canceled']

df['total_cost'] = df['adr'] / df['total_stay']

df['adults_ratio'] = df['adults'] / df['total_guests']

df['cars_to_guests'] = df['required_car_parking_spaces'] / df['total_guests']

df['guests_to_cost'] = df['total_guests'] + df['total_cost']





In [9]:
#drop nonsensical entries

df = df[df['total_stay'] != 0]
df = df[df['total_guests'] != 0]

In [10]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 77640 entries, 0 to 78289
Data columns (total 55 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     77640 non-null  int64  
 1   lead_time                       77640 non-null  int64  
 2   stays_in_weekend_nights         77640 non-null  int64  
 3   stays_in_week_nights            77640 non-null  int64  
 4   adults                          77640 non-null  int64  
 5   children                        77640 non-null  float64
 6   babies                          77640 non-null  int64  
 7   previous_cancellations          77640 non-null  int64  
 8   previous_bookings_not_canceled  77640 non-null  int64  
 9   booking_changes                 77640 non-null  int64  
 10  days_in_waiting_list            77640 non-null  int64  
 11  adr                             77640 non-null  float64
 12  required_car_parking_spaces     77640

Unnamed: 0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,total_stay,total_guests,total_previous_bookings,total_cost,adults_ratio,cars_to_guests,guests_to_cost
count,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,...,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0
mean,0.408836,109.889451,0.887751,2.454019,1.842658,0.089567,0.008707,0.109493,0.174948,0.199459,...,0.005165,0.730848,0.223712,3.34177,1.940933,0.284441,43.10667,0.971302,0.036596,45.047603
std,0.491622,113.770453,0.984905,1.850976,0.612659,0.37025,0.105131,0.888699,1.742873,0.581036,...,0.071682,0.443522,0.416734,2.49011,0.738676,2.08723,40.015224,0.110159,0.150646,40.067175
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,-0.638,0.0,0.0,1.0
25%,0.0,17.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,2.0,0.0,21.0375,1.0,0.0,23.0
50%,0.0,72.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,3.0,2.0,0.0,32.5,1.0,0.0,34.5
75%,1.0,170.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,4.0,2.0,0.0,54.11,1.0,0.0,56.1
max,1.0,709.0,16.0,40.0,55.0,10.0,10.0,26.0,72.0,17.0,...,1.0,1.0,1.0,56.0,55.0,78.0,5400.0,1.0,2.0,5402.0


## Standardizing features

In [11]:
numerical_cols = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'babies', 'children', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr','required_car_parking_spaces','total_of_special_requests', 'total_stay', 'total_guests', 'total_previous_bookings','total_cost','adults_ratio', 'cars_to_guests', 'guests_to_cost' ]
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [12]:
import numpy as np
# Check for NaN or inf values
nan_mask = df.isna()


inf_mask = df.applymap(np.isinf)

nan_or_inf_mask = nan_mask | inf_mask

df[nan_or_inf_mask.any(axis=1)]

  inf_mask = df.applymap(np.isinf)


Unnamed: 0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,total_stay,total_guests,total_previous_bookings,total_cost,adults_ratio,cars_to_guests,guests_to_cost


## SVD

In [13]:
import numpy as np
U, s, Vt = np.linalg.svd(df, full_matrices=False)

In [14]:
first_component = np.abs(Vt[0, :])

top_features_indices = np.argsort(first_component)[-8:]

top_feature_names = df.columns[top_features_indices]

top_feature_names

Index(['children', 'lead_time', 'adr', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'total_stay', 'guests_to_cost', 'total_cost'],
      dtype='object')