In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
# Read the hotel booking CSV into a DataFrame and print its column summary.
csv_path = 'datasets/hotel_booking.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78290 entries, 0 to 78289
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     78290 non-null  int64  
 1   hotel                           78290 non-null  object 
 2   lead_time                       78290 non-null  int64  
 3   arrival_date_month              78290 non-null  object 
 4   stays_in_weekend_nights         78290 non-null  int64  
 5   stays_in_week_nights            78290 non-null  int64  
 6   adults                          78290 non-null  int64  
 7   children                        78287 non-null  float64
 8   babies                          78290 non-null  int64  
 9   meal                            78290 non-null  object 
 10  country                         78290 non-null  object 
 11  previous_cancellations          78290 non-null  int64  
 12  previous_bookings_not_canceled  

In [12]:
#one hot encoding
# Replace each categorical column with indicator columns.
categorical_cols = ['hotel', 'arrival_date_month', 'meal', 'country', 'reserved_room_type', 'deposit_type', 'customer_type']

# drop='first' omits the first level of every feature; sparse_output=False
# makes fit_transform return a dense array that can be wrapped in a DataFrame.
# NOTE(review): the encoder is fit on the full frame; if a train/test split
# happens later, consider fitting on the training rows only.
ohe = OneHotEncoder(drop='first', sparse_output=False)
encoded_values = ohe.fit_transform(df[categorical_cols])

# Reuse df's own index so the concat below aligns row-for-row. With a fresh
# RangeIndex, any earlier filtering/re-indexing of df would misalign the rows
# and silently introduce NaNs.
df_encoded = pd.DataFrame(
    encoded_values,
    columns=ohe.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Build the new frame in one expression instead of dropping columns in place.
df = pd.concat([df.drop(columns=categorical_cols), df_encoded], axis=1)

In [13]:
#dropping unnecessary fields
# These three columns are removed from the frame before feature engineering.
unneeded_cols = ['name', 'email', 'phone-number']
df = df.drop(unneeded_cols, axis=1)

In [14]:
#handling missing values
# Missing 'children' values are treated as 0.
# Assign the filled column back rather than calling fillna(inplace=True) on the
# df['children'] selection: that chained in-place form raises a FutureWarning
# in pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
df['children'] = df['children'].fillna(0)

In [15]:
#augment features
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise without producing inf or NaN.

    Zero denominators are masked to NaN before dividing, so no inf can
    appear; every NaN in the result (including 0/0 rows) is then replaced
    with 0.
    """
    return numerator.div(denominator.where(denominator != 0)).fillna(0)


# Simple totals used by the ratios below.
df['total_stay'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

df['total_guests'] = df['adults'] + df['children'] + df['babies']

df['total_previous_bookings'] = df['previous_cancellations'] + df['previous_bookings_not_canceled']

# Rows with no previous bookings get a cancel rate of 0 instead of NaN (0/0).
df['cancel_rate'] = _safe_ratio(df['previous_cancellations'], df['total_previous_bookings'])

# Cost of the whole stay = rate per night * number of nights. The previous
# division produced a per-night-squared quantity and inf for zero-night stays.
# NOTE(review): assumes 'adr' is an average daily (per-night) rate — confirm.
df['total_cost'] = df['adr'] * df['total_stay']

# Bookings with zero guests would otherwise yield NaN/inf in these ratios.
df['adults_ratio'] = _safe_ratio(df['adults'], df['total_guests'])

df['cars_to_guests'] = _safe_ratio(df['required_car_parking_spaces'], df['total_guests'])

# "X_to_Y" is a ratio (as in cars_to_guests above); the previous '+' added a
# head count to a monetary amount. Zero-cost rows map to 0.
df['guests_to_cost'] = _safe_ratio(df['total_guests'], df['total_cost'])





In [16]:
# Print the column summary (names, non-null counts, dtypes) of the current df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78290 entries, 0 to 78289
Data columns (total 56 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     78290 non-null  int64  
 1   lead_time                       78290 non-null  int64  
 2   stays_in_weekend_nights         78290 non-null  int64  
 3   stays_in_week_nights            78290 non-null  int64  
 4   adults                          78290 non-null  int64  
 5   children                        78290 non-null  float64
 6   babies                          78290 non-null  int64  
 7   previous_cancellations          78290 non-null  int64  
 8   previous_bookings_not_canceled  78290 non-null  int64  
 9   booking_changes                 78290 non-null  int64  
 10  days_in_waiting_list            78290 non-null  int64  
 11  adr                             78290 non-null  float64
 12  required_car_parking_spaces     