In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib
import os

In [2]:
directory = "C:/Users/HP/Documents/Hotel"
if not os.path.exists(directory):
    os.makedirs(directory)

In [3]:
# Load the dataset
csv_path = "C:/Users/HP/Documents/Hotel/hotel_bookings.csv"
df = pd.read_csv(csv_path)

In [4]:
# Explore the Dataset
print(df.head())
print(df.info())
print(df.describe())

          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0        342               2015               July   
1  Resort Hotel            0        737               2015               July   
2  Resort Hotel            0          7               2015               July   
3  Resort Hotel            0         13               2015               July   
4  Resort Hotel            0         14               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                        27                          1   
1                        27                          1   
2                        27                          1   
3                        27                          1   
4                        27                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  deposit_type  \
0                        0                     0       2  ...    No Deposit   
1     

In [5]:
# Missing Values
print(df.isnull().sum())

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [6]:
# Handle Missing Values
# Fill missing values in 'children' with 0
df['children'].fillna(0, inplace=True)

# Fill missing values in 'country' with a default value
df['country'].fillna('Unknown', inplace=True)

# Drop 'agent' and 'company' columns due to a large number of missing values
df.drop(['agent', 'company'], axis=1, inplace=True)

# Verify that missing values have been handled
print(df.isnull().sum())

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests   

In [7]:
# Preprocess Categorical Variables
columns_to_encode = ['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type']
df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

# Verify the changes
print(df_encoded.head())

   is_canceled  lead_time  arrival_date_year  arrival_date_week_number  \
0            0        342               2015                        27   
1            0        737               2015                        27   
2            0          7               2015                        27   
3            0         13               2015                        27   
4            0         14               2015                        27   

   arrival_date_day_of_month  stays_in_weekend_nights  stays_in_week_nights  \
0                          1                        0                     0   
1                          1                        0                     0   
2                          1                        0                     1   
3                          1                        0                     1   
4                          1                        0                     2   

   adults  children  babies  ...  assigned_room_type_H  assigned_room_type_I  \


In [8]:
# Handle Missing Values in Encoded DataFrame
# Assuming 'children' is a numeric feature, fill missing values with the mean
df_encoded['children'].fillna(df_encoded['children'].mean(), inplace=True)

# Handle missing values in 'country_Unknown' (if it exists)
if 'country_Unknown' in df_encoded.columns:
    df_encoded['country_Unknown'].fillna(0, inplace=True)

# Drop any remaining rows with missing values
df_encoded.dropna(inplace=True)

# Display the data types to confirm all are numeric
print(df_encoded.dtypes)

is_canceled                      int64
lead_time                        int64
arrival_date_year                int64
arrival_date_week_number         int64
arrival_date_day_of_month        int64
                                 ...  
deposit_type_Non Refund          uint8
deposit_type_Refundable          uint8
customer_type_Group              uint8
customer_type_Transient          uint8
customer_type_Transient-Party    uint8
Length: 249, dtype: object


In [9]:
# Split the Data into Features (X) and Target Variable (y)
X = df_encoded.drop('is_canceled', axis=1)
y = df_encoded['is_canceled']

# Verify the shapes of X and y
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (119390, 248)
y shape: (119390,)


In [10]:
# Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the shapes of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (95512, 248) (95512,)
Testing set shape: (23878, 248) (23878,)


In [11]:
# Print data types of the features in the training set
print(X_train.dtypes)


lead_time                        int64
arrival_date_year                int64
arrival_date_week_number         int64
arrival_date_day_of_month        int64
stays_in_weekend_nights          int64
                                 ...  
deposit_type_Non Refund          uint8
deposit_type_Refundable          uint8
customer_type_Group              uint8
customer_type_Transient          uint8
customer_type_Transient-Party    uint8
Length: 248, dtype: object


In [12]:
# Print data type of the target variable
print(y_train.dtypes)

print(y_train.unique())

int64
[1 0]


In [13]:
# Select categorical columns after splitting
categorical_columns = X_train.select_dtypes(include=['object']).columns

In [14]:
# One-hot encode categorical variables in both training and testing sets
X_train_encoded = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)

In [15]:
# Align the datasets to make sure they have the same columns
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)

In [16]:
# Initialize the Random Forest model
random_forest_model = RandomForestClassifier(random_state=42)

In [17]:
# Fit the model to the training data
random_forest_model.fit(X_train_encoded, y_train)

In [18]:
# Make Predictions
predictions = random_forest_model.predict(X_test_encoded)

In [19]:
# Evaluate Model Performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

# Other metrics
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

Accuracy: 0.9999162408911969
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14907
           1       1.00      1.00      1.00      8971

    accuracy                           1.00     23878
   macro avg       1.00      1.00      1.00     23878
weighted avg       1.00      1.00      1.00     23878

[[14907     0]
 [    2  8969]]


In [20]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter search space
param_distributions = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
}


# Use RandomizedSearchCV
randomized_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions,
    n_iter=5,  
    cv=5,
    random_state=42,
    n_jobs=-1
)

# Perform the randomized search
randomized_search.fit(X_train_encoded, y_train)

# Best hyperparameters
best_params = randomized_search.best_params_
print(f'Best Hyperparameters: {best_params}')


Best Hyperparameters: {'n_estimators': 100, 'max_depth': None}


In [21]:
# Access the best model directly from RandomizedSearchCV
best_model = randomized_search.best_estimator_

# Make predictions with the best model
best_predictions = best_model.predict(X_test_encoded)

# Print the best model details 
print(best_model)

RandomForestClassifier(random_state=42)


In [22]:
# Save the model
model_path = "C:/Users/HP/Documents/Hotel/best_random_forest_model.pkl"
try:
    joblib.dump(best_model, model_path)
    print("Model saved successfully.")
except Exception as e:
    print("Error saving the model:", str(e))

Model saved successfully.
