In [103]:
# Step 1: Collect and Explore Data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

# Load the dataset
csv_path = "C:/Users/admin/Documents/Muya/hotel_bookings.csv"
df = pd.read_csv(csv_path)

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

# Display missing values (count of nulls per column)
print(df.isnull().sum())

# Handle missing values
# 'company' is dropped outright rather than imputed.
df = df.drop(columns=['company'])

# Assign the filled Series back to the column instead of calling
# fillna(inplace=True) on df[col]: the in-place form operates on an
# intermediate object (chained assignment), emits a FutureWarning on
# pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
df['children'] = df['children'].fillna(df['children'].median())  # numeric -> median
df['country'] = df['country'].fillna(df['country'].mode()[0])    # categorical -> most frequent value
df['agent'] = df['agent'].fillna(0)                              # missing agent id -> 0

# Step 2: Select Features and Target
# Numeric columns that are standardised and fed to the model.
cols_to_scale = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
                 'adults', 'children', 'babies', 'previous_cancellations', 'previous_bookings_not_canceled',
                 'booking_changes', 'agent', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
                 'total_of_special_requests']

# NOTE(review): X and y were never defined in this cell, so it only ran when
# stale variables happened to exist in the notebook kernel. They are now
# built explicitly from df. The feature set (the scaled numeric columns) and
# the target ('is_canceled') are assumptions -- confirm against the intended
# model inputs.
X = df[cols_to_scale]
y = df['is_canceled']

# Step 3: Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
# Fit the scaler on the training rows only and reuse those statistics for the
# test rows; fitting on the full dataset before splitting leaks test-set
# information into the training features.
# NOTE: df itself is no longer scaled in place. Use
# scaler.transform(df[cols_to_scale]) if a scaled copy of the full frame is
# needed elsewhere.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols_to_scale, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols_to_scale, index=X_test.index)

# Step 4: Train and Evaluate Models
# Initialize Logistic Regression model
logistic_regression_model = LogisticRegression(max_iter=5000, solver='liblinear')

# Train the model
logistic_regression_model.fit(X_train, y_train)

# Evaluate the model
# Hard class labels feed the threshold-based metrics below.
y_pred = logistic_regression_model.predict(X_test)
# Predicted probability of the class with the greater label (column 1 of
# predict_proba) feeds the ROC AUC, which is a ranking metric.
y_score = logistic_regression_model.predict_proba(X_test)[:, 1]

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC must be computed from continuous scores; passing the 0/1
# predictions collapses the ROC curve to a single operating point and
# misreports the model's ranking quality.
roc_auc = roc_auc_score(y_test, y_score)

print("Metrics for Logistic Regression Model:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# Step 5: Save Model
# Persist both fitted artifacts to the current working directory: the scaler
# first, then the trained classifier. The classifier dump stays the final
# expression so the cell still echoes the written model path.
scaler_file = 'scaler.joblib'
joblib.dump(scaler, scaler_file)

model_file = 'best_model_logistic_regression.pkl'
joblib.dump(logistic_regression_model, model_file)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

['best_model_logistic_regression.pkl']