# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
import joblib

# Step 1: Collect and Explore Data
# 1.1 Choose a dataset
# Load the dataset
csv_path = "C:/Users/admin/Documents/Muya/hotel_bookings.csv"
df = pd.read_csv(csv_path)

# Display basic information about the dataset.
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly instead of being wrapped in print() (which would emit an
# extra "None" line).
df.info()

# Display missing values
print(df.isnull().sum())

# Handle missing values:
#   - 'company'  -> column dropped entirely
#   - 'children' -> filled with the column median
#   - 'country'  -> filled with the most frequent value
#   - 'agent'    -> filled with 0
# The filled Series are assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that chained form is deprecated
# in pandas 2.x and does not modify `df` when Copy-on-Write is enabled.
df = df.drop(columns=['company'])
df['children'] = df['children'].fillna(df['children'].median())
df['country'] = df['country'].fillna(df['country'].mode()[0])
df['agent'] = df['agent'].fillna(0)

print("After handling missing values:")
print(df.head())

# Step 2: Prepare features
# Separate features (X) and target variable (y)
X = df.drop(columns=['is_canceled', 'reservation_status', 'reservation_status_date'])
y = df['is_canceled']

# Encode categorical variables
X = pd.get_dummies(X, columns=['hotel', 'meal', 'country', 'market_segment',
                               'distribution_channel', 'reserved_room_type', 'assigned_room_type',
                               'deposit_type', 'customer_type', 'arrival_date_month'])

# Display the columns after encoding
print("After encoding categorical variables:")
print(X.columns)

# Save the feature names in a file.
# The names are taken from the encoded design matrix X, i.e. exactly the
# columns (and column order) the classifiers are fitted on. Taking them from
# `df` would include the target and the dropped status columns and would miss
# the one-hot encoded columns.
feature_names = X.columns.tolist()
with open('feature_names.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(feature_names))

# Step 3: Train models
# 3.1 Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so that test-set statistics do not leak into training.
scaler = StandardScaler()
cols_to_scale = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
                 'adults', 'children', 'babies', 'previous_cancellations', 'previous_bookings_not_canceled',
                 'booking_changes', 'agent', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
                 'total_of_special_requests']
# Work on explicit copies so the column assignments below do not write into
# (or warn about) views of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

print("After feature scaling:")
print(X_train.head())

# 3.2 Initialize classifiers
# NOTE(review): SVC with default settings can be very slow to fit on large
# training sets -- confirm the runtime is acceptable for this dataset.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),  # Increase max_iter
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# 3.3 Train models
for classifier in classifiers.values():
    classifier.fit(X_train, y_train)

# Step 4: Evaluate models
# For every fitted classifier, predict on the held-out test set and print
# accuracy, precision, recall, F1 and ROC AUC.
for name, classifier in classifiers.items():
    y_pred = classifier.predict(X_test)

    # ROC AUC is a ranking metric and needs continuous scores; feeding it the
    # hard 0/1 predictions collapses the curve to a single operating point.
    # Use the positive-class probability when the estimator exposes it, and
    # fall back to the decision function otherwise (e.g. SVC without
    # probability=True).
    if hasattr(classifier, "predict_proba"):
        y_score = classifier.predict_proba(X_test)[:, 1]
    else:
        y_score = classifier.decision_function(X_test)

    # Calculate and print evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    print(f"Metrics for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print("\n")

# Step 5: Save model
# Pick the fitted logistic-regression classifier out of the classifier table
# and serialize it to disk with joblib.
# NOTE(review): only the classifier is written here; the fitted scaler is not
# saved alongside it -- confirm whether inference code needs it.
best_model_logistic_regression = classifiers['Logistic Regression']
joblib.dump(
    value=best_model_logistic_regression,
    filename='best_model_logistic_regression.pkl',
)
