# Notebook cell In [2]: Streamlit app for evaluating a hotel booking cancellation model.
import io

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Deserialize the previously trained classifier from disk. The load happens
# before any page content is emitted, so a missing/corrupt file fails early.
_MODEL_FILE = 'best_model_logistic_regression.pkl'
logistic_regression_model = joblib.load(_MODEL_FILE)

# Page header: headline followed by a one-line tagline.
_PAGE_HEADLINE = "🏨 Hotel Booking Demand Prediction"
_PAGE_TAGLINE = "Uncover the future of hotel bookings! Input your data and let the magic happen."
st.title(_PAGE_HEADLINE)
st.write(_PAGE_TAGLINE)

# Upload dataset
uploaded_file = st.file_uploader("📊 Upload your dataset (CSV format)", type=["csv"])
if uploaded_file is not None:
    # Load dataset
    df = pd.read_csv(uploaded_file)

    # Display basic information about the dataset.
    # DataFrame.info() writes its summary to a buffer and returns None, so the
    # text has to be captured explicitly; passing the return value to
    # st.write would only render "None".
    st.subheader("Dataset Information")
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    st.text(info_buffer.getvalue())

    # Handle missing values.
    # Assign the filled Series back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and does not modify `df` once
    # pandas Copy-on-Write is active.
    df = df.drop(columns=['company'], errors='ignore')  # tolerate uploads without it
    df['children'] = df['children'].fillna(df['children'].median())
    country_mode = df['country'].mode()
    if not country_mode.empty:  # mode() is empty when every value is missing
        df['country'] = df['country'].fillna(country_mode.iloc[0])
    df['agent'] = df['agent'].fillna(0)

    # Feature scaling
    # NOTE(review): the scaler is fitted on the uploaded data, so the scaling
    # statistics come from this file. If the saved model was trained with a
    # different fitted scaler, that scaler should be persisted and reused here
    # -- confirm against the training code.
    scaler = StandardScaler()
    cols_to_scale = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
                     'adults', 'children', 'babies', 'previous_cancellations', 'previous_bookings_not_canceled',
                     'booking_changes', 'agent', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
                     'total_of_special_requests']
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

    # Separate features (X) and target variable (y)
    X = df.drop(columns=['is_canceled', 'reservation_status', 'reservation_status_date'])
    y = df['is_canceled']

    # Encode categorical variables
    X = pd.get_dummies(X, columns=['hotel', 'meal', 'country', 'market_segment',
                                   'distribution_channel', 'reserved_room_type', 'assigned_room_type',
                                   'deposit_type', 'customer_type', 'arrival_date_month'])

    # One-hot columns depend on which categories occur in the uploaded file.
    # When the model recorded its training columns, align to them: categories
    # missing from the upload become all-zero columns, unseen ones are dropped,
    # and the column order matches what the estimator expects.
    expected_columns = getattr(logistic_regression_model, 'feature_names_in_', None)
    if expected_columns is not None:
        X = X.reindex(columns=expected_columns, fill_value=0)

    # Hold out the same 20% slice as before (fixed seed); the model is already
    # trained, so the training portion of the split is not needed.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Make predictions using Logistic Regression model
    y_pred = logistic_regression_model.predict(X_test)

    # ROC AUC ranks examples by score, so feed it positive-class probabilities
    # when the estimator exposes them; hard labels are only a fallback.
    if hasattr(logistic_regression_model, 'predict_proba'):
        y_score = logistic_regression_model.predict_proba(X_test)[:, 1]
    else:
        y_score = y_pred

    # Display evaluation metrics
    st.subheader("Model Evaluation Metrics")
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    st.write(f"Accuracy: {accuracy:.4f}")
    st.write(f"Precision: {precision:.4f}")
    st.write(f"Recall: {recall:.4f}")
    st.write(f"F1 Score: {f1:.4f}")
    st.write(f"ROC AUC: {roc_auc:.4f}")

    # Visualizations
    st.subheader("Data Visualizations")

    # Pairplot
    # sns.pairplot returns a PairGrid; st.pyplot is documented to take the
    # underlying matplotlib Figure.
    # NOTE(review): this plots every numeric column against every other over
    # all rows, which may be slow on large uploads.
    pair_grid = sns.pairplot(df, hue="is_canceled")
    st.pyplot(pair_grid.figure)

    # Correlation Heatmap
    # Restrict to numeric columns: `df` still contains the raw string columns,
    # and DataFrame.corr() raises on those in pandas >= 2.0.
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
    numeric_df = df.select_dtypes(include='number')
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=heatmap_ax)
    st.pyplot(heatmap_fig)  # pass the Figure explicitly, not the pyplot module

    # Bar Chart - Booking Cancellation Distribution
    st.subheader("Booking Cancellation Distribution")
    booking_cancelation_counts = df['is_canceled'].value_counts()
    st.bar_chart(booking_cancelation_counts)

# Sidebar: heading plus two integer sliders whose current values are kept in
# module-level variables.
_sidebar = st.sidebar
_sidebar.title("User Input Section")

# Lead time slider: 0-500, starting at 100.
user_lead_time = _sidebar.slider(
    "Lead Time (days)",
    min_value=0,
    max_value=500,
    value=100,
)
# Nights-stayed slider: 0-20, starting at 5.
user_stays_nights = _sidebar.slider(
    "Stays in Nights",
    min_value=0,
    max_value=20,
    value=5,
)
