In [2]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Print current working directory and list of files
# (startup diagnostic: shows where the process is running and what it can see).
print("Current Working Directory:", os.getcwd())
print("Files in Current Directory:", os.listdir())


@st.cache_resource
def _load_model(path):
    """Load the serialized model stored at ``path``.

    Streamlit re-executes the whole script on every widget interaction;
    ``st.cache_resource`` keeps one deserialized model per distinct ``path``
    so the file is not read and unpickled again on each rerun.

    Security note: ``joblib.load`` unpickles the file, which can execute
    arbitrary code. Only point this at model files you trust.
    """
    return joblib.load(path)


# Load the model
# HOTEL_MODEL_PATH lets another machine or deployment supply its own location;
# when it is unset the original hard-coded path is used, so existing setups
# keep working unchanged.
model_path = os.environ.get(
    "HOTEL_MODEL_PATH",
    "C:/Users/HP/Documents/Hotel/best_random_forest_model.pkl",
)
loaded_model = _load_model(model_path)

# Streamlit App
# Page title rendered at the top of the app.
st.title("Hotel Booking Prediction with Random Forest Model")

# Upload CSV data through Streamlit
# Only .csv files are accepted by the uploader. Everything that needs the data
# is nested under the `if` below, so nothing else runs until a file is provided.
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file is not None:
    # Parse the uploaded CSV into the DataFrame used by the rest of the app.
    df = pd.read_csv(uploaded_file)

    # Sidebar for user input
    # Builds `user_input`: one entry per column of the upload except the
    # target "is_canceled", keyed by column name.
    st.sidebar.title("User Input for Prediction")
    user_input = {}
    for column in df.columns:
        if column == "is_canceled":
            continue

        if pd.api.types.is_numeric_dtype(df[column]):
            # Numeric feature: a number widget pre-filled with the column mean,
            # so the collected value is a number rather than a string.
            # An all-missing column has a NaN mean, which cannot be used as a
            # widget default, so fall back to 0.0 in that case.
            column_mean = df[column].mean()
            default_value = 0.0 if pd.isna(column_mean) else float(column_mean)
            user_input[column] = st.sidebar.number_input(f"Enter {column}", value=default_value)
        else:
            # Non-numeric feature: .mean() raises on text columns, so offer the
            # values observed in the upload instead. value_counts() drops
            # missing values and sorts by frequency, so the most common value
            # is the first option and therefore the default selection.
            observed_values = df[column].value_counts().index.tolist()
            if observed_values:
                user_input[column] = st.sidebar.selectbox(f"Enter {column}", observed_values)
            else:
                # Column is entirely empty: nothing to choose from.
                user_input[column] = st.sidebar.text_input(f"Enter {column}", "")

    # Preview section: the leading rows of the uploaded data.
    st.subheader("Dataset Preview")
    preview_rows = df.head()
    st.write(preview_rows)

    # Summary section: descriptive statistics, then the count of missing
    # entries in each column.
    st.subheader("Dataset Summary")
    summary_stats = df.describe()
    missing_per_column = df.isna().sum()
    st.write(summary_stats)
    st.write("Missing Values:")
    st.write(missing_per_column)

    # Preprocess Categorical Variables
    # Categorical columns the app one-hot encodes. pd.get_dummies raises
    # KeyError when asked to encode a column that is absent, so the list is
    # narrowed to what the upload actually contains and the user is told what
    # was skipped.
    expected_categoricals = ['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type']
    missing_categoricals = [name for name in expected_categoricals if name not in df.columns]
    if missing_categoricals:
        st.warning(
            "Columns not found in the uploaded file and skipped during encoding: "
            + ", ".join(missing_categoricals)
        )
    columns_to_encode = [name for name in expected_categoricals if name in df.columns]

    if columns_to_encode:
        df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)
    else:
        # Nothing to encode; work on a copy so the raw upload stays untouched.
        df_encoded = df.copy()

    # Verify the changes
    st.subheader("One-Hot Encoded DataFrame")
    st.write(df_encoded.head())

    # Handle Missing Values in Encoded DataFrame
    # Results are assigned back to the column instead of calling
    # fillna(inplace=True) on a column selection: that chained in-place form
    # acts on a temporary object under pandas Copy-on-Write and would leave
    # df_encoded unchanged.
    # Assuming 'children' is a numeric feature, fill missing values with the mean
    if 'children' in df_encoded.columns:
        df_encoded['children'] = df_encoded['children'].fillna(df_encoded['children'].mean())

    # Handle missing values in 'country_Unknown' (if it exists)
    if 'country_Unknown' in df_encoded.columns:
        df_encoded['country_Unknown'] = df_encoded['country_Unknown'].fillna(0)

    # Drop any remaining rows with missing values
    df_encoded = df_encoded.dropna()

    # Display the data types to confirm all are numeric
    st.subheader("Final Encoded DataFrame")
    st.write(df_encoded.dtypes)

    # Visualization: Bar chart of the target variable distribution
    st.subheader("Target Variable Distribution")
    if "is_canceled" in df_encoded.columns:
        target_distribution = df_encoded["is_canceled"].value_counts()
        st.bar_chart(target_distribution)
    else:
        # An upload without the label column has nothing to plot here.
        st.info('The uploaded data has no "is_canceled" column, so there is no target distribution to show.')

    # Visualization: Correlation heatmap
    st.subheader("Correlation Heatmap")
    # Correlation is only defined for numeric/boolean columns. Restricting the
    # frame explicitly avoids the error recent pandas versions raise when
    # text columns are still present after encoding.
    correlation_matrix = df_encoded.select_dtypes(include=["number", "bool"]).corr()
    st.write(correlation_matrix.style.background_gradient(cmap='coolwarm'))

    # Visualization: Histogram for a numeric feature
    st.subheader("Histogram for a Numeric Feature")
    numeric_columns = df_encoded.select_dtypes(include=[np.number]).columns
    numeric_feature = st.selectbox("Select a numeric feature for the histogram", numeric_columns)
    if numeric_feature is not None:
        # Draw on an explicit figure and hand it to st.pyplot: calling
        # st.pyplot() with no argument depends on matplotlib's global figure,
        # which Streamlit has deprecated.
        hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
        hist_ax.hist(df_encoded[numeric_feature], bins=30, color='blue', alpha=0.7)
        hist_ax.set_xlabel(numeric_feature)
        hist_ax.set_ylabel("Frequency")
        st.pyplot(hist_fig)
        # The script reruns on every interaction; close the figure so open
        # figures do not accumulate in memory.
        plt.close(hist_fig)
    else:
        # selectbox yields None when there are no options to choose from.
        st.info("No numeric columns are available for a histogram.")

    # Visualization: Count plot for a categorical feature
    st.subheader("Count Plot for a Categorical Feature")
    categorical_columns = df_encoded.select_dtypes(include=['object']).columns
    categorical_feature = st.selectbox("Select a categorical feature for the count plot", categorical_columns)
    if categorical_feature is not None:
        count_fig, count_ax = plt.subplots(figsize=(10, 6))
        sns.countplot(x=categorical_feature, data=df_encoded, palette="Set2", ax=count_ax)
        count_ax.set_xlabel(categorical_feature)
        count_ax.set_ylabel("Count")
        st.pyplot(count_fig)
        plt.close(count_fig)
    else:
        # After one-hot encoding there may be no text columns left to count.
        st.info("No categorical (object-typed) columns remain after one-hot encoding.")

    # Prediction using the Random Forest model
    st.subheader("Random Forest Model Prediction")

    # Select features for prediction
    # NOTE(review): this selection is not consumed anywhere in the visible
    # code; the fitted model determines which columns are used. The widget is
    # kept so the page layout is unchanged - confirm whether it can be removed.
    features = st.multiselect("Select features for prediction", df_encoded.columns)

    if st.button("Predict"):
        # User input for prediction: a one-row frame built from the sidebar values.
        user_input_df = pd.DataFrame([user_input])

        # Only encode categorical columns that are actually present in the input.
        encode_columns = [name for name in columns_to_encode if name in user_input_df.columns]

        # Sidebar text fields deliver strings, so every non-categorical field
        # is converted to a number. Values that cannot be parsed become NaN and
        # are reported below instead of being passed to the model.
        for column in user_input_df.columns:
            if column not in encode_columns:
                user_input_df[column] = pd.to_numeric(user_input_df[column], errors="coerce")

        # One-hot encode WITHOUT drop_first: a single row has exactly one level
        # per categorical column, and drop_first would discard that only level,
        # erasing the user's choice. A level that was the dropped reference
        # category in the dataset has no matching feature column and is simply
        # removed by the reindex below.
        if encode_columns:
            user_input_encoded = pd.get_dummies(user_input_df, columns=encode_columns)
        else:
            user_input_encoded = user_input_df

        # Align user input with the model features.
        # Prefer the column names recorded on the fitted model (exact set and
        # order); otherwise fall back to the encoded dataset's columns without
        # the target. DataFrame.align returns (left, right) - keeping the left
        # result would substitute the whole dataset for the user's row - so a
        # plain reindex of the user's row is used instead.
        feature_columns = getattr(loaded_model, "feature_names_in_", None)
        if feature_columns is None:
            feature_columns = df_encoded.columns.drop("is_canceled", errors="ignore")
        user_input_encoded = user_input_encoded.reindex(columns=list(feature_columns), fill_value=0)

        invalid_fields = user_input_encoded.columns[user_input_encoded.isna().any()].tolist()
        if invalid_fields:
            st.error("Please enter numeric values for: " + ", ".join(str(name) for name in invalid_fields))
        else:
            # Make prediction
            prediction = loaded_model.predict(user_input_encoded)
            st.write("Predicted Cancellation:", prediction[0])

    # Evaluation metrics
    # Scores the loaded model against the labels in the uploaded data.
    # NOTE(review): if the upload overlaps the data the model was trained on,
    # these numbers will be optimistic - confirm what is expected to be uploaded.
    st.subheader("Model Evaluation Metrics")
    if "is_canceled" not in df_encoded.columns:
        st.info('The uploaded data has no "is_canceled" column, so the model cannot be evaluated.')
    elif df_encoded.empty:
        st.warning("No rows remain after dropping missing values, so there is nothing to evaluate.")
    else:
        y_test = df_encoded["is_canceled"]
        X_test_encoded = df_encoded.drop("is_canceled", axis=1)

        # One-hot columns depend on which category values appear in the upload,
        # so they can differ from the columns the model was fitted on. When the
        # model records its feature names, conform to that exact set and order:
        # columns absent from the upload are filled with 0, extras are dropped.
        model_features = getattr(loaded_model, "feature_names_in_", None)
        if model_features is not None:
            X_test_encoded = X_test_encoded.reindex(columns=list(model_features), fill_value=0)

        predictions = loaded_model.predict(X_test_encoded)

        st.write("Accuracy:", accuracy_score(y_test, predictions))
        st.write("Classification Report:")
        # The report is a whitespace-aligned plain-text table. st.write would
        # render the string as Markdown and collapse the alignment; st.text
        # shows it verbatim in a fixed-width font.
        st.text(classification_report(y_test, predictions))
        st.write("Confusion Matrix:")
        st.write(confusion_matrix(y_test, predictions))

Current Working Directory: C:\Users\HP\Documents\Hotel
Files in Current Directory: ['.ipynb_checkpoints', 'best_random_forest_model.pkl', 'hotel_app.ipynb', 'hotel_bookings.csv', 'Hotel_training.py', 'hotel_training_code.ipynb', 'str_app.ipynb', 'Untitled.ipynb', 'Untitled1.ipynb']


2024-01-16 09:30:59.287 
  command:

    streamlit run C:\Users\HP\New folder\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
