# In [None]:  (notebook cell marker left over from a Jupyter export)
import streamlit as st
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score, roc_curve, roc_auc_score, confusion_matrix

# Import PyCaret
from pycaret.classification import setup as setup_classification, compare_models as compare_models_classification
from pycaret.regression import setup as setup_regression, compare_models as compare_models_regression


def _render_figure(fig):
    """Send a matplotlib figure to Streamlit, then close it to free its memory."""
    st.pyplot(fig)
    plt.close(fig)


def perform_eda(data):
    """Render an optional exploratory-data-analysis section in the Streamlit page.

    Always shows the column dtypes. When the user ticks "Perform EDA?" and
    selects at least one column, draws for the numeric columns among the
    selection: one histogram and one box plot per column, a scatter plot for
    every pair of columns, and a correlation heatmap.

    Args:
        data: DataFrame to explore. It is only read, never modified.

    Returns:
        None. All output is written to the Streamlit page.
    """
    st.header("Exploratory Data Analysis (EDA):-")
    st.subheader("Data Types")
    st.write(data.dtypes)

    if not st.checkbox("Perform EDA?"):
        return

    visualiz_columns = st.multiselect("Select the columns for visualization:", options=data.columns)
    if not visualiz_columns:
        # Nothing selected yet: there is nothing to plot, so stop here instead
        # of reaching the plotting sections without a column list.
        return

    numeric_columns = data[visualiz_columns].select_dtypes(include=['number']).columns

    st.subheader("Histograms")
    for column in numeric_columns:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(data=data, x=column, kde=True, ax=ax)
        ax.set_title(f"Histogram For numerical data - {column}")
        ax.set_xlabel(column)
        ax.set_ylabel("Frequency of values")
        _render_figure(fig)

    st.subheader("Boxplot")
    for column in numeric_columns:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.boxplot(data=data, y=column, ax=ax)
        ax.set_title(f"Box Plot For numerical data - {column}")
        ax.set_ylabel(column)
        _render_figure(fig)

    st.subheader("Scatterplot")
    # Each unordered pair of numeric columns is plotted exactly once.
    for position, x_column in enumerate(numeric_columns):
        for y_column in numeric_columns[position + 1:]:
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.scatterplot(data=data, x=x_column, y=y_column, ax=ax)
            ax.set_title(f"Scatter Plot VS {x_column} and {y_column}")
            ax.set_xlabel(x_column)
            ax.set_ylabel(y_column)
            _render_figure(fig)

    st.subheader("Correlation Matrix")
    if len(numeric_columns) >= 2:
        # Correlate only the numeric columns; non-numeric selections cannot be
        # correlated and make DataFrame.corr() fail on recent pandas versions.
        correlation_matrix = data[numeric_columns].corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
        ax.set_title("Heatmap For correlation coefficients between numerical features")
        _render_figure(fig)


def encode_categorical(data):
    """Encode the object-dtype columns of ``data`` using a user-chosen method.

    Shows a Streamlit radio button offering "Label Encoding" (each column is
    replaced by integer codes from a fresh ``LabelEncoder``) or
    "One-Hot Encoding" (``pandas.get_dummies`` on the object columns).

    Args:
        data: DataFrame whose object-dtype columns should be encoded.

    Returns:
        A new DataFrame with the encoding applied. The input frame is left
        untouched in both modes.
    """
    categorical_features = data.select_dtypes(include=['object']).columns
    encoding_method = st.radio("Select encoding method for categorical data:", ("Label Encoding", "One-Hot Encoding"))

    if encoding_method == "Label Encoding":
        # Work on a copy so both branches behave the same way for the caller:
        # the argument is never modified, the result is always a new frame.
        encoded = data.copy()
        for col in categorical_features:
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
        return encoded

    if encoding_method == "One-Hot Encoding":
        return pd.get_dummies(data, columns=categorical_features)

    return data


def drop_duplicates(data):
    """Offer a checkbox that removes duplicate rows from ``data``.

    When the checkbox is ticked, duplicate rows are dropped from the given
    DataFrame in place. The same DataFrame object is returned either way.
    """
    st.header("Select Drop Duplicate Rows?")

    if st.checkbox("Drop duplicate", key="drop_duplicates_checkbox"):
        # In-place on purpose: the caller's frame itself loses the duplicates.
        data.drop_duplicates(inplace=True)

    return data


def choose_variables(data):
    """Ask the user which columns are the predictors and which is the response.

    Returns:
        A tuple ``(X_variables, Y_variable)``: the multiselect result for the
        independent variables and the selectbox result for the dependent one.
    """
    st.header("Choose X and Y variables")
    column_choices = data.columns
    independent = st.multiselect("Select independent variables (X):", options=column_choices)
    dependent = st.selectbox("Select dependent variable (Y):", options=column_choices)
    return independent, dependent


def normalize_features(X, scaler_type='standard'):
    """Scale the feature matrix ``X`` with the requested scikit-learn scaler.

    Args:
        X: Feature data accepted by the scaler's ``fit_transform``.
        scaler_type: ``'standard'`` for ``StandardScaler`` or ``'minmax'`` for
            ``MinMaxScaler``.

    Returns:
        The result of ``fit_transform`` on ``X``.

    Raises:
        ValueError: If ``scaler_type`` is not one of the supported names.
    """
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        # Fail with a clear message instead of hitting an unbound ``scaler``.
        raise ValueError(
            f"Unknown scaler_type {scaler_type!r}; expected 'standard' or 'minmax'."
        )

    return scaler.fit_transform(X)


def _read_uploaded_table(uploaded_file):
    """Load an uploaded file into a DataFrame, choosing the reader by extension."""
    # Match on the extension, not a substring, so "csv_report.xlsx" is read as Excel.
    if uploaded_file.name.lower().endswith(".csv"):
        return pd.read_csv(uploaded_file)
    return pd.read_excel(uploaded_file)


def _impute_column(data, col, strategy):
    """Fill the missing values of one column in place with a SimpleImputer strategy."""
    column = data[col]
    # Skip columns with nothing to fill, and columns that are entirely missing:
    # the imputer has no value to learn from those and would discard the column.
    if not column.isna().any() or column.isna().all():
        return
    imputer = SimpleImputer(strategy=strategy, missing_values=np.nan)
    # fit_transform returns an (n, 1) array; flatten it back to a single column.
    data[col] = imputer.fit_transform(column.to_numpy().reshape(-1, 1)).ravel()


def _fill_missing_values(data, numeric_strategy, categorical_choice):
    """Impute numeric and object columns of ``data`` in place per the user's choices."""
    for col in data.select_dtypes(include='number').columns:
        _impute_column(data, col, numeric_strategy)

    for col in data.select_dtypes(include=['object']).columns:
        if categorical_choice == 'most frequent':
            _impute_column(data, col, 'most_frequent')
        elif categorical_choice == 'additional class':
            data[col] = data[col].fillna('Missing')


def main():
    """Run the Streamlit AutoML page.

    Flow: upload a CSV/Excel file, pick the target column, optionally explore
    the data, impute missing values, optionally drop duplicates and unwanted
    features, encode the categorical features, then let PyCaret compare models
    for the detected task and display the best one.

    The task (classification vs. regression) is decided from the target's dtype
    as uploaded, and the target is kept out of the feature encoding, so a
    categorical target is still recognised as a classification problem and
    still exists under its own name when PyCaret is called.
    """
    with st.sidebar:
        st.header("Steps to prediction accuracy:-")
        st.text("1- Upload CSV or Excel file ")
        st.text("2- Choose target feature")
        st.text("3- Perform some EDA")
        st.text("4- Handle missing values")
        st.text("5- Drop duplicates")
        st.text("6- Choose X and Y ")
        st.text("7- Split (X, Y) for train/test")
        st.text("8- Encode categorical data ")
        st.text("9- Normalize data")

    dataset = st.file_uploader("Upload CSV or Excel file", type=['csv', 'xlsx'])
    if dataset is None:
        return

    data = _read_uploaded_table(dataset)
    st.write(data.head())
    st.write(data.shape)

    target = st.selectbox("Choose the target variable:", options=data.columns)
    if target is None:
        # A file without columns offers nothing to select or to model.
        st.warning("The uploaded file has no columns to model.")
        return

    # Detect the task from the raw target, before any encoding can turn a
    # categorical target into numbers.
    if pd.api.types.is_numeric_dtype(data[target]):
        task_type = "Regression"
    else:
        task_type = "Classification"

    perform_eda(data)

    # Missing values are handled before encoding, while categorical columns
    # are still object-typed and can be told apart from numeric ones.
    missing_value_num = st.radio("Set missing value for numerical value", ["mean", "median"])
    missing_value_cat = st.radio("Set missing value for categorical value", ['most frequent', 'additional class'])
    _fill_missing_values(data, missing_value_num, missing_value_cat)

    data = drop_duplicates(data)

    # The target must survive to training, so it is not offered for removal.
    removable_columns = [column for column in data.columns if column != target]
    select_columns = st.multiselect("Select features to remove from the dataframe:", options=removable_columns)
    if select_columns:
        data = data.drop(columns=select_columns)

    # NOTE(review): these selections are collected but not used by training,
    # which relies on `target` and all remaining columns — confirm intent.
    X_variables, Y_variable = choose_variables(data)

    # Encode the features only, then put the untouched target back.
    features = encode_categorical(data.drop(columns=[target]))
    features[target] = data[target]
    data = features

    st.subheader("Data Types")
    st.write(data.dtypes)

    st.write(f"Detected Task: {task_type}")

    # Use PyCaret to train models
    if task_type == "Classification":
        run_setup, run_compare = setup_classification, compare_models_classification
    else:
        run_setup, run_compare = setup_regression, compare_models_regression

    st.write(f"Running PyCaret {task_type}")
    # NOTE(review): `silent` appears to have been removed from setup() in
    # PyCaret 3.x — confirm against the installed version.
    run_setup(data=data, target=target, silent=True, session_id=42)
    best_model = run_compare()
    st.write(f"Best {task_type} Model:")
    st.write(best_model)


# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()