# Step 1: Import Required Libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import accuracy_score


# Step 2: Load the Datasets

In [3]:
# Read each CSV once while building the name -> DataFrame lookup
datasets = {
    "Breast Cancer": pd.read_csv('breast_cancer_dataset.csv'),
    "Heart Disease": pd.read_csv('heart-dataset.csv'),
    "Iris": pd.read_csv('iris-dataset.csv'),
    "Wine Quality": pd.read_csv('wine-quality-dataset.csv'),
}

# Keep the individual frames reachable under their own names as well
breast_cancer_df = datasets["Breast Cancer"]
heart_disease_df = datasets["Heart Disease"]
iris_df = datasets["Iris"]
wine_quality_df = datasets["Wine Quality"]


# Step 3: Preprocess the Datasets

In [8]:
def preprocess_dataset(name, df):
    """Clean one dataset, encode its target, split it, and standardize features.

    Args:
        name: Dataset key; one of "Breast Cancer", "Heart Disease", "Iris" or
            "Wine Quality". Selects the dataset-specific cleanup and the
            target column.
        df: DataFrame holding the dataset. It is copied, never modified.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, feature_names)`` where
        the X parts are standardized arrays, the y parts are pandas Series and
        ``feature_names`` is the Index of feature column labels.

    Raises:
        KeyError: If ``name`` is not one of the supported dataset keys.
    """
    target_columns = {
        "Breast Cancer": "Class",
        "Heart Disease": "target",
        "Iris": "Species",
        "Wine Quality": "Wine",
    }
    # Resolve the target first so an unknown name fails before any work is done
    try:
        target = target_columns[name]
    except KeyError:
        raise KeyError(f"Unknown dataset name: {name!r}") from None

    # Work on a copy: the column rename and target encoding below would
    # otherwise modify the caller's DataFrame in place.
    df = df.copy()

    # Remove unnecessary columns
    if name == "Iris":
        df = df.drop(columns=["Id"])  # Remove Id column
    if name == "Breast Cancer":
        # NOTE(review): assumes the file has exactly these 11 columns in this
        # order -- confirm against the CSV.
        df.columns = ["ID", "Clump_Thickness", "Uniformity_Cell_Size", "Uniformity_Cell_Shape",
                      "Marginal_Adhesion", "Single_Epithelial_Cell_Size", "Bare_Nuclei",
                      "Bland_Chromatin", "Normal_Nucleoli", "Mitoses", "Class"]
        df = df.drop(columns=["ID"])  # Remove ID column

    # Encode any non-numeric target (object, string or categorical dtype)
    if not pd.api.types.is_numeric_dtype(df[target]):
        df[target] = LabelEncoder().fit_transform(df[target])

    # Split into features and target
    X = df.drop(columns=[target])
    y = df[target]

    # Split before scaling so the test rows never influence the scaler
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both parts
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, X.columns

# Run every loaded dataset through preprocessing, keyed by its display name
processed_data = {}
for dataset_name, raw_frame in datasets.items():
    processed_data[dataset_name] = preprocess_dataset(dataset_name, raw_frame)


# Step 4: Apply Feature Selection and Train Models

In [9]:
def _logreg_holdout_accuracy(X_train, X_test, y_train, y_test):
    """Fit a fresh LogisticRegression on the train part; return test accuracy."""
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


def evaluate_feature_selection(name, X_train, X_test, y_train, y_test, feature_names, num_features=5):
    """Compare three feature-selection methods with a logistic-regression scorer.

    For SelectKBest (filter), RFE (wrapper) and Lasso (embedded), select a
    feature subset on the training data, train LogisticRegression on that
    subset and measure accuracy on the test data.

    Args:
        name: Dataset name. Not used by the computation; kept so the function
            can be called as ``evaluate_feature_selection(name, *data)``.
        X_train, X_test: 2-D NumPy feature arrays (columns are sliced
            positionally).
        y_train, y_test: Target labels for the two parts.
        feature_names: Column labels, one per feature column. Any sequence
            or pandas Index is accepted.
        num_features: Number of features kept by SelectKBest and RFE, capped
            at the number of available columns. Lasso keeps every feature
            with a non-zero coefficient, so its count can differ.

    Returns:
        Dict mapping "SelectKBest", "RFE" and "Lasso" to a tuple
        ``(accuracy, selected_feature_names)``; the names are always a list.
    """
    # An ndarray supports both boolean-mask and integer-array indexing,
    # regardless of whether a list or a pandas Index was passed in.
    feature_names = np.asarray(feature_names)
    n_select = min(num_features, X_train.shape[1])
    results = {}

    # SelectKBest (Filter Method)
    k_best = SelectKBest(score_func=f_classif, k=n_select)
    X_train_kbest = k_best.fit_transform(X_train, y_train)
    X_test_kbest = k_best.transform(X_test)
    acc_kbest = _logreg_holdout_accuracy(X_train_kbest, X_test_kbest, y_train, y_test)
    results["SelectKBest"] = (acc_kbest, feature_names[k_best.get_support()].tolist())

    # RFE (Wrapper Method)
    rfe = RFE(estimator=LogisticRegression(max_iter=10000), n_features_to_select=n_select)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    acc_rfe = _logreg_holdout_accuracy(X_train_rfe, X_test_rfe, y_train, y_test)
    results["RFE"] = (acc_rfe, feature_names[rfe.get_support()].tolist())

    # Lasso Regression (Embedded Method)
    # NOTE(review): Lasso is a regression model fit here on the class labels;
    # it is used only to see which coefficients are driven to exactly zero.
    lasso = Lasso(alpha=0.01)
    lasso.fit(X_train, y_train)
    kept_columns = np.flatnonzero(lasso.coef_)
    if kept_columns.size > 0:
        acc_lasso = _logreg_holdout_accuracy(
            X_train[:, kept_columns], X_test[:, kept_columns], y_train, y_test
        )
        selected_lasso_features = feature_names[kept_columns].tolist()
    else:
        # Every coefficient was shrunk to zero: nothing to train on
        acc_lasso = 0.0
        selected_lasso_features = []
    results["Lasso"] = (acc_lasso, selected_lasso_features)

    return results

# Score all three feature-selection methods on each preprocessed dataset
evaluation_results = {}
for dataset_name, split_data in processed_data.items():
    evaluation_results[dataset_name] = evaluate_feature_selection(dataset_name, *split_data)


# Step 5: Print the Results

In [12]:
from tabulate import tabulate

# Flatten the nested results into one table row per (dataset, method) pair
table_data = []
for dataset, results in evaluation_results.items():
    table_data.extend(
        [dataset, method, f"{accuracy:.4f}", ", ".join(features)]
        for method, (accuracy, features) in results.items()
    )

# Wrap the rows in a DataFrame so tabulate can take its headers from the columns
column_labels = ["Dataset", "Method", "Accuracy", "Top 5 Features"]
results_df = pd.DataFrame(table_data, columns=column_labels)

# Render as a box-drawn grid, leaving out the DataFrame index
rendered_table = tabulate(results_df, headers="keys", tablefmt="fancy_grid", showindex=False)
print(rendered_table)


╒═══════════════╤═════════════╤════════════╤══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕
│ Dataset       │ Method      │   Accuracy │ Top 5 Features                                                                                                                                                       │
╞═══════════════╪═════════════╪════════════╪══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ Breast Cancer │ SelectKBest │     0.9714 │ Clump_Thickness, Uniformity_Cell_Size, Uniformity_Cell_Shape, Bare_Nuclei, Bland_Chromatin                                                                           │
├───────────────┼─────────────┼────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────