In [2]:
!pip install tabulate
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import accuracy_score
from tabulate import tabulate
from sklearn.inspection import permutation_importance



In [3]:
# Read the four raw CSV files (paths are relative to the notebook's working
# directory) into one DataFrame each, in the same order as before.
breast_cancer_df, heart_disease_df, iris_df, wine_quality_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'breast_cancer_dataset.csv',
        'heart-dataset.csv',
        'iris-dataset.csv',
        'wine-quality-dataset.csv',
    )
)

In [4]:
# Map each dataset's display name to its loaded DataFrame; the display name is
# the key used by the preprocessing and reporting cells.
datasets = dict(
    zip(
        ["Breast Cancer", "Heart Disease", "Iris", "Wine Quality"],
        [breast_cancer_df, heart_disease_df, iris_df, wine_quality_df],
    )
)

def preprocess_dataset(name, df):
    """Clean one raw dataset and return a standardized train/test split.

    Parameters
    ----------
    name : str
        Dataset key; one of "Breast Cancer", "Heart Disease", "Iris" or
        "Wine Quality". Any other value raises ``KeyError`` from the target
        lookup below.
    df : pandas.DataFrame
        Raw data for that dataset. It is copied first, so the caller's frame
        is never renamed or re-encoded in place.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_names)`` where the two
        feature matrices are the outputs of ``StandardScaler``, the targets
        are the corresponding slices of the target column, and
        ``feature_names`` is the column index of the feature frame.
    """
    # Work on a private copy: the column rename and the target encoding below
    # would otherwise modify the DataFrame stored in the caller's `datasets`.
    df = df.copy()

    # Remove identifier columns that carry no predictive information
    if name == "Iris":
        df = df.drop(columns=["Id"])
    if name == "Breast Cancer":
        # Assigns 11 names positionally, so the raw frame must have exactly
        # 11 columns in this order.
        df.columns = ["ID", "Clump_Thickness", "Uniformity_Cell_Size", "Uniformity_Cell_Shape",
                      "Marginal_Adhesion", "Single_Epithelial_Cell_Size", "Bare_Nuclei",
                      "Bland_Chromatin", "Normal_Nucleoli", "Mitoses", "Class"]
        df = df.drop(columns=["ID"])

    # Name of the target column for each supported dataset
    target = {
        "Breast Cancer": "Class",
        "Heart Disease": "target",
        "Iris": "Species",
        "Wine Quality": "Wine"
    }[name]

    # Encode any non-numeric target (object, string or categorical dtype) as
    # integer labels; numeric targets are left untouched.
    if not pd.api.types.is_numeric_dtype(df[target]):
        df[target] = LabelEncoder().fit_transform(df[target])

    # Split into features and target
    X = df.drop(columns=[target])
    y = df[target]

    # Split BEFORE scaling: the split depends only on the row count and the
    # seed, so the same rows land in each partition as when splitting the
    # scaled matrix.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only and reuse its statistics for the
    # test rows, so no information from the held-out data leaks into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    return X_train, X_test, y_train, y_test, X.columns

In [5]:
# Run the preprocessing step once per dataset, keyed by the dataset's display name
processed_data = dict(
    (dataset_name, preprocess_dataset(dataset_name, raw_frame))
    for dataset_name, raw_frame in datasets.items()
)

def _fit_and_score(X_train, X_test, y_train, y_test):
    """Fit a fresh logistic regression on the given columns and return its test accuracy."""
    classifier = LogisticRegression(max_iter=10000)
    classifier.fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))


def evaluate_feature_selection(name, X_train, X_test, y_train, y_test, feature_names, num_features=5):
    """Compare three feature-selection strategies on one train/test split.

    Each strategy picks a subset of columns, a logistic regression is trained
    on that subset, and its accuracy on the test split is recorded.

    Parameters
    ----------
    name : str
        Dataset name. Not used by the computation; kept so existing callers
        that pass it positionally keep working.
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices. They are indexed as ``X[:, idx]`` in the Lasso
        branch, so they must support NumPy-style column indexing.
    y_train, y_test : array-like
        Class labels for the two splits.
    feature_names : sequence of str
        Column names, one per feature column.
    num_features : int, default 5
        Number of features kept by SelectKBest and RFE (capped at the number
        of available columns). Lasso ignores it and keeps every feature with
        a non-zero coefficient.

    Returns
    -------
    results : dict
        ``{method: (accuracy, [selected feature names])}`` for the methods
        "SelectKBest", "RFE" and "Lasso". Feature names are plain lists for
        every method.
    feature_importances : dict
        ``{method: {feature name: score}}`` restricted to the selected
        features: the ANOVA F-score for SelectKBest, the mean absolute
        logistic-regression coefficient for RFE, and the absolute coefficient
        for Lasso.
    """
    # Accept any sequence of names while keeping boolean/integer indexing available
    feature_names = pd.Index(feature_names)
    n_select = min(num_features, X_train.shape[1])

    results = {}
    feature_importances = {}

    # --- SelectKBest (filter method) ---
    k_best = SelectKBest(score_func=f_classif, k=n_select)
    X_train_kbest = k_best.fit_transform(X_train, y_train)
    X_test_kbest = k_best.transform(X_test)
    kbest_mask = k_best.get_support()
    kbest_features = feature_names[kbest_mask].tolist()
    results["SelectKBest"] = (
        _fit_and_score(X_train_kbest, X_test_kbest, y_train, y_test),
        kbest_features,
    )
    # The support mask lines the scores up with the selected names directly
    feature_importances["SelectKBest"] = dict(zip(kbest_features, k_best.scores_[kbest_mask]))

    # --- RFE (wrapper method) ---
    rfe = RFE(estimator=LogisticRegression(max_iter=10000), n_features_to_select=n_select)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    rfe_features = feature_names[rfe.get_support()].tolist()
    results["RFE"] = (
        _fit_and_score(X_train_rfe, X_test_rfe, y_train, y_test),
        rfe_features,
    )
    # Every selected feature has ranking_ == 1, so 1/rank is the same constant
    # for all of them and says nothing about relative importance. Use the
    # coefficients of the estimator RFE fitted on the surviving columns,
    # averaged over classes, which are in the same order as the selected names.
    rfe_strength = np.abs(np.atleast_2d(rfe.estimator_.coef_)).mean(axis=0)
    feature_importances["RFE"] = dict(zip(rfe_features, rfe_strength))

    # --- Lasso (embedded method) ---
    # NOTE(review): Lasso is a regressor, so the class labels are treated as
    # numeric values here; confirm this is intended for multi-class targets.
    lasso = Lasso(alpha=0.01)
    lasso.fit(X_train, y_train)
    lasso_idx = np.flatnonzero(lasso.coef_)
    if lasso_idx.size > 0:
        lasso_features = feature_names[lasso_idx].tolist()
        acc_lasso = _fit_and_score(
            X_train[:, lasso_idx], X_test[:, lasso_idx], y_train, y_test
        )
    else:
        # Lasso zeroed every coefficient: nothing to train on
        lasso_features = []
        acc_lasso = 0.0
    results["Lasso"] = (acc_lasso, lasso_features)
    feature_importances["Lasso"] = dict(zip(lasso_features, np.abs(lasso.coef_[lasso_idx])))

    return results, feature_importances

# Evaluate every preprocessed dataset, collecting the per-method accuracies and
# the per-method feature importance scores in two dictionaries keyed by dataset name.
evaluation_results = {}
feature_importance_scores = {}
for name, data in processed_data.items():
    method_results, method_importances = evaluate_feature_selection(name, *data)
    evaluation_results[name] = method_results
    feature_importance_scores[name] = method_importances

In [6]:
# One row per (dataset, method) pair: accuracy to four decimals plus the
# comma-separated names of the features that method selected.
table_data = [
    [dataset, method, f"{accuracy:.4f}", ", ".join(features)]
    for dataset, results in evaluation_results.items()
    for method, (accuracy, features) in results.items()
]

# Wrap the rows in a DataFrame so tabulate can take its headers from the column names
results_df = pd.DataFrame(
    table_data,
    columns=["Dataset", "Method", "Accuracy", "Top 5 Features"],
)

# Render the table with box-drawing borders and without the row index
print(tabulate(results_df, headers="keys", tablefmt="fancy_grid", showindex=False))

# Build the per-method table, annotating each selected feature with its share
# of that method's total importance score.
table_data = []
for method in ["SelectKBest", "RFE", "Lasso"]:
    for dataset, results in evaluation_results.items():
        # The Iris dataset is left out of this table
        if dataset.lower() == "iris":
            continue

        accuracy, features = results[method]
        feature_scores = feature_importance_scores[dataset][method]

        # Rescale the raw scores so they add up to 100% for this method/dataset pair
        total_score = sum(feature_scores.values())
        selected_feature_scores = {
            feature: (feature_scores[feature] / total_score) * 100
            for feature in features
        }

        # Render every feature as "name (xx.xx%)"
        feature_summary = ", ".join(
            f"{feature} ({score:.2f}%)"
            for feature, score in selected_feature_scores.items()
        )
        table_data.append([method, dataset, f"{accuracy:.4f}", feature_summary])

# Convert the rows to a DataFrame
results_df = pd.DataFrame(
    table_data,
    columns=["Method", "Dataset", "Accuracy", "Top 5 Features (with Scores)"],
)

# Order rows by the fixed method sequence, then by the formatted accuracy
# (descending); the helper rank column is dropped again after sorting.
method_order = {"SelectKBest": 1, "RFE": 2, "Lasso": 3}
results_df = (
    results_df
    .assign(**{"Method Rank": results_df["Method"].map(method_order)})
    .sort_values(by=["Method Rank", "Accuracy"], ascending=[True, False])
    .drop(columns=["Method Rank"])
)

# Print the table with a clean format
print(tabulate(results_df, headers="keys", tablefmt="fancy_grid", showindex=False))

╒═══════════════╤═════════════╤════════════╤══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕
│ Dataset       │ Method      │   Accuracy │ Top 5 Features                                                                                                                                                       │
╞═══════════════╪═════════════╪════════════╪══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ Breast Cancer │ SelectKBest │     0.9714 │ Clump_Thickness, Uniformity_Cell_Size, Uniformity_Cell_Shape, Bare_Nuclei, Bland_Chromatin                                                                           │
├───────────────┼─────────────┼────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────