# Bayesian Sparse Feature Selection Demo

This notebook demonstrates the Bayesian Gaussian Mixture Feature Selection algorithm on artificial data.

**Features:**
- Modular prior (spike-and-slab, easily replaceable)
- Flexible mixture model
- Variational inference with PyTorch
- Interactive diagnostics with Plotly/Seaborn

> **Note:** Ensure that the `gemss` package and its dependencies are installed.

In [None]:
# %pip install -q -e ..

In [None]:
# Add parent directory to Python path to find our package
import sys
import os
from pathlib import Path

# Directory one level above the current working directory.
parent_dir = Path.cwd().parent
# sys.path holds plain strings, so the membership test must use the string
# form; comparing the Path object itself never matches and would prepend a
# duplicate entry every time this cell is re-run.
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

In [None]:
import numpy as np
from IPython.display import display, Markdown
import pandas as pd
import plotly.io as pio

import gemss.config as C
from gemss.data_handling.generate_artificial_dataset import (
    generate_artificial_dataset,
)
from gemss.feature_selection.inference import BayesianFeatureSelector
from gemss.diagnostics.visualizations import (
    show_correlation_matrix,
    show_features_in_components,
)
from gemss.diagnostics.performance_tests import run_performance_diagnostics
from gemss.diagnostics.recommendations import display_recommendations
from gemss.diagnostics.result_postprocessing import (
    display_features_overview,
    get_long_solutions_df,
    get_unique_features,
    recover_solutions,
    show_final_parameter_comparison,
    show_algorithm_progress,
    show_regression_results_for_solutions,
    show_solutions_details,
    show_long_solutions,
)
from gemss.diagnostics.simple_regressions import (
    solve_with_logistic_regression,
    solve_with_linear_regression,
)
from gemss.diagnostics.visualizations import show_final_alphas

# Use the "notebook_connected" renderer by default so that Plotly figures
# are shown in the notebook.
pio.renderers.default = "notebook_connected"

# Generate Artificial Dataset

In [None]:
# Generate the artificial dataset; all generation settings come from gemss.config.
dataset_options = dict(
    n_samples=C.N_SAMPLES,
    n_features=C.N_FEATURES,
    n_solutions=C.N_GENERATING_SOLUTIONS,
    sparsity=C.SPARSITY,
    noise_data_std=C.NOISE_STD,
    binarize=C.BINARIZE,
    binary_response_ratio=C.BINARY_RESPONSE_RATIO,
    random_seed=C.DATASET_SEED,
    save_to_csv=False,
    print_data_overview=True,
    show_feature_correlations=False,
)
df, y, generating_solutions, parameters = generate_artificial_dataset(**dataset_options)

# Feature names built from the distinct support indices.
# NOTE(review): presumably parameters["support_indices"] holds one index list
# per generating solution, so .sum() concatenates them -- confirm.
support_indices = parameters["support_indices"].sum()
true_support_features = [f"feature_{idx}" for idx in set(support_indices)]

# Classical approach
Solve the problem using regularized regression: logistic regression when `C.BINARIZE` is set, linear regression otherwise.

In [None]:
# Baseline on the full problem (all features) with an l1 penalty.
# Logistic regression is used when C.BINARIZE is set, linear regression otherwise.
# Add "l2" to the tuple below to run that penalty as well.
baseline_solver = (
    solve_with_logistic_regression if C.BINARIZE else solve_with_linear_regression
)
for penalty in ("l1",):
    baseline_solver(X=df, y=y, penalty=penalty)

In [None]:
# Regression results restricted to the generating solutions, once per penalty.
# Pass verbose=True to see detailed results for each solution.
for penalty in ("l1", "l2"):
    show_regression_results_for_solutions(
        solutions=generating_solutions, df=df, y=y, penalty=penalty, verbose=False
    )

# Bayesian Feature Selector

In [None]:
# Build the feature selector from the dataset and the settings in gemss.config.
selector_options = dict(
    n_features=C.N_FEATURES,
    n_components=C.N_CANDIDATE_SOLUTIONS,
    X=df.values,
    y=y,
    prior=C.PRIOR_TYPE,
    sss_sparsity=C.PRIOR_SPARSITY,
    sample_more_priors_coeff=C.SAMPLE_MORE_PRIORS_COEFF,
    var_slab=C.VAR_SLAB,
    var_spike=C.VAR_SPIKE,
    weight_slab=C.WEIGHT_SLAB,
    weight_spike=C.WEIGHT_SPIKE,
    student_df=C.STUDENT_DF,
    student_scale=C.STUDENT_SCALE,
    lr=C.LEARNING_RATE,
    batch_size=C.BATCH_SIZE,
    n_iter=C.N_ITER,
)
selector = BayesianFeatureSelector(**selector_options)

# Run the optimization; the returned history feeds the diagnostics cells below.
optimize_options = dict(
    regularize=C.IS_REGULARIZED,
    lambda_jaccard=C.LAMBDA_JACCARD,
    verbose=True,
)
history = selector.optimize(**optimize_options)

In [None]:
# Show the optimization history returned by selector.optimize().
show_algorithm_progress(history)

# Show the final alphas as a pie chart only; the bar plot is switched off.
alpha_plot_options = {"show_bar_plot": False, "show_pie_chart": True}
show_final_alphas(history, **alpha_plot_options)

In [None]:
# Recover solutions from the search history using the sparsity and threshold
# settings in gemss.config.
recovered = recover_solutions(
    search_history=history,
    desired_sparsity=C.DESIRED_SPARSITY,
    min_mu_threshold=C.MIN_MU_THRESHOLD,
    verbose=False,
)
solutions, final_parameters, full_nonzero_solutions = recovered

show_long_solutions(full_nonzero_solutions, title="Full long solutions")

## Solution quality assessment

In [None]:
# Show details for the recovered solutions, passing the configuration
# constants along as a dictionary.
config_constants = C.as_dict()
show_solutions_details(
    solutions=solutions, history=history, constants=config_constants, use_markdown=True
)

## Overview of discovered features

In [None]:
# Collect the unique features across the recovered solutions and show them
# alongside the true support features.
features_found = get_unique_features(solutions)
n_total_features = len(df.columns)

display_features_overview(
    features_found=features_found,
    true_support_features=true_support_features,
    n_total_features=n_total_features,
)

## Comparison of the solutions

In [None]:
# Features to display: the true support features together with those found.
features_to_show = list(set(true_support_features) | set(features_found))
show_features_in_components(solutions, features_to_show=features_to_show)

# Correlation matrix over the same features, with columns in sorted order.
correlation_columns = sorted(features_to_show)
show_correlation_matrix(df[correlation_columns], width=600, height=600)

In [None]:
# show_final_parameter_comparison(
#     true_parameters=parameters,
#     final_parameters=final_parameters,
# )

# Compute regression using features in solutions

In [None]:
# Regression results for the recovered solutions, once per penalty.
# Pass verbose=True to see detailed results for each solution.
for penalty in ("l1", "l2"):
    show_regression_results_for_solutions(
        solutions=solutions, df=df, y=y, penalty=penalty, verbose=False
    )