# Bayesian Sparse Feature Selection on an Unknown Dataset

This notebook demonstrates how to apply the Bayesian Sparse Feature Selector to an arbitrary dataset with unknown ground truth. It mirrors the workflow of demo.ipynb, but assumes no knowledge of the true supports. Feature names are anonymized during analysis and mapped back for interpretability.


In [None]:
# Uncomment to install the package from the parent directory in editable mode:
# %pip install -e ..

In [None]:
from IPython.display import display, Markdown
import pandas as pd
import numpy as np
from plotly import io as pio
from sklearn.preprocessing import StandardScaler

import gemss.config as C
from gemss.feature_selection.inference import BayesianFeatureSelector
from gemss.diagnostics.visualizations import show_label_histogram, show_final_alphas
from gemss.diagnostics.result_postprocessing import (
    recover_solutions,
    show_algorithm_progress,
    get_long_solutions_df,
    show_regression_results_for_solutions,
)
from gemss.diagnostics.performance_tests import run_performance_diagnostics
from gemss.diagnostics.recommendations import display_recommendations

# Select the "notebook_connected" Plotly renderer so that figures show up inside the notebook.
pio.renderers.default = "notebook_connected"

# Set parameters

- With the default 'sss' prior, training typically takes a minute or more per 1,000 iterations on a CPU. The 'student' prior is faster.

In [None]:
# Dataset parameters
# The CSV file is read from the ../data/ directory.
# Both the index column and the label column named below must exist in the CSV:
# the former becomes the DataFrame index, the latter is split off as the target.
csv_dataset_name = "shelflife_data_all_preprocessed.csv"
index_column_name = "sample ID"
label_column_name = "label"

# Set to True to standardize the features with StandardScaler before running the selector
apply_scaling = False

# Set to True to plot the algorithm's progress over the training iterations
show_search_history = True

In [None]:
# Placeholder for the run configuration; it is replaced by the
# gemss.config defaults (plus overrides) once the dataset is loaded.
constants = dict()

# Load and preprocess the dataset

In [None]:
# Read the dataset from the data directory, indexed by the sample identifier.
dataset_path = f"../data/{csv_dataset_name}"
df = pd.read_csv(dataset_path, index_col=index_column_name)

# Split the target off; pop() removes the label column from df so that
# only feature columns remain.
y = df.pop(label_column_name).values

In [None]:
# Start from the package-wide defaults and override what is specific to this run.
constants = C.as_dict()

# Dataset dimensions (df holds feature columns only; the label was popped earlier)
constants["N_SAMPLES"] = df.shape[0]
constants["N_FEATURES"] = df.shape[1]
# presumably np.inf marks the number of generating solutions as unknown — TODO confirm
constants["N_GENERATING_SOLUTIONS"] = np.inf

# Solution settings
# These are set BEFORE the algorithm settings because PRIOR_SPARSITY below is
# derived from DESIRED_SPARSITY. Setting them afterwards would leave the prior
# using the package default instead of the value chosen here.
constants["N_CANDIDATE_SOLUTIONS"] = 8  # Number of mixture components (candidate solutions)
constants["DESIRED_SPARSITY"] = 6  # Expected # of features per solution
constants["MIN_MU_THRESHOLD"] = 0.25  # minimum |μ| to consider a feature nonzero

# Algorithm settings
constants["PRIOR_TYPE"] = "sss"  # 'sss', 'spike-and-slab', or 'student'
constants["PRIOR_SPARSITY"] = constants["DESIRED_SPARSITY"]  # keep the prior in sync with the target sparsity
constants["VAR_SLAB"] = 100.0
constants["VAR_SPIKE"] = 0.001
constants["WEIGHT_SLAB"] = 0.9  # not used with 'sss' prior
constants["WEIGHT_SPIKE"] = 0.1  # not used with 'sss' prior
constants["STUDENT_DF"] = 1  # not used with 'sss' prior
constants["STUDENT_SCALE"] = 1.0  # not used with 'sss' prior
constants["LEARNING_RATE"] = 0.002
constants["BATCH_SIZE"] = 16
constants["N_ITER"] = 3000  # number of training iterations.
constants["IS_REGULARIZED"] = True
constants["LAMBDA_JACCARD"] = 500.0

### Rename features

In [None]:
# Define feature renaming dictionaries
# Anonymize the feature columns as feature_0, feature_1, ... and keep
# lookup tables in both directions so results can be mapped back later.
original_feature_names = list(df.columns)
anonymized_names = [f"feature_{i}" for i in range(len(original_feature_names))]

name_to_feature = dict(zip(original_feature_names, anonymized_names))
feature_to_name = dict(zip(name_to_feature.values(), name_to_feature.keys()))

df = df.rename(columns=name_to_feature)

# Uncomment to inspect the mapping:
# display(Markdown("**Feature renaming dictionary:**"))
# display(Markdown(f"```{name_to_feature}```"))

### Optional: apply standard scaling

In [None]:
# Build the design matrix X, standardizing it only when requested.
if not apply_scaling:
    X = df.values
    scaling_message = "No scaling applied to features."
else:
    scaler = StandardScaler()
    X = scaler.fit_transform(df.values)
    scaling_message = "Applied standard scaling to features."

display(Markdown(scaling_message))

# Run the feature selector

In [None]:
# Map each selector keyword argument to the configuration entry that supplies it.
selector_kwargs = {
    kwarg: constants[key]
    for kwarg, key in {
        "n_features": "N_FEATURES",
        "n_components": "N_CANDIDATE_SOLUTIONS",
        "prior": "PRIOR_TYPE",
        "sss_sparsity": "PRIOR_SPARSITY",
        "var_slab": "VAR_SLAB",
        "var_spike": "VAR_SPIKE",
        "weight_slab": "WEIGHT_SLAB",
        "weight_spike": "WEIGHT_SPIKE",
        "student_df": "STUDENT_DF",
        "student_scale": "STUDENT_SCALE",
        "lr": "LEARNING_RATE",
        "batch_size": "BATCH_SIZE",
        "n_iter": "N_ITER",
    }.items()
}
selector = BayesianFeatureSelector(X=X, y=y, **selector_kwargs)

# Run the optimization; the returned history is consumed by the result cells below.
history = selector.optimize(
    regularize=constants["IS_REGULARIZED"],
    lambda_jaccard=constants["LAMBDA_JACCARD"],
    verbose=True,
)

# Optionally plot the search progress, labelled with the original feature names.
if show_search_history:
    show_algorithm_progress(history, original_feature_names_mapping=feature_to_name)

# Show the results

In [None]:
# Extract the candidate solutions from the search history, reported under
# the original (de-anonymized) feature names.
recovered = recover_solutions(
    search_history=history,
    desired_sparsity=constants["DESIRED_SPARSITY"],
    min_mu_threshold=constants["MIN_MU_THRESHOLD"],
    verbose=True,
    original_feature_names_mapping=feature_to_name,
)
solutions, final_parameters, full_nonzero_solutions = recovered

### Overview of full solutions

In [None]:
# Show the final alphas as a pie chart only (bar plot disabled).
show_final_alphas(history, show_bar_plot=False, show_pie_chart=True)

In [None]:
# Convert the full nonzero solutions returned by recover_solutions into a
# single table and render it.
df_full_solutions = get_long_solutions_df(full_nonzero_solutions)
display(df_full_solutions)

# Final Selected Solutions

In [None]:
# Report each candidate solution: its index, final component weight and features.
display(Markdown(f"**Required sparsity** = {constants['DESIRED_SPARSITY']}"))
for component_name, selected_features in solutions.items():
    # The component index is the part of the key after the last underscore.
    component_index = component_name.rsplit("_", 1)[-1]
    component_weight = history["alpha"][-1][int(component_index)]
    display(Markdown(f"## Candidate solution no. {component_index}:"))
    display(Markdown(f"**Component weight** = {component_weight:.3f}"))
    for feature_name in selected_features:
        display(Markdown(f"- {feature_name}"))


# Union of the features selected by any candidate solution
unique_features = set().union(*solutions.values())

display(
    Markdown(
        f"## Unique features across all {len(solutions)} solutions: {len(unique_features)} total"
    )
)
display(Markdown(f"```{sorted(unique_features)}```"))

In [None]:
# Restore the original column names so they match the feature names stored
# in the recovered solutions.
df = df.rename(columns=feature_to_name)

# Show regression results (L1 penalty) for each candidate solution.
show_regression_results_for_solutions(solutions, df=df, y=y, penalty="l1", verbose=True)

# Feature selector's performance tests

In [None]:
# Run the performance diagnostics on the optimization history.
target_sparsity = constants["DESIRED_SPARSITY"]
diagnostics = run_performance_diagnostics(history, desired_sparsity=target_sparsity, verbose=True)

In [None]:
# Display recommendations derived from the diagnostics and the current settings.
display_recommendations(diagnostics=diagnostics, constants=constants)