# Bayesian Sparse Feature Selection on an Unknown Dataset

This notebook demonstrates how to apply the Bayesian Sparse Feature Selector to an arbitrary dataset with unknown ground truth. It mirrors the workflow of demo.ipynb, but assumes no knowledge of the true supports. Feature names are anonymized during analysis and mapped back for interpretability.


In [None]:
# Optional one-time setup: uncomment the line below to install the package located
# in the parent directory in editable mode (%pip targets the kernel's environment).
# %pip install -e ..

In [None]:
from IPython.display import display, Markdown
from typing import List, Tuple, Dict, Any
import pandas as pd
import numpy as np
import os
from pathlib import Path
import pprint

from sklearn.preprocessing import StandardScaler

from gemss.visualizations import show_label_histogram, show_final_alphas
from gemss.inference import BayesianFeatureSelector
from gemss.result_postprocessing import (
    recover_solutions,
    show_algorithm_progress,
    get_long_solutions_df,
)

# Set parameters

- On CPU, the algorithm typically needs at least about one minute per 1,000 training iterations with the default 'sss' prior. The 'student' prior is faster.

In [None]:
# --- Notebook switches ---

# Plot how the algorithm progresses over the training iterations.
show_search_history = True

# Preprocessing: apply standard scaling to the features (set to False to skip it).
apply_scaling = True

In [None]:
# Hyperparameters, grouped by the stage that consumes them.

# --- Shape of the solutions ---
DESIRED_SPARSITY = 4  # expected number of features per solution
N_COMPONENTS = 6  # number of mixture components (one candidate solution each)

# --- Prior ---
PRIOR_TYPE = "sss"  # one of: 'sss', 'spike-and-slab', 'student'
PRIOR_SPARSITY = DESIRED_SPARSITY  # prior sparsity is tied to the desired sparsity
VAR_SLAB = 100.0
VAR_SPIKE = 1e-4
# The next four settings are not used with the 'sss' prior.
WEIGHT_SLAB = 0.9
WEIGHT_SPIKE = 0.1
STUDENT_DF = 1
STUDENT_SCALE = 1.0

# --- Optimization ---
N_ITER = 2000  # number of training iterations
LEARNING_RATE = 2e-3
BATCH_SIZE = 16
IS_REGULARIZED = True  # passed to optimize() as `regularize`
LAMBDA_JACCARD = 500.0  # passed to optimize() as `lambda_jaccard`

# --- Solution recovery ---
MIN_MU_THRESHOLD = 0.4  # passed to recover_solutions() as `min_mu_threshold`

# Load and preprocess the dataset

In [None]:
# Read the preprocessed table, using the "sample ID" column as the row index.
df = pd.read_csv(
    "../data/shelflife_data_all_preprocessed.csv",
    index_col="sample ID",
)

# pop() returns the label column and removes it from df, leaving only features.
y = df.pop("label").values

# Rows are samples and the remaining columns are features.
N_SAMPLES, N_FEATURES = df.shape

In [None]:
# Print data overview: dataset size, label distribution, and feature summary.
display(Markdown("## Dataset Overview"))
display(Markdown(f"**Number of features:** {N_FEATURES}"))
display(Markdown(f"**Number of samples:** {N_SAMPLES}"))

display(Markdown("### Label distribution"))
if len(np.unique(y)) <= 4:
    # Few distinct label values: show the share of each value as a percentage.
    display(
        pd.Series(y, name="label values")
        .value_counts(normalize=True)
        .map("{:.2%}".format)
    )
else:
    # More than four distinct label values: show a histogram instead.
    show_label_histogram(y)

display(Markdown("### Features"))
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after the summary.
df.info()

### Rename features

In [None]:
# Anonymize the feature columns: every original column name gets a positional
# alias "feature_<i>", and the reverse lookup is kept so results can be mapped back.
original_feature_names = list(df.columns)

# original name -> anonymized alias
name_to_feature = dict(
    zip(
        original_feature_names,
        (f"feature_{i}" for i in range(len(original_feature_names))),
    )
)
# anonymized alias -> original name (inverse of the mapping above)
feature_to_name = dict(zip(name_to_feature.values(), name_to_feature.keys()))

df = df.rename(columns=name_to_feature)
# Uncomment to inspect the renaming dictionary:
# display(Markdown("**Feature renaming dictionary:**"))
# display(Markdown(f"```{name_to_feature}```"))

### Optional: apply standard scaling

In [None]:
# Build the feature matrix X, standardized only when apply_scaling is set.
if not apply_scaling:
    # Use the raw feature values unchanged.
    X = df.values
    display(Markdown("No scaling applied to features."))
else:
    # Fit the scaler on the full feature table and transform it in one step.
    scaler = StandardScaler()
    X = scaler.fit_transform(df.values)
    display(Markdown("Applied standard scaling to features."))

# Run the feature selector

In [None]:
# Configure the selector with the hyperparameters defined in the parameter cell.
selector = BayesianFeatureSelector(
    n_features=N_FEATURES,
    n_components=N_COMPONENTS,
    # Use the matrix prepared by the preprocessing step. Passing df.values here
    # would silently ignore the scaling requested through `apply_scaling`.
    # When scaling is disabled, X is df.values, so that case is unchanged.
    X=X,
    y=y,
    prior=PRIOR_TYPE,
    sss_sparsity=PRIOR_SPARSITY,
    var_slab=VAR_SLAB,
    var_spike=VAR_SPIKE,
    weight_slab=WEIGHT_SLAB,
    weight_spike=WEIGHT_SPIKE,
    student_df=STUDENT_DF,
    student_scale=STUDENT_SCALE,
    lr=LEARNING_RATE,
    batch_size=BATCH_SIZE,
    n_iter=N_ITER,
)

# Run the optimization; the returned history is reused by the result cells below.
history = selector.optimize(
    regularize=IS_REGULARIZED,
    lambda_jaccard=LAMBDA_JACCARD,
    verbose=True,
)

# Optionally plot the search progress, labelled with the original feature names.
if show_search_history:
    show_algorithm_progress(
        history,
        original_feature_names_mapping=feature_to_name,
    )

# Show the results

In [None]:
# Extract the candidate solutions from the optimization history.
# NOTE(review): the exact structure of the three returned objects is defined by
# gemss.result_postprocessing.recover_solutions. In this notebook, `solutions` is
# iterated as a mapping (component name -> features) and `full_nonzero_solutions`
# is passed to get_long_solutions_df; `final_parameters` is not used in the cells shown.
solutions, final_parameters, full_nonzero_solutions = recover_solutions(
    search_history=history,
    desired_sparsity=DESIRED_SPARSITY,
    min_mu_threshold=MIN_MU_THRESHOLD,
    verbose=True,
    # Maps the anonymized "feature_<i>" aliases back to the original column names.
    original_feature_names_mapping=feature_to_name,
)

### Overview of full solutions

In [None]:
# Visualize the final alphas from the optimization history as a pie chart only
# (the bar plot is switched off).
# NOTE(review): the alphas are reported as component weights in the last cell of this
# notebook — confirm against gemss.visualizations.show_final_alphas.
show_final_alphas(
    history,
    show_bar_plot=False,
    show_pie_chart=True,
)

In [None]:
# Turn the full nonzero solutions returned by recover_solutions into one table and show it.
# NOTE(review): the "long" layout is inferred from the helper's name — confirm its columns.
df_full_solutions = get_long_solutions_df(full_nonzero_solutions)
display(df_full_solutions)

# Final Selected Solutions

In [None]:
# Report each candidate solution: its index, its final weight, and its features.
display(Markdown(f"**Required sparsity** = {DESIRED_SPARSITY}"))
for component, features in solutions.items():
    # The component index is the token after the last underscore in the key.
    component_id = component.rsplit("_", 1)[-1]
    # The last entry of the alpha history holds the final per-component values.
    final_alphas = history["alpha"][-1]
    component_weight = final_alphas[int(component_id)]
    display(Markdown(f"## Candidate solution no. {component_id}:"))
    display(Markdown(f"**Component weight** = {component_weight:.3f}"))
    for feature in features:
        display(Markdown(f"- {feature}"))