# Evaluation of results

This notebook demonstrates usage of the `tabpfn_evaluate` function for evaluating a dataset with TabPFN, outer cross-validation, and SHAP feature importances.

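To use your own data, adjust the settings in the configuration cell below (`dataset_name`, `index_column_name`, `label_column_name`, `experiment_id`, `solution_type`); the feature matrix `X_df` and the labels `y` passed to `tabpfn_evaluate` are built from them.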

In [None]:
from IPython.display import display, Markdown
import os
import pandas as pd
import numpy as np
from plotly import io as pio
from gemss.utils.utils import load_feature_lists_json
from gemss.postprocessing.result_postprocessing import get_unique_features
from gemss.postprocessing.tabpfn_evaluation import tabpfn_evaluate

# Select plotly's "notebook_connected" renderer as the default so that plotly
# figures are shown inline in the notebook.
pio.renderers.default = "notebook_connected"

In [None]:
# --- Source data --------------------------------------------------------
# CSV file holding the original dataset, and the names of its index/label columns
dataset_name = "shelflife_data_all.csv"
index_column_name = "sample ID"
label_column_name = "label"

# --- GEMSS feature selection results (the input of this notebook) -------
experiment_id = 1
mydir = f"./results/experiment_{experiment_id}"
features_filename = f"{mydir}/all_candidate_solutions.json"

# --- TabPFN evaluation outputs ------------------------------------------
evaluation_output_filename = f"{mydir}/tabpfn_evaluation_average_scores.csv"
importances_output_filename = f"{mydir}/tabpfn_feature_importances.csv"
# start from a clean slate: remove outputs left over from a previous run
for stale_output in (evaluation_output_filename, importances_output_filename):
    if os.path.exists(stale_output):
        os.remove(stale_output)

# --- Feature types -------------------------------------------------------
# The GEMSS selector works only with numeric types, whereas TabPFN can handle
# any type of data; set to False to keep the non-numeric columns as well.
only_numeric_features = True

# --- Which kind of candidate solutions to evaluate ----------------------
# alternatives: "Top features", "Full features"
solution_type = "Outlier features (STD_3.0)"

# --- Evaluation settings -------------------------------------------------
# * The union of all features found by all candidate solutions of the chosen
#   type is always evaluated (good for a quick POC).
# * Optionally, each component (= candidate solution) is evaluated separately.
# * Optionally, a random baseline is evaluated: a model trained on randomly
#   chosen features, as many as there are unique features in the selected
#   solution.
evaluate_each_component = False
compute_random_baseline = True

# SHAP explanations can be costly; if enabled, n_folds must be at least 2
compute_shapley_explanations = False
# maximum number of samples for which SHAP values are computed
# (used only if compute_shapley_explanations=True)
shap_sample_size = 50

# number of outer CV folds (tests on unseen data)
n_folds = 2

In [None]:
# Read the features file using the utility loader.
# The first element of the loader's result is used as a nested mapping:
#   {solution type -> {component name -> list of features}}
all_features_lists = load_feature_lists_json(features_filename)[0]
feature_dict_titles = list(all_features_lists.keys())

display(
    Markdown(
        f"**Feature lists loaded from** `{features_filename}`: {len(feature_dict_titles)} types of solutions available"
    )
)

# Overview table: one row per component, one column per solution type, each
# cell holding the number of features in that candidate solution.
# The rows follow the components of the first solution type. A component that
# is absent from another solution type is counted as 0 features instead of
# raising a TypeError, and an empty features file yields an empty table
# instead of an IndexError.
component_names = (
    list(all_features_lists[feature_dict_titles[0]].keys())
    if feature_dict_titles
    else []
)
df_overview = pd.DataFrame(
    {
        title: [len(feature_dict.get(cname, ())) for cname in component_names]
        for title, feature_dict in all_features_lists.items()
    },
    index=component_names,
    columns=feature_dict_titles,
)

display(Markdown("**Numbers of features** in candidate solutions"))
display(df_overview)

In [None]:
def _padded_column(features, length):
    """Return the features as strings in a Series of the given length, padded with None at the end."""
    column = pd.Series([None] * length)
    names = pd.Series(features).apply(str)
    column.iloc[: len(names)] = names.values
    return column


# candidate solutions of the configured type, keyed by component name
solutions = all_features_lists[solution_type]

# number of components (= candidate solutions) available
n_components = len(solutions)

# One column per component; shorter feature lists are padded so that every
# column has the length of the longest list.
max_length = max(len(clist) for clist in solutions.values())
solutions_df = pd.DataFrame(
    {cname: _padded_column(clist, max_length) for cname, clist in solutions.items()}
)

display(
    Markdown(f"### Experiment #{experiment_id}: {solution_type} candidate solutions"),
    solutions_df,
)

In [None]:
# read the dataset
df = pd.read_csv(f"../data/{dataset_name}", index_col=index_column_name)
# features whose share of missing values reaches this percentage (0 - 100) are dropped
allowed_missing_percentage = 80

# drop rows with missing labels
df = df.dropna(subset=[label_column_name])

# Separate the response BEFORE filtering by dtype: a non-numeric label column
# would otherwise be removed by the numeric filter below and the lookup of the
# label would fail with a KeyError.
y = df.pop(label_column_name)

# consider only numeric data
if only_numeric_features:
    df = df.select_dtypes(["number"]).astype(float)
    # numeric labels are still cast to float, exactly as when the label was
    # converted together with the features
    if pd.api.types.is_numeric_dtype(y) and not pd.api.types.is_bool_dtype(y):
        y = y.astype(float)

# basic per-feature statistics about missing values
missing_count = df.isnull().sum()
features_description = pd.DataFrame(
    {
        "Valid Count": df.count().astype(int),
        "Missing Count": missing_count,
        "Missing Percentage": ((missing_count / len(df)) * 100).astype(int),
    }
)

# exclude features with too many missing values
# (a threshold above 100 keeps every feature)
nan_features = features_description[
    features_description["Missing Percentage"] >= allowed_missing_percentage
].index
df = df.drop(nan_features, axis=1)

display(Markdown("**Dataset loaded:**"))
display(df.describe())

In [None]:
# Restrict the dataset to the unique features appearing in the candidate solutions
all_selected_features = get_unique_features(solutions)
X_df = df[all_selected_features]

# summary statistics of the selected features
display(Markdown("**Selected features:**"), X_df.describe())

# Run TabPFN Evaluation with optional SHAP

- Outer cross-validation
- Feature scaling
- SHAP explanations
- Prints metrics for each fold

> For large X, SHAP explanations may take time. For a quick demo, use a small subset or reduce folds.

In [None]:
# Evaluation results, keyed by the name of the evaluated feature set
results = {}

# Settings shared by every TabPFN evaluation run in this cell
evaluation_kwargs = dict(
    apply_scaling=None,
    outer_cv_folds=n_folds,
    tabpfn_kwargs=None,
    random_state=42,
    verbose=True,
    explain=compute_shapley_explanations,
    shap_sample_size=shap_sample_size,
)

if evaluate_each_component:
    # Evaluate every component separately, followed by the union of all
    # selected features. A new mapping is built instead of inserting the union
    # into `solutions`: that dict is the same object as
    # `all_features_lists[solution_type]`, so mutating it would add a phantom
    # component to the tables of the earlier cells when they are re-run.
    feature_sets = {**solutions, "all_selected_features": all_selected_features}
    for component, feature_list in feature_sets.items():
        display(Markdown(f"## {component.upper()}: {len(feature_list)} features"))
        results[component] = tabpfn_evaluate(
            X_df[feature_list],
            y,
            **evaluation_kwargs,
        )
        display(
            Markdown(
                "----------------------------------------------------------------\n"
            )
        )
else:
    # compute using all discovered features
    # quick validation to verify the information is contained in them
    results["all_selected_features"] = tabpfn_evaluate(
        X_df[all_selected_features],
        y,
        **evaluation_kwargs,
    )

In [None]:
if compute_random_baseline:
    # Baseline: evaluate a model on randomly drawn features, as many as were
    # selected, taken only from the features that were NOT selected.
    baseline_pool = df.columns.difference(all_selected_features)
    # sampling without replacement cannot draw more features than the pool holds
    n_random_features = min(len(all_selected_features), len(baseline_pool))

    if n_random_features == 0:
        display(
            Markdown(
                "**Random baseline skipped:** there are no unselected features to draw from."
            )
        )
    else:
        if n_random_features < len(all_selected_features):
            display(
                Markdown(
                    f"**Note:** only {n_random_features} unselected features are available; "
                    f"the random baseline uses fewer features than the {len(all_selected_features)} selected ones."
                )
            )

        # seeded generator (same seed as the random_state of the evaluation)
        # so that the baseline is reproducible across runs
        rng = np.random.default_rng(42)
        random_features = rng.choice(
            baseline_pool.to_numpy(),
            size=n_random_features,
            replace=False,
        )
        X_random = df[random_features]

        results[f"random_features_{len(random_features)}"] = tabpfn_evaluate(
            X_random,
            y,
            apply_scaling=None,
            outer_cv_folds=n_folds,
            tabpfn_kwargs=None,
            random_state=42,
            verbose=True,
            explain=compute_shapley_explanations,
            shap_sample_size=shap_sample_size,
        )

## CV Results: Average Metrics

In [None]:
# average CV scores of every evaluated feature set, one Series per run
average_scores_by_run = {
    run_name: pd.Series(run_output["average_scores"])
    for run_name, run_output in results.items()
}

# one column per run; columns are added one by one so that every run is
# aligned to the metrics of the first one
df_average_scores = pd.DataFrame()
for run_name, run_scores in average_scores_by_run.items():
    df_average_scores[run_name] = run_scores

# persist the table and show it
df_average_scores.to_csv(evaluation_output_filename)

display(df_average_scores)

## Feature Importances (SHAP, mean per fold)
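Each table below lists the mean absolute SHAP values of the features in one CV fold, sorted in descending order. The tables appear only for runs whose results contain SHAP explanations (enable them with `compute_shapley_explanations = True`).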

In [None]:
# Collect the runs that carry SHAP explanations; runs where the entry is
# missing, None or empty are skipped.
explained_runs = {}
for cname, cresults in results.items():
    shap_per_fold = cresults.get("shap_explanations_per_fold")
    if shap_per_fold is not None and len(shap_per_fold) > 0:
        explained_runs[cname] = shap_per_fold

# The report is plain text (despite the .csv extension of the output file).
# It is written in one go with mode "w", so re-running this cell does not
# append the same content a second time; no file is created when there is
# nothing to report.
if explained_runs:
    with open(importances_output_filename, "w", encoding="utf-8") as f:
        for cname, shap_per_fold in explained_runs.items():
            component_title = f"{cname.upper()}: feature importances"
            display(Markdown(f"## {component_title}"))
            print(component_title, file=f)
            for fold, shap_imp in enumerate(shap_per_fold, start=1):
                fold_importances = pd.Series(shap_imp).sort_values(ascending=False)
                fold_title = f"Fold {fold} SHAP Feature Importances:"
                # write the fold header too, otherwise the folds cannot be
                # told apart in the file
                print(fold_title, file=f)
                # to_string() writes every row; printing the Series directly
                # would abbreviate long series with "..."
                print(fold_importances.to_string(), file=f)
                display(Markdown(f"### {fold_title}"))
                display(fold_importances)
                display(
                    Markdown(
                        "----------------------------------------------------------------\n"
                    )
                )