fANOVA on Average Prediction Set Size

In [21]:
from __future__ import annotations

import ast
import os
from pathlib import Path
from typing import Dict, List

import numpy as np

# Compatibility shim: recent NumPy releases no longer provide the `np.float`
# alias. Restore it before the remaining imports run, in case a dependency
# still references it (presumably fanova — TODO confirm which package needs it).
if not hasattr(np, "float"):
    np.float = np.float64  # type: ignore[attr-defined]
    
import pandas as pd
from ConfigSpace import ConfigurationSpace, CategoricalHyperparameter
from fanova import fANOVA
from fanova.visualizer import Visualizer

In [2]:
# CSV that the notebook loads with pd.read_csv. The path is relative, so it is
# resolved against the kernel's current working directory.
DATA_PATH: Path | str = "./data/all_universe_predictions.csv"

In [3]:
# Columns whose value combinations define one "universe": the data is grouped
# by these columns, and each one becomes a categorical hyperparameter in the
# ConfigurationSpace handed to fANOVA.
DECISION_COLUMNS: List[str] = [
    "feature_set",
    "model",
    # "threshold_policy",  # uncomment if this column exists
]
# Directory passed to the fANOVA Visualizer; the plotting cell creates it if needed.
OUTPUT_DIR: Path | str | None = "fanova_avg_set_size_out"  # set to None to skip plots
# Seed given to ConfigurationSpace(seed=...).
SEED: int | None = 42  # for ConfigSpace reproducibility

In [4]:
# Load the raw per-prediction records.
df = pd.read_csv(DATA_PATH)

# Fail early, with a readable message, if the file lacks a column that later
# cells rely on (the decision columns and the serialized prediction sets).
_missing_columns = [
    column for column in [*DECISION_COLUMNS, "pred_set"] if column not in df.columns
]
if _missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {_missing_columns}")

In [5]:
def _to_set(obj):
    if isinstance(obj, str):
        obj = ast.literal_eval(obj)
    return set(obj)

# Turn every serialized prediction set into a real `set`, then store its
# cardinality next to it.
df["pred_set"] = df["pred_set"].map(_to_set)
df["set_size"] = df["pred_set"].map(len)

In [6]:
# Collapse the per-prediction rows to one row per universe (one unique
# combination of the decision columns), with the mean set size as the target.
universe_df = df.groupby(DECISION_COLUMNS, as_index=False).agg(
    avg_set_size=("set_size", "mean")
)

In [7]:
# Describe each decision column as a categorical hyperparameter whose choices
# are the sorted distinct values observed in the aggregated data.
cs = ConfigurationSpace(seed=SEED)
for decision_col in DECISION_COLUMNS:
    observed_levels = sorted(universe_df[decision_col].unique())
    cs.add_hyperparameter(CategoricalHyperparameter(decision_col, observed_levels))

# Column order is taken from the configuration space itself so the feature
# matrix built later lines up with what `cs` reports.
hp_order = [param.name for param in cs.get_hyperparameters()]


  cs.add_hyperparameter(CategoricalHyperparameter(col, sorted(universe_df[col].unique())))
  hp_order = [hp.name for hp in cs.get_hyperparameters()]


In [10]:
# Per-column lookup tables: each distinct level gets its position in the
# sorted list of levels as an integer code.
_encoders: Dict[str, Dict[str, int]] = {
    hp_name: {
        level: code
        for code, level in enumerate(sorted(universe_df[hp_name].unique()))
    }
    for hp_name in hp_order
}

# Apply the codes to a copy of the decision columns (ordered as in hp_order),
# storing them as floats.
X_df = universe_df[hp_order].copy()
for hp_name, level_codes in _encoders.items():
    X_df[hp_name] = X_df[hp_name].map(level_codes).astype(float)

In [11]:
# Float feature matrix for fANOVA: one row per universe and one column per
# hyperparameter in hp_order, holding the category codes as floats.
X: np.ndarray = X_df.to_numpy(dtype=float)

In [15]:
# Target vector: the average prediction-set size of each universe, as float64.
Y: np.ndarray = universe_df["avg_set_size"].to_numpy(dtype=np.float64)
# Guard against a silent non-float dtype before handing Y to fANOVA.
assert np.issubdtype(Y.dtype, np.floating), "Y is not float – check dtype conversion"


In [None]:
#Y: np.ndarray = (
#    pd.to_numeric(universe_df["avg_set_size"], errors="coerce")
#    .astype(float)
#    .to_numpy()
#)

In [16]:
# Quick sanity checks
print(f"X dtype: {X.dtype}, shape: {X.shape}")
print(f"Y dtype: {Y.dtype}, min: {Y.min():.3f}, max: {Y.max():.3f}")

X dtype: float64, shape: (6, 2)
Y dtype: float64, min: 1.123, max: 1.144


In [18]:
# Fit fANOVA's random forest on the encoded universes. SEED is forwarded so the
# forest, and therefore the importances, are reproducible across kernel
# restarts; with SEED = None fANOVA falls back to its unseeded default.
fanova = fANOVA(X, Y, config_space=cs, seed=SEED)



**Caveat:** fANOVA can fail when the underlying random forest ends up with zero-variance trees. That is likely here, because there are only six universes (rows) and the target barely varies (the average set size ranges from 1.123 to 1.144).

In [22]:
# `quantify_importance` returns a dict keyed by the tuple of dimension indices,
# e.g. {(0,): {"individual importance": ..., "total importance": ...}}.
# The metric must therefore be looked up under that tuple, not directly on the
# returned dict.
# NOTE(review): the IndexError recorded in this notebook may have a separate
# cause (e.g. degenerate trees on so few rows) — re-run to confirm.
print("\n=== Main‑effect importance (average prediction‑set size) ===")
for i, hp in enumerate(hp_order):
    dims = (i,)
    imp = fanova.quantify_importance(dims)[dims]["individual importance"]
    print(f"{hp:20s}: {imp:.4f}")

print("\n=== Pairwise interaction importance ===")
for i in range(len(hp_order)):
    for j in range(i + 1, len(hp_order)):
        pair = (i, j)
        # NOTE(review): "total importance" of a pair presumably includes the
        # two main effects; "individual importance" would be the pure
        # interaction term — confirm which one is intended here.
        imp = fanova.quantify_importance(pair)[pair]["total importance"]
        if imp > 0:  # skip zero‑variance interactions for brevity
            print(f"{hp_order[i]} × {hp_order[j]:15s}: {imp:.4f}")


=== Main‑effect importance (average prediction‑set size) ===


IndexError: list index out of range

In [None]:
# Render fANOVA's plots into OUTPUT_DIR; skipped entirely when OUTPUT_DIR is None.
if OUTPUT_DIR is not None:
    print("Generating visualisation files …")
    # Create the target directory up front so the Visualizer has somewhere to write.
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    visualizer = Visualizer(fanova, cs, str(OUTPUT_DIR))
    # Constructing the Visualizer does not produce any files by itself; the
    # plots are only written once a plotting method is called.
    visualizer.create_all_plots()
    print(f"Plots written to '{OUTPUT_DIR}'.")
