# Shap variance penalty

[![open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ing-bank/probatus/blob/master/docs/tutorials/nb_shap_variance_penalty_and_results_comparison.ipynb)

When ShapRFECV is computing feature importance and subsequently eliminating features, it computes the average of shap values to get an estimate of that feature's overall importance. In some situations, the variance of these shap values might be high - which might indicate a lack of agreement regarding that feature's importance. Catering to this situation, probatus allows you to penalize features that have a higher variance of shap values.

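By setting the `shap_variance_penalty_factor` parameter of the `fit_compute()` method, the average of the shap values is computed as:

`penalized_shap_mean = mean_shap - (std_shap * shap_variance_penalty_factor)`

where `mean_shap` and `std_shap` are the mean and the standard deviation of a feature's shap values. Setting `shap_variance_penalty_factor=0` disables the penalty.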

See example below:

In [1]:
%%capture
!pip install probatus
!pip install catboost

In [2]:
from probatus.feature_elimination import ShapRFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd

In [3]:
# Synthetic dataset: 500 samples, 100 features, 20 of which are informative.
X, y = make_classification(n_samples=500, n_features=100, n_informative=20)

# The estimator whose shap values drive the feature elimination.
model = CatBoostClassifier(n_estimators=100, verbose=0)

shap_elimination = ShapRFECV(
    model=model,
    step=0.2,
    min_features_to_select=5,
    cv=5,
    scoring="f1",
    n_jobs=5,
    verbose=1,
)

# Run the elimination twice on the same data: first with the variance penalty
# (factor 1.0), then with the penalty switched off (factor 0).
report_with_penalty = shap_elimination.fit_compute(
    X,
    y,
    shap_variance_penalty_factor=1.0,
)
report_without_penalty = shap_elimination.fit_compute(
    X,
    y,
    shap_variance_penalty_factor=0,
)

In [4]:
report_with_penalty

Unnamed: 0,num_features,features_set,eliminated_features,train_metric_mean,train_metric_std,val_metric_mean,val_metric_std
1,100,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[48, 23, 25, 17, 41, 79, 70, 67, 96, 95, 52, 5...",1.0,0.0,0.783734,0.036136
2,80,"[0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14...","[29, 44, 31, 80, 34, 42, 60, 87, 77, 75, 64, 7...",1.0,0.0,0.818636,0.027409
3,64,"[0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 1...","[63, 59, 83, 12, 38, 90, 93, 16, 49, 94, 8, 1]",1.0,0.0,0.809475,0.040263
4,52,"[0, 2, 3, 4, 5, 9, 10, 11, 13, 14, 15, 18, 19,...","[62, 14, 36, 18, 3, 4, 24, 74, 82, 89]",1.0,0.0,0.825634,0.027513
5,42,"[0, 2, 5, 9, 10, 11, 13, 15, 19, 20, 21, 22, 3...","[2, 66, 68, 39, 71, 72, 22, 99]",1.0,0.0,0.858765,0.031187
6,34,"[0, 5, 9, 10, 11, 13, 15, 19, 20, 21, 30, 35, ...","[19, 69, 53, 30, 37, 15]",1.0,0.0,0.845318,0.034718
7,28,"[0, 5, 9, 10, 11, 13, 20, 21, 35, 40, 43, 45, ...","[21, 43, 98, 57, 0]",1.0,0.0,0.847304,0.02902
8,23,"[5, 9, 10, 11, 13, 20, 35, 40, 45, 46, 47, 51,...","[45, 20, 84, 88]",1.0,0.0,0.863716,0.027382
9,19,"[5, 9, 10, 11, 13, 35, 40, 46, 47, 51, 54, 58,...","[13, 76, 92]",0.972956,0.005839,0.8151,0.035161
10,16,"[5, 9, 10, 11, 35, 40, 46, 47, 51, 54, 58, 65,...","[47, 51, 35]",0.969608,0.003283,0.823234,0.055277


In [5]:
report_without_penalty

Unnamed: 0,num_features,features_set,eliminated_features,train_metric_mean,train_metric_std,val_metric_mean,val_metric_std
1,100,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[48, 23, 25, 17, 41, 79, 70, 67, 96, 95, 52, 5...",1.0,0.0,0.783734,0.036136
2,80,"[0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14...","[29, 44, 31, 80, 34, 42, 60, 87, 77, 75, 64, 7...",1.0,0.0,0.818636,0.027409
3,64,"[0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 1...","[63, 59, 83, 12, 38, 90, 93, 16, 49, 94, 8, 1]",1.0,0.0,0.809475,0.040263
4,52,"[0, 2, 3, 4, 5, 9, 10, 11, 13, 14, 15, 18, 19,...","[62, 14, 36, 18, 3, 4, 24, 74, 82, 89]",1.0,0.0,0.825634,0.027513
5,42,"[0, 2, 5, 9, 10, 11, 13, 15, 19, 20, 21, 22, 3...","[2, 66, 68, 39, 71, 72, 22, 99]",1.0,0.0,0.858765,0.031187
6,34,"[0, 5, 9, 10, 11, 13, 15, 19, 20, 21, 30, 35, ...","[19, 69, 53, 30, 37, 15]",1.0,0.0,0.845318,0.034718
7,28,"[0, 5, 9, 10, 11, 13, 20, 21, 35, 40, 43, 45, ...","[21, 43, 98, 57, 0]",1.0,0.0,0.847304,0.02902
8,23,"[5, 9, 10, 11, 13, 20, 35, 40, 45, 46, 47, 51,...","[45, 20, 84, 88]",1.0,0.0,0.863716,0.027382
9,19,"[5, 9, 10, 11, 13, 35, 40, 46, 47, 51, 54, 58,...","[13, 76, 92]",0.972956,0.005839,0.8151,0.035161
10,16,"[5, 9, 10, 11, 35, 40, 46, 47, 51, 54, 58, 65,...","[47, 51, 35]",0.969608,0.003283,0.823234,0.055277


# Which approach is better?

Let's compare a few different configurations of RFECV.

In [6]:
# Compare A: shap_variance_penalty_factor=0 (disabled) & approximate=True
# vs B: shap_variance_penalty_factor=0.5 & approximate=True
# (in the results table the "_a" columns hold the run without the penalty,
# the "_b" columns the run with it)
num_simulations = 5  # number of randomly generated datasets to evaluate
results = []  # one row per simulation, filled in by the loop below


def get_best_idx(shap_report):
    """Return the position of the iteration with the highest validation score.

    Args:
        shap_report: Report DataFrame returned by ``fit_compute()``; it must
            contain a ``val_metric_mean`` column.

    Returns:
        Integer position (usable with ``.iloc``) of the row whose
        ``val_metric_mean`` is the largest. The first such row wins on ties.

    The report is only read: no helper column is written back into the
    caller's DataFrame.
    """
    return shap_report["val_metric_mean"].argmax()


# Column names for the results table; the order must match the rows appended
# to `results` inside the loop.
result_columns = [
    "best_score_a",
    "std_a",
    "num_features_a",
    "best_score_b",
    "std_b",
    "num_features_b",
    "n_samples",
    "n_features",
    "n_informative",
]

for i in range(num_simulations):
    # Params
    n_samples = np.random.randint(100, 500)
    n_features = 200
    # make_classification adds n_redundant=2 features by default and raises a
    # ValueError when n_informative + n_redundant + n_repeated > n_features,
    # so n_informative is capped at n_features - 2 (randint's upper bound is
    # exclusive).
    n_informative = np.random.randint(10, n_features - 1)
    test_size = np.random.uniform(0.05, 0.5)

    # Create data
    X, y = make_classification(n_samples=n_samples, n_informative=n_informative, n_features=n_features)
    # Only the training part is passed to ShapRFECV below; the held-out part is not used in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Model
    model = CatBoostClassifier(n_estimators=100, verbose=0)

    # Best score from ShapRFECV WITHOUT penalization (configuration A)
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_a = shap_elimination.fit_compute(
        X_train, y_train, shap_variance_penalty_factor=0, approximate=True, check_additivity=False
    )
    best_idx_a = get_best_idx(report_a)
    best_score_a = report_a["val_metric_mean"].iloc[best_idx_a]
    std_a = report_a["val_metric_std"].iloc[best_idx_a]
    num_features_a = report_a["num_features"].iloc[best_idx_a]

    # Best score from ShapRFECV WITH penalization (configuration B)
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_b = shap_elimination.fit_compute(
        X_train, y_train, shap_variance_penalty_factor=0.5, approximate=True, check_additivity=False
    )
    best_idx_b = get_best_idx(report_b)
    best_score_b = report_b["val_metric_mean"].iloc[best_idx_b]
    std_b = report_b["val_metric_std"].iloc[best_idx_b]
    num_features_b = report_b["num_features"].iloc[best_idx_b]

    results.append(
        [best_score_a, std_a, num_features_a, best_score_b, std_b, num_features_b, n_samples, n_features, n_informative]
    )

# Build the table once, after all simulations have finished, instead of
# rebuilding it on every iteration. This also keeps `results_df` defined when
# num_simulations is 0.
results_df = pd.DataFrame(results, columns=result_columns)

# Show results
results_df

Unnamed: 0,best_score_a,std_a,num_features_a,best_score_b,std_b,num_features_b,n_samples,n_features,n_informative
0,0.892217,0.028543,8,0.894579,0.013364,20,188,200,13
1,0.797291,0.0362,24,0.801302,0.040278,36,404,200,141
2,0.741265,0.02134,29,0.701914,0.028287,54,499,200,180
3,0.786747,0.087275,8,0.787802,0.065435,24,179,200,183
4,0.834444,0.067287,7,0.770303,0.091079,5,176,200,198


In [7]:
# Compare A: shap_variance_penalty_factor=0 (disabled) & approximate=False
# vs B: shap_variance_penalty_factor=0.5 & approximate=False
# (in the results table the "_a" columns hold the run without the penalty,
# the "_b" columns the run with it)
num_simulations = 5  # number of randomly generated datasets to evaluate
results = []  # one row per simulation, filled in by the loop below

# Column names for the results table; the order must match the rows appended
# to `results` inside the loop.
result_columns = [
    "best_score_a",
    "std_a",
    "num_features_a",
    "best_score_b",
    "std_b",
    "num_features_b",
    "n_samples",
    "n_features",
    "n_informative",
]

for i in range(num_simulations):
    # Params
    n_samples = np.random.randint(100, 500)
    n_features = 200
    # make_classification adds n_redundant=2 features by default and raises a
    # ValueError when n_informative + n_redundant + n_repeated > n_features,
    # so n_informative is capped at n_features - 2 (randint's upper bound is
    # exclusive).
    n_informative = np.random.randint(10, n_features - 1)
    test_size = np.random.uniform(0.05, 0.5)

    # Create data
    X, y = make_classification(n_samples=n_samples, n_informative=n_informative, n_features=n_features)
    # Only the training part is passed to ShapRFECV below; the held-out part is not used in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Model
    model = CatBoostClassifier(n_estimators=100, verbose=0)

    # Best score from ShapRFECV WITHOUT penalization (configuration A)
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_a = shap_elimination.fit_compute(X_train, y_train, shap_variance_penalty_factor=0, approximate=False)
    best_idx_a = get_best_idx(report_a)
    best_score_a = report_a["val_metric_mean"].iloc[best_idx_a]
    std_a = report_a["val_metric_std"].iloc[best_idx_a]
    num_features_a = report_a["num_features"].iloc[best_idx_a]

    # Best score from ShapRFECV WITH penalization (configuration B)
    shap_elimination = ShapRFECV(
        model=model, step=0.2, min_features_to_select=5, cv=5, scoring="f1", n_jobs=5, verbose=1
    )
    report_b = shap_elimination.fit_compute(X_train, y_train, shap_variance_penalty_factor=0.5, approximate=False)
    best_idx_b = get_best_idx(report_b)
    best_score_b = report_b["val_metric_mean"].iloc[best_idx_b]
    std_b = report_b["val_metric_std"].iloc[best_idx_b]
    num_features_b = report_b["num_features"].iloc[best_idx_b]

    results.append(
        [best_score_a, std_a, num_features_a, best_score_b, std_b, num_features_b, n_samples, n_features, n_informative]
    )

# Build the table once, after all simulations have finished, instead of
# rebuilding it on every iteration. This also keeps `results_df` defined when
# num_simulations is 0.
results_df = pd.DataFrame(results, columns=result_columns)

# Show results
results_df

Unnamed: 0,best_score_a,std_a,num_features_a,best_score_b,std_b,num_features_b,n_samples,n_features,n_informative
0,0.742664,0.091943,11,0.773073,0.095611,13,250,200,43
1,0.829656,0.052365,20,0.798127,0.053808,29,327,200,24
2,0.724558,0.043146,83,0.746103,0.022388,20,394,200,179
3,0.822537,0.044845,36,0.825153,0.038366,29,479,200,60
4,0.729214,0.038897,83,0.731563,0.024997,54,485,200,176
