In [10]:
import shap
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from xgboost import XGBClassifier

# Build a small, reproducible synthetic dataset.
# The global NumPy seed is fixed, and the draws below happen in a fixed
# order (Feature1, Feature2, Feature3, then the target noise), so every
# run produces exactly the same numbers.
np.random.seed(42)
n_samples = 100

data = {
    'Feature1': np.random.normal(0, 1, n_samples),
    'Feature2': np.random.normal(2, 1.5, n_samples),
    'Feature3': np.random.uniform(-1, 1, n_samples),
}

# Tabular view of the three features
df = pd.DataFrame(data)

# Continuous target: a linear mix of Feature1 and Feature2 plus unit-variance
# Gaussian noise. Feature3 does not enter the target at all.
noise = np.random.normal(0, 1, n_samples)
df['Target'] = df['Feature1'] * 0.5 + df['Feature2'] * 1.5 + noise

# Feature matrix: every column except the target
X = df.drop(columns=['Target'])

# Binarise the target by splitting it at its median (two quantile bins);
# labels=False yields the integer bin codes 0 / 1 instead of interval labels.
y = pd.qcut(df['Target'], q=2, labels=False)

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a gradient-boosted classifier with its default hyperparameters
model = XGBClassifier()
model.fit(X_train, y_train)

# Build a SHAP explainer for the fitted model and explain the training rows
explainer = shap.Explainer(model)
shap_values = explainer(X_train)

# Global importance of each feature: the mean absolute SHAP value over all
# explained rows. (The name `shap_sum` is kept so later cells that refer to
# it keep working; note that it holds a mean, not a sum.)
shap_sum = np.abs(shap_values.values).mean(axis=0)

# Build the table from a dict of columns so that 'shap_importance' gets a
# proper float dtype. Stacking the names and the scores into one frame and
# transposing it would leave both columns as object dtype, which breaks
# numeric operations such as nlargest() or round() on the scores.
importance_df = pd.DataFrame({
    'feature': X_train.columns.tolist(),
    'shap_importance': shap_sum.tolist(),
})

# Sort features by importance, most important first
importance_df = importance_df.sort_values('shap_importance', ascending=False)

# Keep at most `top_k` features (this dataset has only 3, so all are kept)
top_k = 10
selected_features = importance_df['feature'].head(top_k).tolist()

# Show the results
print(importance_df)
print(selected_features)

    feature shap_importance
1  Feature2        2.975537
0  Feature1        0.516939
2  Feature3        0.379312
['Feature2', 'Feature1', 'Feature3']
