In [9]:
import os

import numpy as np
import pandas as pd
import shap
import sklearn
#Classification
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
#Regression
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

In [10]:
#binary outcome
def cal_abs_shap_dict(X, y, index, model_name):
    """Fit a classifier for a binary outcome and return mean absolute SHAP values.

    The model is fitted on all of ``X``/``y``; SHAP values are then computed only
    for the rows ``X.iloc[index]`` and averaged in absolute value over those rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Used for fitting and, for the model-agnostic explainers,
        also handed to ``shap.Explainer`` as its second (masker) argument.
    y : array-like
        Outcome passed unchanged to ``model.fit``.
    index : list of int
        Positional row selector forwarded to ``X.iloc``.
    model_name : str
        One of ``"catb"``, ``"ligb"``, ``"xgb"``, ``"gbdt"``, ``"rf"``,
        ``"logistic"``, ``"svm"``, ``"nn"``, ``"knn"``.

    Returns
    -------
    numpy.ndarray
        1-D array with one importance value per column of ``X``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the names listed above.
    """
    # (name, explainer kind, estimator factory). Factories are lambdas so that
    # only the requested estimator is ever constructed.
    #   "tree"  -> shap.TreeExplainer(model)
    #   "auto"  -> shap.Explainer(model), shap chooses the algorithm
    #   "proba" -> shap.Explainer(model.predict_proba, X), model-agnostic
    registry = (
        ("catb", "tree", lambda: CatBoostClassifier(random_state=2025, verbose=False)),
        ("ligb", "tree", lambda: LGBMClassifier(random_state=2025)),
        ("xgb", "tree", lambda: XGBClassifier(random_state=2025, eval_metric='logloss')),
        ("gbdt", "auto", lambda: GradientBoostingClassifier(random_state=2025)),
        ("rf", "tree", lambda: RandomForestClassifier(random_state=2025)),
        ("logistic", "proba", lambda: LogisticRegression(random_state=2025)),
        # probability=True is what enables predict_proba on SVC, so the SVM is
        # explained on probabilities like the other non-tree classifiers
        # (previously it was explained on hard labels from model.predict).
        ("svm", "proba", lambda: SVC(probability=True, random_state=2025)),
        ("nn", "proba", lambda: MLPClassifier(random_state=2025)),
        ("knn", "proba", lambda: KNeighborsClassifier()),
    )
    for known_name, explainer_kind, make_model in registry:
        if model_name == known_name:
            break
    else:
        raise ValueError(f"Unrecognized model name: {model_name}. Please check your input.")

    model = make_model()
    model.fit(X, y)

    if explainer_kind == "tree":
        explainer = shap.TreeExplainer(model)
    elif explainer_kind == "auto":
        explainer = shap.Explainer(model)
    else:
        explainer = shap.Explainer(model.predict_proba, X)

    rows_to_explain = X.iloc[index]
    explanation = explainer(rows_to_explain)
    importance = np.asarray(explanation.abs.mean(axis=0).values)

    # Some explainers report one slice per class, giving (n_features, n_classes)
    # after averaging. Keep the first class, as the original per-model `[:, 0]`
    # did; checking ndim instead of the model name keeps the result 1-D whichever
    # shape the installed shap version returns. With two classes whose outputs
    # sum to a constant, both slices have the same absolute values.
    if importance.ndim == 2:
        importance = importance[:, 0]
    return importance

In [11]:
#continuous outcome
def cal_abs_shap_quan(X, y, index, model_name):
    """Fit a regressor for a continuous outcome and return mean absolute SHAP values.

    The regressor is fitted on all of ``X``/``y``. SHAP values are computed for
    the rows ``X.iloc[index]`` only, then averaged in absolute value over rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Used for fitting and, for the non-tree models, also given
        to ``shap.Explainer`` as its second argument.
    y : array-like
        Outcome passed unchanged to ``fit``.
    index : list of int
        Positional row selector forwarded to ``X.iloc``.
    model_name : str
        One of ``"catb"``, ``"ligb"``, ``"xgb"``, ``"gbdt"``, ``"rf"``,
        ``"linear"``, ``"svm"``, ``"nn"``, ``"knn"``.

    Returns
    -------
    The ``.values`` of the row-averaged absolute SHAP explanation
    (one entry per column of ``X``).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the names listed above.
    """
    # (name, explained with TreeExplainer?, estimator factory). The factories are
    # lambdas so that only the requested estimator is constructed.
    registry = (
        ("catb", True, lambda: CatBoostRegressor(random_state=2025, verbose=False)),
        ("ligb", True, lambda: LGBMRegressor(random_state=2025)),
        ("xgb", True, lambda: XGBRegressor(random_state=2025, eval_metric='rmse')),
        ("gbdt", True, lambda: GradientBoostingRegressor(random_state=2025)),
        ("rf", True, lambda: RandomForestRegressor(random_state=2025)),
        ("linear", False, lambda: LinearRegression()),
        ("svm", False, lambda: SVR()),
        ("nn", False, lambda: MLPRegressor(random_state=2025)),
        ("knn", False, lambda: KNeighborsRegressor()),
    )
    for known_name, is_tree_model, make_regressor in registry:
        if model_name == known_name:
            break
    else:
        raise ValueError(f"Unrecognized model name: {model_name}. Please check your input.")

    regressor = make_regressor()
    regressor.fit(X, y)

    if is_tree_model:
        explainer = shap.TreeExplainer(regressor)
    else:
        # Model-agnostic route: explain the prediction function itself.
        explainer = shap.Explainer(regressor.predict, X)

    rows_to_explain = X.iloc[index]
    explanation = explainer(rows_to_explain)
    return explanation.abs.mean(axis=0).values

In [12]:
def calculate_fi(X, y, index, models, binary=True, M=5, kopath="", outpath=""):
    """Compute mean-|SHAP| feature importances for ``X`` and ``M`` further feature
    tables read from disk, and write one CSV per model.

    For every model name an ``(M + 1, n_features)`` table is built: row 0 comes
    from ``X`` and row ``i`` (1-based) from ``<kopath>/X_k<i>.csv``. The table is
    saved as ``<outpath>/<model_name>_fi.csv`` without an index column.

    Parameters
    ----------
    X : pandas.DataFrame
        Original feature table.
    y : pandas.DataFrame, pandas.Series or array-like
        Outcome; flattened to 1-D before use.
    index : pandas.DataFrame
        Column ``0`` holds the positional row numbers (used with ``.iloc``) of
        the rows whose SHAP values are averaged.
    models : iterable of str
        Model names understood by ``cal_abs_shap_dict`` / ``cal_abs_shap_quan``.
    binary : bool, default True
        Truthy selects ``cal_abs_shap_dict`` (classification); otherwise
        ``cal_abs_shap_quan`` (regression) is used.
    M : int, default 5
        Number of ``X_k<i>.csv`` files to process.
        NOTE(review): presumably these are knockoff copies of ``X`` - confirm.
    kopath : str, default ""
        Directory containing the ``X_k<i>.csv`` files; ``""`` means the current
        working directory.
    outpath : str, default ""
        Directory for the result CSVs; created if missing. ``""`` means the
        current working directory.

    Raises
    ------
    ValueError
        If an ``X_k<i>.csv`` file does not have as many columns as ``X``, or
        (from the scoring helper) if a model name is not recognised.
    """
    n_features = X.shape[1]
    target = np.asarray(y).ravel()
    row_positions = index[0].astype(int).tolist()

    # Both outcome types share one workflow; only the scoring helper differs.
    score = cal_abs_shap_dict if binary else cal_abs_shap_quan

    # Create the output folder up front so that a missing directory does not
    # surface only after a model's fits have all finished.
    if outpath:
        os.makedirs(outpath, exist_ok=True)

    for model_name in models:
        fi_table = pd.DataFrame(np.zeros((M + 1, n_features)))
        fi_table.iloc[0] = score(X, target, row_positions, model_name=model_name)

        print(f"{model_name}: Original, Done!")

        for i in range(M):
            # os.path.join keeps the path relative when kopath is "" (plain
            # f"{kopath}/..." would point at the filesystem root instead).
            ko_file = os.path.join(kopath, f"X_k{i + 1}.csv")
            ko_data = pd.read_csv(ko_file)
            if ko_data.shape[1] != n_features:
                # Fail before fitting rather than on the row assignment below.
                raise ValueError(
                    f"{ko_file} has {ko_data.shape[1]} columns but X has {n_features}."
                )
            fi_table.iloc[i + 1] = score(ko_data, target, row_positions, model_name=model_name)
            print(f"{model_name}: Index: {i + 1}, Done!")

        file_path = os.path.join(outpath, f"{model_name}_fi.csv")
        fi_table.to_csv(file_path, index=False)
        print(f"{model_name}: Write FI, Done!")
    

In [None]:
# Binary-outcome run: SHAP feature importances for the five tree-based models.
# NOTE(review): the continuous-outcome cell below uses the same outpath and model
# names, so its `<model>_fi.csv` files would replace the ones written here if both
# cells are run - confirm that is intended.
X = pd.read_csv('/Data/X.csv')
y = pd.read_csv('/Data/y_dict.csv')
# Index.csv is read without a header row; calculate_fi uses its column 0 as
# positional row numbers.
index = pd.read_csv('/Data/Index.csv', header=None)
calculate_fi(
    X,
    y,
    index,
    models=['catb', 'ligb', 'xgb', 'gbdt', 'rf'],
    binary=True,
    M=5,
    kopath="/Data",
    outpath="/Data",
)

In [None]:
# Continuous-outcome run: SHAP feature importances for the five tree-based models.
# NOTE(review): the binary-outcome cell above uses the same outpath and model
# names, so the `<model>_fi.csv` files written here would replace its results if
# both cells are run - confirm that is intended.
X = pd.read_csv('/Data/X.csv')
y = pd.read_csv('/Data/y_quan.csv')
# Index.csv is read without a header row; calculate_fi uses its column 0 as
# positional row numbers.
index = pd.read_csv('/Data/Index.csv', header=None)
calculate_fi(
    X,
    y,
    index,
    models=['catb', 'ligb', 'xgb', 'gbdt', 'rf'],
    binary=False,
    M=5,
    kopath="/Data",
    outpath="/Data",
)