- https://betterdatascience.com/feature-importance-python/
- https://towardsdatascience.com/understanding-feature-importance-and-how-to-implement-it-in-python-ff0287b20285

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier


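- RestingBP: a value of 0 stands for a missing measurement (treat it as NaN)
- Cholesterol: a value of 0 stands for a missing measurement (treat it as NaN)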
  

In [None]:
# Path to the heart-failure CSV. Download it from Kaggle first and point this
# at your local copy:
# https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
DATA_PATH = "./data/heart.csv"

# Seed passed to every random_state argument in this notebook.
RANDOM_STATE = 42

# Hold-out fractions: TEST_SIZE is taken from the full data set, VALID_SIZE
# from what remains after the test split.
TEST_SIZE = 0.2
VALID_SIZE = 0.25

# Imputation strategy handed to the numeric SimpleImputer.
NUMERIC_TRANSFORMER_REPLACEMENT = "median"

In [None]:
# Explicit dtypes for the non-numeric columns: string-valued ones are parsed as
# categoricals, flag columns as booleans. Every other column keeps the dtype
# pandas infers.
dtype_dic = {
    **dict.fromkeys(
        ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
        'category',
    ),
    **dict.fromkeys(['FastingBS', 'HeartDisease'], 'bool'),
}
df = pd.read_csv(DATA_PATH, sep=',', dtype=dtype_dic)

# Collapse ExerciseAngina to a boolean flag: True where the raw value is 'Y'.
df['ExerciseAngina'] = df['ExerciseAngina'] == 'Y'

In [None]:
# Print a concise summary of the loaded frame (column dtypes and non-null counts).
df.info()

In [None]:
# Count NaN entries per column. Zeros are ordinary values to isna(), so any
# zero used as a missing-value placeholder is not counted here.
df.isna().sum()

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

In [None]:
# Separate the predictors from the HeartDisease target.
X = df.drop(columns=["HeartDisease"])
y = df["HeartDisease"]

# Step 1: hold out the test set from the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Step 2: carve the validation set out of the remaining training data.
# With VALID_SIZE = 0.25 of the remaining 0.8, validation is 0.2 of the whole.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=VALID_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Route columns by dtype: numeric columns are scaled, categorical and boolean
# columns are one-hot encoded.
numeric_features = X_train.select_dtypes('number').columns
categorical_features = (
    X_train.select_dtypes('category').columns.to_list()
    + X_train.select_dtypes('bool').columns.to_list()
)

# The notebook's data notes flag 0 as a missing-value placeholder only for
# RestingBP and Cholesterol. Imputing 0 in every numeric column would also
# overwrite genuine zero readings in the other columns, so the zero-imputing
# pipeline is restricted to these two.
missing_values_ind = 0
zero_as_missing_features = [
    col for col in numeric_features if col in ("RestingBP", "Cholesterol")
]
plain_numeric_features = [
    col for col in numeric_features if col not in zero_as_missing_features
]

# Replace the placeholder with the configured statistic, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=missing_values_ind, strategy=NUMERIC_TRANSFORMER_REPLACEMENT)),
        ("scaler", StandardScaler()),
    ]
)


# fmt: off
# handle_unknown="ignore": categories unseen during fit encode as all zeros
# instead of raising at transform time.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, zero_as_missing_features),
        ("num_plain", StandardScaler(), plain_numeric_features),
        ("onehot", one_hot_enc, categorical_features),
    ]
)
preprocessor
# fmt: on

In [None]:
# End-to-end model: the column preprocessor feeds an XGBoost classifier trained
# with the binary logistic objective.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "classifier",
            xgb.XGBClassifier(random_state=RANDOM_STATE, objective="binary:logistic"),
        ),
    ]
)

In [None]:
# Fit preprocessing and classifier together on the training split only.
clf.fit(X_train, y_train)

# Report the pipeline's score on each hold-out split, labelled so the two
# numbers cannot be confused (both lines previously printed "model score").
print(f"model score (validation): {clf.score(X_val, y_val):.3f}")
print(f"model score (test): {clf.score(X_test, y_test):.3f}")

____


In [None]:
# Names of the columns produced by the fitted preprocessor step.
clf["preprocessor"].get_feature_names_out()


In [None]:
mod = clf['classifier']

# The classifier is fitted on the preprocessor's array output, which carries no
# column names, so the booster reports importances under positional keys
# ("f0", "f1", ...). Map them back to the preprocessor's output names so the
# chart is readable.
transformed_names = clf['preprocessor'].get_feature_names_out()


def _readable_feature_name(key):
    """Return the transformed column name for a positional key like 'f12'."""
    position = key[1:]
    if key[:1] == "f" and position.isdigit() and int(position) < len(transformed_names):
        return transformed_names[int(position)]
    # Already a real name (or an unexpected format): keep it unchanged.
    return key


# 'weight' counts how many times a feature is used to split across all trees.
# Features never used in a split are absent from the result.
raw_scores = mod.get_booster().get_score(importance_type='weight')
feature_important = {
    _readable_feature_name(key): score for key, score in raw_scores.items()
}

keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by="score", ascending=False)
data.nlargest(40, columns="score").plot(kind='barh', figsize=(20, 10))

In [None]:
# Pair each transformed feature with the fitted classifier's importance score.
# The names come from the preprocessor, not from X_train.columns: one-hot
# encoding changes the number of columns, so the raw column list would not
# line up with feature_importances_.
fitted_classifier = clf['classifier']
importances = pd.DataFrame(data={
    'Attribute': clf['preprocessor'].get_feature_names_out(),
    'Importance': fitted_classifier.feature_importances_,
})
importances = importances.sort_values(by='Importance', ascending=False)

# Plot through pandas, as the other charts in this notebook do; no separate
# matplotlib import is needed.
ax = importances.plot(
    kind='bar', x='Attribute', y='Importance', color='#087E8B', legend=False
)
ax.set_title('Feature importances obtained from the fitted XGBoost classifier', size=20)
ax.tick_params(axis='x', labelrotation=90)

In [None]:
# Transformed feature names as a plain Python list.
clf.named_steps['preprocessor'].get_feature_names_out().tolist()

In [None]:
# Transformed feature names exactly as the preprocessor returns them.
clf.named_steps['preprocessor'].get_feature_names_out()

In [None]:
# Number of columns the preprocessor emits.
len(clf.named_steps['preprocessor'].get_feature_names_out())

In [None]:
# Number of features for which the booster reported a score.
len(feature_important)

In [None]:
# Draw XGBoost's built-in importance chart for the fitted booster.
# NOTE(review): the booster was fitted on the preprocessor's array output, so
# the bars are presumably labelled f0, f1, ... rather than by column name - confirm.
xgb.plot_importance(mod.get_booster())



In [None]:
# Per-feature importance scores exposed by the fitted classifier.
mod.feature_importances_

In [None]:
# Display an XGBClassifier built with default arguments. The instance is
# neither fitted nor assigned, so this cell only shows its repr.
xgb.XGBClassifier()