In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import ParameterGrid

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import Ridge, LogisticRegression, LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, fbeta_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import shap

import warnings
warnings.filterwarnings("ignore")

# 0. functions used

In [None]:
# split the dataset into Other set and Test set
def basic_split(X, y, other_size, test_size, random_state):
    """Split X and y into an "other" (train + validation) part and a test part.

    Parameters
    ----------
    X : object with a 2-D ``.shape`` (a pandas DataFrame is expected)
    y : object with a 1-D ``.shape`` (a pandas Series is expected), same row count as X
    other_size : float
        Fraction of rows kept in the "other" set.
    test_size : float
        Fraction of rows kept in the test set; ``other_size + test_size`` must be 1.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    X_other, y_other, X_test, y_test
        Note the order: both "other" objects come first, then both test objects.

    Raises
    ------
    ValueError
        If any of the input checks below fails; the message names the failed check.
    """
    # compare with a tolerance: sums of decimal fractions are not always exactly 1.0 in floating point
    if abs(other_size + test_size - 1) > 1e-9:
        raise ValueError("The split size does not add up to 1")
    if not isinstance(random_state, int):
        raise ValueError("The random state entered is not an integer")
    if X.shape[0] != y.shape[0]:
        raise ValueError("The number of rows in X is not the same as the length of y")
    if len(X.shape) != 2:
        raise ValueError("Input X is not a 2-D pandas dataframe")
    if len(y.shape) != 1:
        raise ValueError("Input y is not a 1-D pandas series")

    # only train_size is passed; the test part is then the complement,
    # which the first check guarantees equals test_size
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, train_size=other_size, random_state=random_state)

    return X_other, y_other, X_test, y_test

# function for the ML pipeline as outlined above 
def MLpipe_KFold_accuracy(X, y, preprocessor, ML_algo, param_grid,
                          n_states=5, train_size=0.8, n_splits=4, verbose=3):
    """Tune and evaluate one classifier over several random train/test splits.

    For each of ``n_states`` seeds (0, 42, 84, ...) the data is split into an
    "other" and a test part, a ``GridSearchCV`` with shuffled K-fold CV is run
    on the "other" part, and the refitted best pipeline is scored (accuracy)
    on both parts.

    Parameters
    ----------
    X, y : features and target passed to ``train_test_split``
    preprocessor : first pipeline step (cloned by ``GridSearchCV`` for every fit)
    ML_algo : estimator used as the second pipeline step
    param_grid : dict for ``GridSearchCV``; keys must use the step names that
        ``make_pipeline`` generates (lower-cased class name + ``__`` + parameter)
    n_states : number of random splits (default 5, the previous hard-coded value)
    train_size : fraction of rows in the "other" set (default 0.8)
    n_splits : number of CV folds (default 4)
    verbose : verbosity forwarded to ``GridSearchCV`` (default 3)

    Returns
    -------
    test_scores : list of test accuracies, one per random state
    best_models : list of the best parameter dicts, one per random state
    results : DataFrame of ``cv_results_`` from the LAST random state only
        (``None`` if ``n_states`` is 0)
    model : DataFrame with columns best_parameter / train_score / test_score

    Note: a later cell in this notebook defines a function with the same name,
    which replaces this one for every cell executed after it.
    """
    test_scores = []
    best_models = []
    rows = []
    results = None

    for i in range(n_states):
        seed = 42 * i
        print(f'---------Random State = {seed}---------')
        # split the data
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, train_size=train_size, random_state=seed)

        # shuffled K-fold on the "other" set
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

        pipe = make_pipeline(preprocessor, ML_algo)

        # GridSearchCV loops through all parameter combinations; preprocessing is
        # fitted inside each fold because it is part of the pipeline
        grid = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy',
                            cv=kf, return_train_score=True, n_jobs=-1, verbose=verbose)

        grid.fit(X_other, y_other)
        results = pd.DataFrame(grid.cv_results_)

        # best pipeline, refitted by GridSearchCV on the whole "other" set
        final_model = grid.best_estimator_

        train_accuracy = accuracy_score(y_other, final_model.predict(X_other))
        test_accuracy = accuracy_score(y_test, final_model.predict(X_test))  # computed once, reused below

        test_scores.append(test_accuracy)
        best_models.append(grid.best_params_)
        rows.append({'best_parameter': grid.best_params_,
                     'train_score': train_accuracy,
                     'test_score': test_accuracy})

    model = pd.DataFrame(rows, columns=['best_parameter', 'train_score', 'test_score'])

    print('------------accuracy scores of each random state------------')
    print(model)

    return test_scores, best_models, results, model

# 1. before modeling

In [None]:
# Load the raw data; the file is semicolon-separated and the path is relative
# to the notebook's working directory.
df = pd.read_csv('mushroom/secondary_data.csv', sep=';')
print(df.shape)
# Preview the first rows.
df.head()

In [None]:
# Encode the two 't'/'f' flag columns as 1/0 (other values are left untouched).
flag_codes = {'t': 1, 'f': 0}
for flag_col in ('does-bruise-or-bleed', 'has-ring'):
    df[flag_col] = df[flag_col].replace(flag_codes)

# Separate the target column from the features.
y = df['class']
X = df.drop(columns='class')
print(X.shape, y.shape)

# Encode the target: 'e' -> 0, 'p' -> 1.
y = y.replace({'e': 0, 'p': 1})

# 80/20 split; basic_split returns (X_other, y_other, X_test, y_test).
X_other, y_other, X_test, y_test = basic_split(X, y, other_size = 0.8, test_size = 0.2, random_state=42)
print('The shape of each train, val, and test set are the following:')
print(f'X_other: {X_other.shape}, X_test: {X_test.shape}')
print(f'y_other: {y_other.shape}, y_test: {y_test.shape}')

# 2. Define the Preprocessor

In [None]:
# define preprocessors
binary_feature = ['does-bruise-or-bleed', 'has-ring']
onehot_features = ['cap-shape', 'cap-surface', 'cap-color', 'gill-attachment', 'gill-spacing', 'gill-color','stem-root', 
                'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'ring-type', 'spore-print-color', 'habitat', 'season']
std_features = ['cap-diameter','stem-height','stem-width']

# collect all the encoders
# The binary columns are already encoded as 0/1 in the previous cell, so they are
# passed through unchanged. Without this entry ColumnTransformer's default
# remainder='drop' silently removed them from every model.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), onehot_features),
        ('std', StandardScaler(), std_features),
        ('binary', 'passthrough', binary_feature)])


In [None]:
# Preprocessing-only pipeline (no estimator step yet). set_output returns the
# pipeline itself, so it can be chained; it makes transforms return DataFrames.
clf = Pipeline(steps=[('preprocessor', preprocessor)]).set_output(transform='pandas')

# Fit on the "other" set only, then apply the fitted transform to the test set.
X_other_prep = clf.fit_transform(X_other) # save for later use
X_test_prep = clf.transform(X_test)

for stage, frame in (('before', X_other), ('after', X_other_prep)):
    print(f'Shape of X_train {stage} transformation:', frame.shape)
print('Note: the X_train here refers to X_other in code')
X_other_prep.head()


# 3. Baseline Accuracy

In [None]:
# Baseline: predict class 1 for every test row and measure the accuracy.
# y_test here is the test part returned by basic_split (random_state=42).
# predict all class as poisonous
baseline_pred = np.ones(len(y_test))
baseline_acc = accuracy_score(y_test, baseline_pred)

print(f"Baseline Accuracy: {baseline_acc:.5f}")

# 4. Train the model

In [None]:
# define a dataframe to store the models' results
# One row is appended per model by the cells below (mean / std of the test
# accuracies over the random states, computed with numpy).
res_accuracy = pd.DataFrame(columns=['model', 'mean_test_accuracy', 'std_test_accuracy'])

## 4.1 Logistic regression

In [None]:
# Logistic Regression
# Grid over C = 1 / alpha. No `penalty` is set, so LogisticRegression's default
# penalty applies (the previous "# l1" note did not match the code).
param_grid = {
    'logisticregression__C': [1 / alpha for alpha in (0.001, 0.01, 0.1, 1.0)]
}
ML_algo = LogisticRegression(max_iter=5000, random_state=42)

print("___________Model {}___________".format('Logistic Regression'))
test_scores, best_models, res_lr, model_lr = MLpipe_KFold_accuracy(
    X, y, preprocessor, ML_algo, param_grid)

# Test accuracy of the tuned model for each random state
print("Test Scores:", test_scores)

# Aggregate over the random states and record the row in the summary table
mean_accuracy, std_accuracy = np.mean(test_scores), np.std(test_scores)
res_accuracy.loc[len(res_accuracy)] = ['Logistic Regression', mean_accuracy, std_accuracy]

print(f"Mean accuracy: {mean_accuracy}")
print(f"Standard Deviation of accuracy: {std_accuracy}")

In [None]:
# Per-random-state best parameters, train accuracy and test accuracy for logistic regression.
model_lr

In [None]:
# Second summary table: mean / std of the per-random-state test accuracies,
# computed with the pandas Series methods on model_lr['test_score'].
res_test_score = pd.DataFrame(columns=['Model', 'mean_test', 'std_test'])
res_test_score.loc[len(res_test_score)] = ['Logistic Regression', model_lr['test_score'].mean(), model_lr['test_score'].std()]
res_test_score

## 4.2 Random Forest

In [None]:
# random forest
param_grid = {
    'randomforestclassifier__max_depth': [5, 10, 20],  # RF
    'randomforestclassifier__max_features': [0.25, 0.5, 0.75],
    'randomforestclassifier__n_estimators': [20, 50, 100]
}
# max_features is always set by the grid above, so it is not given here; the
# previous max_features='auto' is no longer a valid option in recent scikit-learn.
ML_algo = RandomForestClassifier(random_state=42)

print("___________Model {}___________".format('Random Forest'))
test_scores, best_models, res_rf, model_rf= MLpipe_KFold_accuracy(X, y, preprocessor, ML_algo, param_grid)

# Print the test scores
print("Test Scores:", test_scores)

# Calculate and print the mean and standard deviation of the test scores
mean_accuracy = np.mean(test_scores)
std_accuracy = np.std(test_scores)

res_accuracy.loc[len(res_accuracy)] = ['Random Forest', mean_accuracy, std_accuracy]

print("Mean accuracy: {}".format(mean_accuracy))
print("Standard Deviation of accuracy: {}".format(std_accuracy))

In [None]:
# Per-random-state best parameters, train accuracy and test accuracy for the random forest.
model_rf

In [None]:
# Append the random-forest row (pandas mean / std of the per-state test accuracies).
res_test_score.loc[len(res_test_score)] = ['Random Forest', model_rf['test_score'].mean(), model_rf['test_score'].std()]
res_test_score

## 4.3 Support vector classification

In [None]:
# support vector classification

# function for the ML pipeline as outlined above 
# NOTE: this re-defines the function of the same name from the "functions used"
# cell; from here on this quieter variant (no per-state printing) is the one
# every later cell calls.
def MLpipe_KFold_accuracy(X, y, preprocessor, ML_algo, param_grid):
    """Tune and evaluate one classifier over five random train/test splits.

    For seeds 0, 42, 84, 126 and 168 the data is split 80/20, a GridSearchCV
    with 4 shuffled folds is run on the 80% part, and the refitted best
    pipeline is scored (accuracy) on both parts.

    Returns
    -------
    test_scores : list of test accuracies, one per random state
    best_models : list of the best parameter dicts, one per random state
    results : DataFrame of ``cv_results_`` from the LAST random state only
    model_svc : DataFrame with columns best_parameter / train_score / test_score
    """
    test_scores = []
    best_models = []
    rows = []

    for i in range(5):
        # split the data
        X_other, X_test, y_other, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42*i)

        # use KFold with 4 folds
        kf = KFold(n_splits=4,shuffle=True,random_state=42*i)

        pipe = make_pipeline(preprocessor,ML_algo)

        # GridSearchCV loops through all parameter combinations; preprocessing is
        # fitted inside each fold because it is part of the pipeline
        grid = GridSearchCV(pipe, param_grid=param_grid,scoring='accuracy',
                        cv=kf, return_train_score = True, n_jobs=-1, verbose=3)

        grid.fit(X_other, y_other)
        results = pd.DataFrame(grid.cv_results_)

        # best pipeline, refitted by GridSearchCV on the whole "other" set
        final_model = grid.best_estimator_

        # Score the model fitted in THIS iteration. The previous code used
        # `grid_search`, a name that is only bound later by the XGBoost cell.
        train_accuracy = accuracy_score(y_other, final_model.predict(X_other))
        test_accuracy = accuracy_score(y_test, final_model.predict(X_test))

        test_scores.append(test_accuracy)
        best_models.append(grid.best_params_) # save the best param

        rows.append({'best_parameter': grid.best_params_,
                     'train_score': train_accuracy,
                     'test_score': test_accuracy})

    model_svc = pd.DataFrame(rows, columns=['best_parameter', 'train_score', 'test_score'])

    return test_scores, best_models, results, model_svc

# Grid for the support vector classifier (kernel coefficient and regularisation strength)
param_grid = {
    'svc__gamma': [1e-2, 1e-1, 1e1, 1e3],
    'svc__C': [1e-1, 1e0, 1e1, 1e2]
}
ML_algo = SVC(kernel='poly', class_weight='balanced', random_state=42)

print("___________Model {}___________".format('Support Vector Classification'))
test_scores, best_models, res_svc, model_svc = MLpipe_KFold_accuracy(
    X, y, preprocessor, ML_algo, param_grid)

# Test accuracy of the tuned model for each random state
print("Test Scores:", test_scores)

# Aggregate over the random states and record the row in the summary table
mean_accuracy, std_accuracy = np.mean(test_scores), np.std(test_scores)
res_accuracy.loc[len(res_accuracy)] = ['Support Vector Classification', mean_accuracy, std_accuracy]

print(f"Mean accuracy: {mean_accuracy}")
print(f"Standard Deviation of accuracy: {std_accuracy}")
print(model_svc)

In [None]:
# Append the SVC row. Use the pandas mean / std of model_svc['test_score'] like
# every other row of this table: np.std defaults to ddof=0 while Series.std uses
# ddof=1, so mixing them made the std column inconsistent between models.
res_test_score.loc[len(res_test_score)] = ['SVC', model_svc['test_score'].mean(), model_svc['test_score'].std()]
res_test_score

## 4.4 KNN

In [None]:
# knn
# Grid over the number of neighbours
param_grid = {
    'kneighborsclassifier__n_neighbors': [3,9,12,15,30,50,100]
}
ML_algo = KNeighborsClassifier()

print("___________Model {}___________".format('knn'))
test_scores, best_models, res_knn, model_knn = MLpipe_KFold_accuracy(
    X, y, preprocessor, ML_algo, param_grid)

# Test accuracy of the tuned model for each random state
print("Test Scores:", test_scores)

# Aggregate over the random states and record the row in the summary table
mean_accuracy, std_accuracy = np.mean(test_scores), np.std(test_scores)
res_accuracy.loc[len(res_accuracy)] = ['KNN', mean_accuracy, std_accuracy]

print(f"Mean accuracy: {mean_accuracy}")
print(f"Standard Deviation of accuracy: {std_accuracy}")

In [None]:
# Per-random-state best parameters, train accuracy and test accuracy for KNN.
model_knn

In [None]:
# Append the KNN row (pandas mean / std of the per-state test accuracies).
res_test_score.loc[len(res_test_score)] = ['KNN', model_knn['test_score'].mean(), model_knn['test_score'].std()]
res_test_score

## 4.5 xgboost

In [None]:
# xgboost

# Define hyperparameter grid for tuning
param_grid = {
    'xgbclassifier__max_depth': [3, 5, 7, 10],
    'xgbclassifier__min_child_weight': [1, 3, 5],
    'xgbclassifier__learning_rate': [0.1],
    'xgbclassifier__lambda': [0.01, 0.1, 1],  # reduce overfitting
    'xgbclassifier__alpha': [0.01, 0.1, 1]  # Used for high dimensionality
}
pg = ParameterGrid(param_grid)

# Train the pipeline on five different random states
random_states = [42, 123, 456, 789, 101]
train_scores = []
test_scores = []
best_models = []
test_sets = []

for random_state in random_states:
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # use KFold with 4 folds
    kf = KFold(n_splits=4,shuffle=True,random_state=random_state)

    # Create XGBoost model. The evaluation metric is set on the estimator rather
    # than passed to fit(); recent xgboost releases no longer accept
    # eval_metric / early_stopping_rounds as fit() arguments (see references).
    model = XGBClassifier(random_state=random_state, eval_metric="logloss")

    # define pipeline; the preprocessor is cloned and fitted inside every CV fold
    pipeline = make_pipeline(preprocessor, model)

    # Create GridSearchCV
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=kf, return_train_score=True, verbose=3)

    # Fit the model on the training part only. The previous version used the
    # test split as the early-stopping eval_set, which leaks test information
    # into the model whose test accuracy is reported below.
    grid_search.fit(X_train, y_train)

    # Save train and test scores
    train_accuracy= grid_search.score(X_train, y_train)
    test_accuracy = grid_search.score(X_test, y_test)

    # Save test sets
    test_set_df = pd.DataFrame(data=np.column_stack((X_test, y_test)), columns=X.columns.tolist() + ['target_column'])
    test_sets.append(test_set_df)

    train_scores.append(train_accuracy)
    test_scores.append(test_accuracy)
    best_models.append(grid_search.best_estimator_)
    res_xgb = grid_search.cv_results_  # overwritten each iteration: holds the last random state only

    print(f"Random State: {random_state},'Best Model': {grid_search.best_estimator_}, Train Score: {train_accuracy:.4f}, Test Score: {test_accuracy:.4f}")

# Save train and test scores
scores_df = pd.DataFrame({'Random State': random_states, 'Best Model': best_models, 'Train Score': train_scores, 'Test Score': test_scores})


In [None]:
# Per-random-state best XGBoost pipeline with its train and test accuracy.
scores_df

In [None]:
# Append the XGBoost row (pandas mean / std of the per-state test accuracies).
res_test_score.loc[len(res_test_score)] = ['XGBoost', scores_df['Test Score'].mean(), scores_df['Test Score'].std()]
res_test_score

## 4.6 model results

In [None]:
# Load previously saved summary scores. This notebook never writes the file, so
# on a fresh checkout it may be missing; in that case fall back to the table
# built above, which has the same columns ('Model', 'mean_test', 'std_test').
try:
    result_scores = pd.read_csv('model_output/result_scores.csv')
except FileNotFoundError:
    print("model_output/result_scores.csv not found; using the scores computed in this notebook")
    result_scores = res_test_score.copy()
result_scores

In [None]:
# Summary table computed in this notebook, for comparison with result_scores.
res_test_score

# 5. Results

## 5.1 Comparing with baseline

In [None]:
# predict all class as poisonous
# NOTE(review): y_test is whatever the most recently executed cell assigned. In
# top-to-bottom order that is the last split of the XGBoost loop
# (random_state=101), not the random_state=42 split used for the first baseline.
baseline_pred = np.ones(len(y_test))
baseline_acc = accuracy_score(y_test, baseline_pred)

print(f"Baseline Accuracy: {baseline_acc:.5f}")

In [None]:
# Bar chart of the mean test accuracy per model, with the std as error bars and
# the all-poisonous baseline as a horizontal reference line.
test_mean = np.array(result_scores['mean_test'])
test_std = np.array(result_scores['std_test'])
labels = result_scores['Model']

# Plot the mean values
plt.figure(figsize=(10, 7))
plt.bar(labels, test_mean, yerr=test_std, capsize=5, color='skyblue', alpha=0.7)

plt.axhline(y=baseline_acc, linestyle='--', color='red', label='BaseLine Accuracy = {:.5f}'.format(baseline_acc))

# Add labels and title
plt.legend()
plt.xlabel('Machine Learning Methods')
plt.xticks(rotation=20)
plt.ylabel('Test Accuracy Score')
plt.title('Test Accuracy Score of each Machine Learning Method with Error Bars')

# Annotate each bar with "mean ± std".
# The loop variables must NOT be called x / y: cells share one namespace, and
# rebinding `y` here replaced the target Series, breaking every later
# train_test_split(X, y, ...) call. The raw f-string keeps the LaTeX \pm intact.
for bar_label, bar_mean, bar_std in zip(labels, test_mean, test_std):
    plt.text(bar_label, bar_mean - 0.1, rf'{bar_mean:.5f} $\pm$ {bar_std:.5f}', ha='center', va='bottom')

# Show the plot
plt.show()

In [None]:
# Error-bar plot of every model except the first row of result_scores
# (the first entry is skipped by the [1:] slices).
fig, ax = plt.subplots()
ax.errorbar(labels[1:], test_mean[1:], yerr=test_std[1:], fmt='o', capsize=5, label='Data with Error Bars')

ax.set_xlabel('Machine Learning Methods')
ax.set_ylabel('Test Accuracy Score')
ax.set_title('Test Accuracy Score of Non-linear Methods with Error Bars')

plt.show()

# 6. Feature Importance

## K Nearest Neighbors

In [None]:
# KNN
# Fresh 80/20 split used for the final models and the feature-importance analysis.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

# Preprocessing-only pipeline returning DataFrames (set_output returns the pipeline itself).
clf = Pipeline(steps=[('preprocessor', preprocessor)]).set_output(transform='pandas')

# Fit the preprocessing on the training part only, then transform both parts.
X_train_prep = clf.fit_transform(X_train) # save for later use
X_test_prep = clf.transform(X_test)
print(X_train_prep.shape)

feature_names = preprocessor.get_feature_names_out()

# choose KNN as the final model
final_model = KNeighborsClassifier(n_neighbors=3)
final_model.fit(X_train_prep, y_train)

In [None]:
# Predict on the preprocessed test set and report the KNN test accuracy.
y_pred = final_model.predict(X_test_prep)
print(f'test set accuracy score: {final_model.score(X_test_prep, y_test)}')

In [None]:
# Confusion matrix of the KNN predictions on the test set.
fig, ax = plt.subplots(figsize=(6,4))
cm = confusion_matrix(y_test,y_pred)
disp = ConfusionMatrixDisplay(cm,display_labels=['class 0', 'class 1'])
ax.set_title('KNN - n_neighbors=3')
disp.plot(ax=ax)
plt.tight_layout()
plt.show()

## Random Forest

In [None]:
# Random Forest
# Same 80/20 split (random_state=0) as the KNN final model above.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

clf = Pipeline(steps=[('preprocessor', preprocessor)]) # preprocessing only
clf.set_output(transform='pandas')

X_train_prep = clf.fit_transform(X_train) # save for later use
X_test_prep = clf.transform(X_test)
print(X_train_prep.shape)

feature_names = preprocessor.get_feature_names_out()

# Random forest used for the feature-importance analysis. A fixed random_state
# makes the fitted trees, and therefore the scores and importances below,
# reproducible from run to run.
rf_model = RandomForestClassifier(max_depth=20, max_features=0.25, n_estimators=20, random_state=42)
rf_model.fit(X_train_prep, y_train)

In [None]:
# Predict on the preprocessed test set and report the random-forest test accuracy.
y_pred =rf_model.predict(X_test_prep)
print(f'test set accuracy score: {rf_model.score(X_test_prep, y_test)}')

In [None]:
# Confusion matrix of the random-forest predictions on the test set.
fig, ax = plt.subplots(figsize=(6,4))
cm = confusion_matrix(y_test,y_pred)
disp = ConfusionMatrixDisplay(cm,display_labels=['class 0', 'class 1'])
ax.set_title('Random forest -max_depth=20, max_features=0.25, n_estimators=20')
disp.plot(ax=ax)
plt.tight_layout()
plt.show()

## 6.1 Global feature importance

### 6.1.1 Using KNN

#### Permutation Importance

In [None]:
# Permutation importance for the KNN model: shuffle one preprocessed column at
# a time and record how the test accuracy changes.
np.random.seed(0)

ftr_names = X_test_prep.columns

nr_runs = 3
scores = np.zeros([len(ftr_names),nr_runs])

test_score = final_model.score(X_test_prep, y_test)
print('test score = ',test_score)
print('test baseline = ', baseline_acc)
# one row of `scores` per feature, one column per shuffle
for i, ftr in enumerate(ftr_names):
    print('shuffling '+str(ftr))
    acc_scores = []
    for run in range(nr_runs):
        X_test_shuffled = X_test_prep.copy()
        X_test_shuffled[ftr] = np.random.permutation(X_test_prep[ftr].values)
        acc_scores.append(final_model.score(X_test_shuffled,y_test))
    print('   shuffled test score:',np.around(np.mean(acc_scores),3),'+/-',np.around(np.std(acc_scores),3))
    scores[i] = acc_scores

In [None]:
# Rank features by |mean shuffled score - unshuffled score| and box-plot the
# ten with the largest change.
score_change = np.abs(np.mean(scores,axis=1)-test_score)
sorted_indcs = np.argsort(score_change)
top_indcs = sorted_indcs[-10:]

plt.rcParams.update({'font.size': 11})
plt.figure(figsize=(10,6))
plt.boxplot(scores[top_indcs].T,labels=ftr_names[top_indcs],vert=False)
plt.axvline(test_score,label='test score')
plt.title("Permutation Importances (test set)")
plt.xlabel('score with perturbed feature')
plt.legend()
plt.tight_layout()
plt.show()

#### SHAP

In [None]:
# NOTE(review): interactive documentation lookup left over from development;
# it only prints the docstring of shap.kmeans and no later cell depends on it.
help(shap.kmeans)

In [None]:
shap.initjs() # required for visualizations later on
# Create a KernelExplainer for the KNN model (final_model); the background data
# is the training set summarised by k-means into 20 points.
# Alternative background (random sample), kept for reference:
#background_samples = shap.sample(X_train_prep, nsamples=120, random_state=42)
background_samples = shap.kmeans(X_train_prep, k=20)
explainer = shap.KernelExplainer(final_model.predict_proba, data=background_samples)
# the test set is already preprocessed; this is only an alias
X_test_transformed = X_test_prep
print(np.shape(X_test_transformed))
# calculate shap values on the first 1000 points in the test
shap_values_knn = explainer.shap_values(X_test_transformed[:1000])
print(np.shape(shap_values_knn))

In [None]:
# Sanity check: shape of the absolute SHAP values summed over axis 0 for
# shap_values_knn[0] and shap_values_knn[1], added together.
(np.sum(np.abs(shap_values_knn[1]),axis=0)+np.sum(np.abs(shap_values_knn[0]),axis=0)).shape

In [None]:
# Global importance per feature: mean absolute SHAP value over the explained
# rows, added up over the two class outputs. The mean (not the sum) is used so
# that the numbers match the 'mean(|SHAP value|)' axis label; the ranking of the
# features is the same either way.
shap_summary = np.mean(np.abs(shap_values_knn[1]),axis=0)+np.mean(np.abs(shap_values_knn[0]),axis=0)
indcs = np.argsort(shap_summary)

plt.figure(figsize=(10,6))
# the ten largest values are the last ten after the ascending argsort
plt.barh(feature_names[indcs[-10:]],shap_summary[indcs[-10:]])
plt.title('SHAP value of 10 most important features')
plt.xlabel('mean(|SHAP value|)')
plt.show()

### 6.1.2 Using Random Forest

#### Permutation importance

In [None]:
# Permutation importance for the random forest: shuffle one preprocessed column
# at a time and record how the test accuracy changes.
np.random.seed(0)

ftr_names = X_test_prep.columns

nr_runs = 3
scores = np.zeros([len(ftr_names),nr_runs])

test_score = rf_model.score(X_test_prep, y_test)
print('test score = ',test_score)
print('test baseline = ', baseline_acc)
# one row of `scores` per feature, one column per shuffle
for i, ftr in enumerate(ftr_names):
    print('shuffling '+str(ftr))
    acc_scores = []
    for run in range(nr_runs):
        X_test_shuffled = X_test_prep.copy()
        X_test_shuffled[ftr] = np.random.permutation(X_test_prep[ftr].values)
        acc_scores.append(rf_model.score(X_test_shuffled,y_test))
    print('   shuffled test score:',np.around(np.mean(acc_scores),3),'+/-',np.around(np.std(acc_scores),3))
    scores[i] = acc_scores

In [None]:
# Rank features by |mean shuffled score - unshuffled score| and box-plot the
# ten with the largest change.
score_change = np.abs(np.mean(scores,axis=1)-test_score)
sorted_indcs = np.argsort(score_change)
top_indcs = sorted_indcs[-10:]

plt.rcParams.update({'font.size': 11})
plt.figure(figsize=(10,6))
plt.boxplot(scores[top_indcs].T,labels=ftr_names[top_indcs],vert=False)
plt.axvline(test_score,label='test score')
plt.title("Permutation Importances (test set)")
plt.xlabel('score with perturbed feature')
plt.legend()
plt.tight_layout()
plt.show()

#### SHAP

In [None]:
shap.initjs() # required for visualizations later on
# create the explainer object with the random forest model
# NOTE: this rebinds the global name `explainer`, which previously held the
# KernelExplainer built for the KNN model.
explainer = shap.TreeExplainer(rf_model)
# the test set is already preprocessed; this is only an alias
X_test_transformed = X_test_prep
print(np.shape(X_test_transformed))
# calculate shap values on the first 1000 points in the test
shap_values = explainer.shap_values(X_test_transformed[:1000])
print(np.shape(shap_values))

In [None]:
# Global importance per feature: mean absolute SHAP value over the explained
# rows, added up over the two class outputs. The mean (not the sum) is used so
# that the numbers match the 'mean(|SHAP value|)' axis label; the ranking of the
# features is the same either way.
shap_summary = np.mean(np.abs(shap_values[1]),axis=0)+np.mean(np.abs(shap_values[0]),axis=0)
indcs = np.argsort(shap_summary)

plt.figure(figsize=(10,6))
# the ten largest values are the last ten after the ascending argsort
plt.barh(feature_names[indcs[-10:]],shap_summary[indcs[-10:]])
plt.title('SHAP value of 10 most important features')
plt.xlabel('mean(|SHAP value|)')
plt.show()

#### MDI

In [None]:
# Impurity-based importances of the forest, plus their spread across the
# individual trees.
importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)

# One row per feature, sorted ascending by importance and indexed by feature name.
forest_importances = pd.Series(importances, index=feature_names).to_frame().reset_index()
rf_importance = (
    pd.concat([forest_importances, pd.DataFrame(std)], axis=1)
    .set_axis(['feature', 'importance', 'std'], axis=1)
    .sort_values('importance')
    .set_index('feature')
)
rf_importance

In [None]:
# Bar chart of the ten largest impurity-based importances (rf_importance is
# sorted ascending, so these are its last ten rows), with the per-tree std as
# error bars.
fig, ax = plt.subplots(figsize=(10,8))

rf_importance['importance'][-10:].plot.bar(yerr=rf_importance['std'][-10:], ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
plt.xticks(rotation=30)
fig.tight_layout()
plt.show()

## 6.2 Local feature importance

### 6.2.1 Using KNN

#### SHAP

In [None]:
index = 0
# By this point `explainer` has been rebound to the random-forest TreeExplainer,
# so its expected_value is not the base value of the KNN explanation. Rebuild
# the KNN KernelExplainer from the same model and background data that produced
# shap_values_knn and take the base value from it.
knn_explainer = shap.KernelExplainer(final_model.predict_proba, data=background_samples)
shap.force_plot(knn_explainer.expected_value[1], shap_values_knn[1][index,:], features = X_test_transformed.iloc[index,:],feature_names = feature_names)

In [None]:
index = 99
# By this point `explainer` has been rebound to the random-forest TreeExplainer,
# so its expected_value is not the base value of the KNN explanation. Rebuild
# the KNN KernelExplainer from the same model and background data that produced
# shap_values_knn and take the base value from it.
knn_explainer = shap.KernelExplainer(final_model.predict_proba, data=background_samples)
shap.force_plot(knn_explainer.expected_value[1], shap_values_knn[1][index,:], features = X_test_transformed.iloc[index,:],feature_names = feature_names)

#### LIME

In [None]:
from lime import lime_tabular
from lime.lime_text import LimeTextExplainer

# LIME works on plain numpy rows, so hand it the first preprocessed test row as
# a 1-D array rather than as a pandas Series (whose index holds column names).
instance_to_explain = X_test_prep.iloc[0].to_numpy()

# Create LIME explainer on the preprocessed training data; the fixed
# random_state makes the sampled neighbourhood, and so the explanation, repeatable.
# NOTE: this rebinds the global name `explainer` once more.
explainer = lime_tabular.LimeTabularExplainer(X_train_prep.to_numpy(),
                                              feature_names=list(feature_names),
                                              random_state=42)

# Explain the KNN prediction for the chosen instance
explanation = explainer.explain_instance(instance_to_explain, final_model.predict_proba)


In [None]:
# Render the LIME explanation built in the previous cell inside the notebook.
explanation.show_in_notebook()