# Shap tests

Followed along with the DeepFindr channel (https://www.youtube.com/@DeepFindr) on YouTube.


### Imports

In [1]:
from utils import DataLoader
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
import shap

  from .autonotebook import tqdm as notebook_tqdm


### Load and preprocess data

In [2]:
# DataLoader is the project-local helper imported from utils; it owns the
# dataset for the rest of the notebook.
data_loader = DataLoader()
# NOTE(review): presumably reads the raw dataset into the loader -- confirm in utils.DataLoader.
data_loader.load_dataset()
# NOTE(review): the feature table displayed further down has one-hot encoded
# categorical columns; presumably that encoding happens here -- confirm in utils.DataLoader.
data_loader.preprocess_data()

### Split data

In [3]:
# Unpack the four parts returned by the loader, in this order:
# train features, test features, train labels, test labels.
# NOTE(review): split ratio and any seeding live in utils.DataLoader -- verify there.
X_train, X_test, y_train, y_test = data_loader.get_data_split()

### Oversample train data

In [4]:
# Only the training split is oversampled; X_test / y_test are left as returned
# by the split so the evaluation is done on untouched data.
# NOTE(review): this rebinds X_train / y_train, so re-running the cell feeds
# already-oversampled data back into oversample() -- re-run from the split cell.
X_train, y_train = data_loader.oversample(X_train, y_train)

# Show (rows, columns) for the train split, then the test split.
for split in (X_train, X_test):
    print(split.shape)

(7778, 21)
(1022, 21)


### Fit blackbox model

In [5]:
# Fit the black-box model that SHAP will explain.
# random_state is fixed so that the forest -- and therefore the metrics below
# and every SHAP value computed from `rf` later -- is reproducible under
# Restart Kernel -> Run All. Without it each run trains a different forest.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = rf.predict(X_test)
# Macro-averaged F1 weights every class equally, so it is reported next to
# plain accuracy to make per-class performance visible.
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

F1 Score 0.5342599524755053
Accuracy 0.9452054794520548


### Create SHAP explainer

In [6]:
# Build a SHAP explainer for the fitted random forest; `explainer` is reused
# below to compute SHAP values and to read the expected (base) values.
explainer = shap.TreeExplainer(rf)

### Calculate Shapley values for test data

In [7]:
# Half-open row range [start_index, end_index) of the test split to explain;
# with 1 and 2 this selects exactly one row.
start_index = 1
end_index = 2

# Take the slice once, explain it, then display the same rows as the cell output.
rows_to_explain = X_test[start_index:end_index]
shap_values = explainer.shap_values(rows_to_explain)
rows_to_explain

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,age,hypertension,heart_disease,avg_glucose_level,bmi
2813,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,43.0,0,0,91.13,33.9


### Investigating the values

In [8]:
# shap_values is a list with one array per output class (two arrays here, see
# the output below), each of shape (n_explained_rows, n_features):
#   shap_values[0] -> per-feature contributions toward the first class
#   shap_values[1] -> per-feature contributions toward the second class
# NOTE(review): class order presumably follows rf.classes_ -- confirm.
# In the output below the two arrays are exact negations of each other.
print(shap_values[0].shape)
shap_values

(1, 21)


[array([[ 7.29032123e-03,  8.07188916e-03, -2.90019726e-07,
         -3.59995365e-03, -5.39507485e-03, -8.18911439e-04,
         -2.66569450e-05,  3.08882739e-03,  1.01352088e-02,
         -2.66072876e-03,  7.70575458e-03,  8.21656277e-03,
         -1.99545149e-03,  1.58069089e-02,  1.64175146e-02,
          4.87723216e-03,  2.44191001e-01,  1.31824065e-02,
          1.00504507e-02,  9.00531709e-02,  7.51565400e-02]]),
 array([[-7.29032123e-03, -8.07188916e-03,  2.90019726e-07,
          3.59995365e-03,  5.39507485e-03,  8.18911439e-04,
          2.66569450e-05, -3.08882739e-03, -1.01352088e-02,
          2.66072876e-03, -7.70575458e-03, -8.21656277e-03,
          1.99545149e-03, -1.58069089e-02, -1.64175146e-02,
         -4.87723216e-03, -2.44191001e-01, -1.31824065e-02,
         -1.00504507e-02, -9.00531709e-02, -7.51565400e-02]])]

### Visualizing local predictions

In [9]:
# Initialise SHAP's JavaScript support in the notebook before any interactive
# plot (e.g. force_plot) is rendered.
shap.initjs()

### Force plot

In [None]:
# Work on the same row(s) whose SHAP values were computed above.
explained_rows = X_test[start_index:end_index]

# Model output for the first explained row, printed for reference next to the plot.
prediction = rf.predict(explained_rows)[0]
print(f"The RF predicted: {prediction}")

# Index 1 selects the second class on both sides: its expected (base) value and
# its SHAP values, so the plot shows how each feature moves the output for that
# class away from the base value.
shap.force_plot(explainer.expected_value[1], shap_values[1], explained_rows)

### Visualize global feature importance

In [None]:
# A global summary needs SHAP values for every row being summarised.
# `shap_values` from the local-explanation cell covers only the single row
# X_test[start_index:end_index] (shape (1, 21)), so plotting it against the
# full X_test would rank features by one instance only. Compute SHAP values
# for the whole test split under a separate name, leaving `shap_values`
# untouched for the force-plot cell.
shap_values_test = explainer.shap_values(X_test)

# Passing the per-class list keeps the original call's behaviour (one summary
# across both classes), now aggregated over all test rows.
shap.summary_plot(shap_values_test, X_test)