# 概述
计算测试的每个样本的每个特征对两类预测结果的shap值。

得到特征重要图、shap值图、单个样本所有特征的shap值图、某一特征不同值的shap值图。

绘制单一样本的决策瀑布图、决策图、所有样本的决策图、被分类错误样本的决策图、特征两两关系决策图、单一样本某一特征变化的决策图、找到受某一特征影响最大的样本

## 导入工具包，导入数据集，训练模型并做预测

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Read the CSV and separate the "target" label column from the feature columns.
df = pd.read_csv("./data/process_heart.csv")

y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for testing; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# NOTE: no random_state is passed to the forest, so the fitted model (and
# everything derived from it below) can differ from run to run.
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(X_train, y_train)

# Per-class probabilities and hard class labels for the test rows.
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)

## 导入shap机器学习可解释性分析工具包

In [None]:
import shap
# Load shap's JavaScript assets so interactive plots (e.g. force_plot) can render in the notebook.
shap.initjs()

## 计算测试集每个样本的每个特征对两类预测结果的shap值

In [None]:
# Build a tree explainer for the fitted forest and compute shap values for the test rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# Later cells index shap_values[0] and shap_values[1], i.e. one entry per class.
len(shap_values)

### 样本中，不患病(target=0)的预测结果的shap值

In [None]:
# Shape of the class-0 shap array (presumably n_test_samples x n_features — confirm).
shap_values[0].shape

### 样本中，患病(target=1)的预测结果的shap值

In [None]:
# Shape of the class-1 shap array (presumably n_test_samples x n_features — confirm).
shap_values[1].shape

### 模型预测“不患病”与“患病”各自的基准值（base value，即 explainer.expected_value；TreeExplainer 未传入背景数据时由训练模型时的样本分布估计，并非测试集的平均概率）

In [None]:
# Base values of the explainer, one per class (indexed [0] and [1] below).
expected_value = explainer.expected_value
expected_value

In [None]:
# Sanity check: the two class base values are expected to add up to 1.
explainer.expected_value[0] + explainer.expected_value[1]

### 对某个样本，模型预测为“患病”的概率 = “患病”的基准值（explainer.expected_value[1]）+ 该样本各特征对“患病”预测结果的shap值之和

In [None]:
# Sum the class-1 shap values over axis 1 (the feature axis): one total per sample.
shap_values[1].sum(axis=1).shape

In [None]:
# Reconstruct the class-1 output as base value + sum of the feature contributions.
y_pred_proba_shap = shap_values[1].sum(axis=1) + explainer.expected_value[1]

In [None]:
y_pred_proba_shap.shape

# Compare element-wise with a tolerance: the two vectors are computed along
# different floating-point paths (shap sum vs. predict_proba), so an exact
# `==` can report False for values that differ only by rounding error.
np.isclose(y_pred_proba_shap, y_pred_proba[:, 1])

In [None]:
# Class-1 values reconstructed from the shap values.
y_pred_proba_shap

In [None]:
# Class-1 column of the model's predict_proba output, for side-by-side comparison.
y_pred_proba[:, 1]

## 特征重要值

In [None]:
# Bar-type summary plot of the class-1 shap values over the test set.
shap.summary_plot(shap_values[1], X_test, plot_type="bar")

## 各特征的数值大小 与 各特征的shap值 关系图
每行代表一个特征，红色表示该特征取值较高的数据点，蓝色表示取值较低的数据点；越靠右表示该特征对预测为“患病”的正向影响越大

In [None]:
# Default summary plot of the class-1 shap values against the test-set feature values.
shap.summary_plot(shap_values[1], X_test)

In [None]:
# Same data drawn with the "violin" plot type.
shap.summary_plot(shap_values[1], X_test, plot_type="violin")

## 各特征两两特征对shap值的关系（对角线的图代表单一特征数值大小与该特征的shap值的关系图）
红色代表两特征都是高值，蓝色代表两特征都是低值

In [None]:
# Pairwise interaction values for the test rows; index [1] selects the class-1 entry,
# matching the [0]/[1] class indexing used for shap_values.
shap_interaction_values = explainer.shap_interaction_values(X_test)
shap.summary_plot(shap_interaction_values[1], X_test)

## 分析单个样本各特征对预测为“患病”结果的影响

In [None]:
# Preview the first test rows; their index labels are used to pick a patient below.
X_test.head()

In [None]:
# 126 is the index *label* of a test-set row, so select it by label with .loc.
# Positional .iloc only returns the same row while X keeps a default 0..n-1 index.
idx = 126
patient = X.loc[idx]
patient

In [None]:
# Index label 126 is the 4th row of X_test, i.e. position 3 (4 - 1) along the
# sample axis of shap_interaction_values[1]; [3] selects that sample's interaction matrix.
shap.summary_plot(shap_interaction_values[1][3], X_test, plot_type="bar")

In [None]:
# shap values of this single patient; indexed [0] / [1] per class below.
shap_values_patient = explainer.shap_values(patient)
shap_values_patient

In [None]:
# Sum of the class-0 contributions; original note: base value - model output value (0.5704 - 0.80)
shap_values_patient[0].sum()

In [None]:
# Force plot of the class-1 contributions for this patient.
shap.force_plot(explainer.expected_value[1], shap_values_patient[1], patient)

In [None]:
# Waterfall plot of the same class-1 contributions.
# NOTE(review): this is the legacy (expected_value, shap_values, features) call form;
# newer shap releases expect an Explanation object — confirm against the installed version.
shap.waterfall_plot(explainer.expected_value[1], shap_values_patient[1], patient)

## 测试集所有样本的summary plot
将测试集所有样本的force plot旋转九十度并拼在一起，形成summary plot

可以在下拉菜单选择按照相似性聚类展示、按照预测结果概率从大到小展示、按照测试集原本样本顺序、按照某个特征分别展示

In [None]:
# Stack the force plots of the first `number_show` test samples.
# The shap values and the displayed feature values must describe the same
# rows, so both come from X_test (the shap values used to be computed on
# X_train while the feature values shown were taken from X_test).
number_show = 60
X_show = X_test.iloc[:number_show]
shap_values_summary = explainer.shap_values(X_show)
shap.force_plot(explainer.expected_value[1], shap_values_summary[1], X_show)

### Dependence Plot
展示某个特征从小变大时对预测结果的shap值

In [None]:
# Class-1 shap value of "num_major_vessels" against its feature value, with no interaction colouring.
shap.dependence_plot("num_major_vessels", shap_values[1], X_test, interaction_index=None)

In [None]:
# Same plot for "max_heart_rate_achieved".
shap.dependence_plot("max_heart_rate_achieved", shap_values[1], X_test, interaction_index=None)

In [None]:
# "max_heart_rate_achieved" again, this time with "sex_male" as the interaction feature.
shap.dependence_plot("max_heart_rate_achieved", shap_values[1], X_test, interaction_index="sex_male")

### Partial Dependence Plot
展示某个特征从小变大时模型预测结果

In [None]:
# Partial dependence of the model output on "max_heart_rate_achieved" over the test set.
# NOTE(review): model.predict is passed, so the curve is over predicted labels;
# consider predict_proba if a probability curve is intended — confirm.
shap.partial_dependence_plot("max_heart_rate_achieved", model.predict, X_test, model_expected_value=True, feature_expected_value=True)

In [None]:
# Same plot for "num_major_vessels".
shap.partial_dependence_plot("num_major_vessels", model.predict, X_test, model_expected_value=True, feature_expected_value=True)

## 决策图：Decision Plot
### 瀑布图只能展示单个数据的决策过程，决策图可以展示测试集所有数据的决策过程

In [None]:
# Decision plot of the class-1 shap values for every test sample, starting from the class-1 base value.
shap.decision_plot(expected_value[1], shap_values[1], X_test)

### 查看典型决策路径与异常点
feature_order="hclust"

In [None]:
# Same decision plot with the features ordered by feature_order="hclust".
shap.decision_plot(expected_value[1], shap_values[1], X_test, feature_order="hclust")

加link="logit"参数，进行对数几率缩放转换

In [None]:
# Same decision plot with link="logit" applied.
# NOTE(review): the model output here looks like a probability already; confirm the logit link is intended.
shap.decision_plot(expected_value[1], shap_values[1], X_test, link="logit")

## 自定义决策图特征显示顺序

In [None]:
# Keep the features in their original column order; the count is derived from
# X_test rather than hard-coded so it follows the data.
feature_idx = list(range(X_test.shape[1]))

# Boolean mask selecting the single test sample at position `idx`; sized from
# X_test so it stays valid if the split size changes.
idx = 25
selection = np.arange(len(X_test)) == idx

print("索引号为%d的样本，在原始数据集X中的索引号为%d" % (idx, X_test.iloc[idx:idx+1].index[0]))
shap.decision_plot(expected_value[1], shap_values[1][selection], X_test[selection], feature_order=feature_idx)

## 选出测试集中模型预测错误的样本

In [None]:
# Flag every test row whose predicted label differs from its true label.
misclassified = y_test != y_pred

# One-column table restricted to the rows that were predicted wrongly.
misclassified_df = misclassified.to_frame(name="misclassified")
misclassified_df = misclassified_df.loc[misclassified_df["misclassified"]]
misclassified_df

In [None]:
# 194 is an index label (presumably one of the rows listed in misclassified_df),
# so every lookup below is label-based instead of mixing .iloc, .loc and [].
idx = 194
patient = X.loc[idx]
patient_df = X.loc[[idx]]  # one-row DataFrame, the 2-D input predict_proba expects
model_predict_proba = model.predict_proba(patient_df)[0][1]
print("%d号病人的真实标签为%s，模型预测可能性为%.2f" % (idx, bool(y_test.loc[idx]), model_predict_proba))

# Force plot of this patient's class-1 contributions.
shap_values_patient = explainer.shap_values(patient)
shap.force_plot(explainer.expected_value[1], shap_values_patient[1], patient)

In [None]:
# 139 is an index label (presumably one of the rows listed in misclassified_df),
# so every lookup below is label-based instead of mixing .iloc, .loc and [].
idx = 139
patient = X.loc[idx]
patient_df = X.loc[[idx]]  # one-row DataFrame, the 2-D input predict_proba expects
model_predict_proba = model.predict_proba(patient_df)[0][1]
print("%d号病人的真实标签为%s，模型预测可能性为%.2f" % (idx, bool(y_test.loc[idx]), model_predict_proba))

# Force plot of this patient's class-1 contributions.
shap_values_patient = explainer.shap_values(patient)
shap.force_plot(explainer.expected_value[1], shap_values_patient[1], patient)

## 在决策图中显示测试集中模型预测错误的样本

In [None]:
# All test samples, with the misclassified ones passed as the highlight mask.
shap.decision_plot(expected_value[1], shap_values[1], X_test, highlight=misclassified)

In [None]:
# Only the misclassified samples; range(len(misclassified_df)) highlights every plotted row.
shap.decision_plot(expected_value[1], shap_values[1][misclassified], X_test[misclassified], highlight=range(len(misclassified_df)), feature_order="hclust")

## 两两交互特征对预测结果影响
主对角线的图与summary plot相同。

选取索引值为5的样本的变量交互shap值矩阵

In [None]:
# Shape of the class-1 interaction matrix of the sample at position 5.
shap_interaction_values[1][5].shape

In [None]:
import seaborn as sns

# Annotated heatmap of that sample's interaction matrix.
plt.figure(figsize=(10, 10))
sns.heatmap(shap_interaction_values[1][5], annot=True, fmt=".1f", square=True)
plt.show()

### 考虑两两交互特征的决策图

In [None]:
# Decision plot built from the class-1 interaction values, highlighting the misclassified samples.
shap.decision_plot(expected_value[1], shap_interaction_values[1], X_test, highlight=misclassified)

In [None]:
# slice(None, None, -1): show all features
# slice(None, -101, -1): show 100 features
shap.decision_plot(expected_value[1], shap_interaction_values[1], X_test, highlight=misclassified, feature_display_range=slice(None, None, -1), ignore_warnings=True)

### 考虑到两两交互特征的单个样本决策图

In [None]:
# Boolean mask selecting the single test sample at position `idx`; sized from
# X_test instead of a hard-coded row count.
idx = 24
selection = np.arange(len(X_test)) == idx

print("索引号为%d的样本，在原始数据集X中的索引号为%d" % (idx, X_test.iloc[idx:idx+1].index[0]))
# This section is about pairwise interactions, so plot the interaction values
# of the selected sample (plain shap_values would just repeat the earlier
# single-sample decision plot without any interaction terms).
shap.decision_plot(expected_value[1], shap_interaction_values[1][selection], X_test[selection])

### 25号病人某一特征变化对模型分类结果的影响

In [None]:
# `idx` is a position within X_test (see the print below), not an index label.
# The former bare `X_test.loc[idx]` lookup was dropped: it displayed nothing
# and raised KeyError whenever label 25 was not part of the test split.
idx = 25
print("索引号为%d的样本，在原始数据集X中的索引号为%d" % (idx, X_test.iloc[idx:idx+1].index[0]))

# Sweep the selected feature across the value range observed in the full dataset.
feature_selected = "max_heart_rate_achieved"
sep = 200
feature_selected_min = X[feature_selected].min()
feature_selected_max = X[feature_selected].max()
step = (feature_selected_max - feature_selected_min) / sep
rg = np.arange(feature_selected_min, feature_selected_max, step)

# Replicate the chosen *test* sample once per sweep value. It must come from
# X_test: X.iloc[idx] is a different row of the full dataset than the one the
# print above (and the follow-up decision plot) refer to.
R = X_test.iloc[np.repeat(idx, len(rg))].reset_index(drop=True)
R[feature_selected] = rg

# Class-1 shap values of the hypothetical rows, plotted against the swept feature.
hypothetical_shap_values = explainer.shap_values(R)[1]
shap.dependence_plot(feature_selected, hypothetical_shap_values, R, interaction_index=None)

### 绘制25号病人某一特征不同变化范围的决策图

In [None]:
# Decision paths for three of the hypothetical rows (positions 0, 50 and 99 of the sweep).
# NOTE(review): X_test.iloc[idx] is a single row passed alongside three shap rows —
# presumably it only supplies the feature names; confirm.
shap.decision_plot(expected_value[1], hypothetical_shap_values[[0, 50, 99]], X_test.iloc[idx], feature_order="importance")