# 总结
## 数据准备及EDA
```
import pandas as pd
import pandas_profiling as profiling

# Input dataset and destination of the generated HTML report.
RAW_CSV_PATH = "./data/raw.csv"
EDA_REPORT_PATH = "./temp/EDA_Report.html"

# Load the raw data, build a profiling report from it and save it as HTML.
df = pd.read_csv(RAW_CSV_PATH)
profile = profiling.ProfileReport(df)
profile.to_file(EDA_REPORT_PATH)

```

## 源数据可视化
```
# Continues from the data-preparation section above (reuses `df`).
import matplotlib.pyplot as plt  # needed by the plt.show() calls below
import seaborn as sns

# Correlation coefficients between features; computed once and reused
# for the heatmap.
corr = df.corr()

# Heatmap of the correlation matrix
sns.heatmap(corr, annot=True, fmt='.2f', square=True)
plt.show()

# Pairwise relationships between every two features
sns.pairplot(df)
plt.show()

# Distribution of a single feature.
# histplot replaces distplot, which seaborn has deprecated; kde=True with
# stat="density" keeps the histogram-plus-density-curve look.
sns.histplot(df["age"], kde=True, stat="density")
plt.show()

# Relationship between a single feature and the label
# Box plot
sns.boxplot(x=df["target"], y=df["age"])
plt.show()

# Violin plot
sns.violinplot(x=df["target"], y=df["age"])
plt.show()
```

## 数据预处理
```
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

df = pd.read_csv('./data/raw.csv')

# Replace the raw headers with readable column names
# (`...` is a placeholder left in these notes -- presumably the remaining names).
df.columns = [
    "age",
    "sex",
    ...
]

# Decode nominal features from integer codes to readable labels.
# Assign the whole column rather than using chained indexing
# (df["sex"][mask] = ...): chained assignment may write to a temporary copy
# and has no effect when pandas Copy-on-Write is enabled. Values other than
# 0 and 1 are left untouched by replace().
df["sex"] = df["sex"].replace({0: "female", 1: "male"})
...

# One-hot encode the discrete (nominal / ordinal) columns
df = pd.get_dummies(df)

# Export the processed data: raw -> data
df.to_csv("./data/data.csv", index=False)

```

## 分训练及测试集，选择模型进行训练和验证评估
```
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv("./data/data.csv")

X = df.drop("target", axis=1)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

model = RandomForestClassifier(max_depth=5, n_estimators=50, random_state=5)
model.fit(X_train, y_train)

```

## 模型权重可视化（optional）
```
# Continues from the training section above (reuses `model` and `X_train`).
from sklearn.tree import export_graphviz
import eli5

# Inspect the first tree of the fitted forest.
estimator = model.estimators_[0]

feature_names = X_train.columns.tolist()

# Human-readable names for the target classes.
# export_graphviz expects one name per class, in ascending order of the class
# values -- not one label per training sample. Passing the per-sample label
# array would name the classes after whatever the first two samples happen to
# be, so build the list from model.classes_ (sorted class values) instead.
class_labels = {0: "no disease", 1: "disease"}
class_names = [class_labels[c] for c in model.classes_]

export_graphviz(
    estimator, out_file="./temp/tree.dot",
    feature_names=feature_names, class_names=class_names,
    rounded=True, proportion=True, label="root", precision=2, filled=True
)

# Visualise the feature weights of that tree
eli5.show_weights(estimator, feature_names=feature_names)
```

## 测试集验证及评估
```
# Continues from the training section (reuses `model`, `X_test`, `y_test`).
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Class predictions of the model on the test set
y_pred = model.predict(X_test)

# Confusion matrix: true labels against predicted labels
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# ROC curve, scored with column 1 of the predicted class probabilities
test_proba = model.predict_proba(X_test)
y_pred_quant = test_proba[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_quant)

# Area under the ROC curve
auc(fpr, tpr)
```

## Permutation Importance
```
# Continues from the training section (reuses `model`, `X_test`, `y_test`).
import eli5
from eli5.sklearn import PermutationImportance

# Compute permutation importance on the test set (fixed random_state).
perm = PermutationImportance(model, random_state=1)
perm.fit(X_test, y_test)

# Show the resulting weights, labelled with the test-set column names.
eli5.show_weights(perm, feature_names=list(X_test.columns))
```

## 可解释性
分析思路：先观察特征与标签的先验分布 -> 再观察模型预测结果（因果性），并与先验作比较 -> 最后用 PDP/ICE 图解释特征对预测的影响
```
# Continues from the training section
# (reuses `model`, `df`, `X`, `X_train`, `X_test` and `plt`).
from pdpbox import info_plots, pdp

base_feature_names = list(X.columns)

# Prior distribution of the target over one feature: info_plots.target_plot()
fig, axes, summary_df = info_plots.target_plot(
    df=df,
    feature="sex_male",
    feature_name="gender",
    target=["target"],
)
_ = axes["bar_ax"].set_xticklabels(["Female", "Male"])

# Prior interaction between two features: info_plots.target_plot_interact()
feat_name1, nick_name1 = "num_major_vessels", "num_vessels"
feat_name2, nick_name2 = "max_heart_rate_achieved", "max_heart_rate"

fig, axes, summary_df = info_plots.target_plot_interact(
    df=df,
    features=[feat_name1, feat_name2],
    feature_names=[nick_name1, nick_name2],
    target=["target"],
)
_ = axes["value_ax"].set_xticklabels(["0", "1", "2"])


# Effect of one feature on the model's predictions: info_plots.actual_plot()
fig, axes, summary_df = info_plots.actual_plot(
    model=model,
    X=X_train,
    feature="sex_male",
    feature_name="gender",
    predict_kwds={},
)
_ = axes["bar_ax"].set_xticklabels(["Female", "Male"])


# PDP for every individual feature: pdp.pdp_isolate() + pdp.pdp_plot()
for feature_name in base_feature_names:
    pdp_dist = pdp.pdp_isolate(
        model=model,
        dataset=X_test,
        model_features=base_feature_names,
        feature=feature_name,
    )
    fig, axes = pdp.pdp_plot(pdp_dist, feature_name)
    plt.show()

# PDP for a pair of features: pdp.pdp_interact() + pdp.pdp_interact_plot()
feature_names = ["max_heart_rate_achieved", "num_major_vessels"]
nick_names = ["max_heart_rate", "num_vessels"]

inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_test,
    model_features=base_feature_names,
    features=feature_names,
)

fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=nick_names,
    plot_type="contour",
    x_quantile=True,
    plot_pdp=True,
)


# ICE plot: pdp.pdp_isolate() + pdp.pdp_plot(plot_lines=True)
# pdp_isolate analyses one feature at a time (a list is only meant for the
# columns of a single one-hot encoded feature), so use one feature and its
# display name here instead of the two-feature list from the interaction plot.
# NOTE(review): the original referenced an undefined `nick_name`; the feature
# chosen below is an assumption -- swap in whichever feature is of interest.
ice_feature = "max_heart_rate_achieved"
ice_nick_name = "max_heart_rate"

pdp_dist = pdp.pdp_isolate(
    model=model, dataset=X_test, model_features=base_feature_names, feature=ice_feature
)
fig, axes = pdp.pdp_plot(pdp_dist, ice_nick_name, plot_lines=True, frac_to_plot=0.8, plot_pts_dist=True)
```