# 概述
在测试集上进行预测验证，并对模型进行定量评估：计算 Precision、Recall、F1-Score 等评估指标，并绘制 ROC 曲线。

## 导入工具包，预处理后的数据集，构建随机森林模型

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the preprocessed dataset; the "target" column holds the class label.
df = pd.read_csv("./data/process_heart.csv")

# Separate the label from the feature columns.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Random forest of 10 trees, each limited to depth 5, with a fixed seed.
model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=5)
model.fit(X_train, y_train)

## 查看测试集

In [None]:
# Show the dimensions of the held-out feature matrix.
X_test.shape

In [None]:
# Show the dimensions of the held-out labels.
y_test.shape

In [None]:
# Preview the first rows of the test features.
X_test.head()

## 筛选未知样本

In [None]:
# Pick the third test row as a single "unseen" sample.
# Indexing with a list ([[2]]) returns a one-row DataFrame of shape
# (1, n_features), so the sample keeps the column names the model was fitted
# with; converting to a bare ndarray would drop them.
test_sample = X_test.iloc[[2]]
test_sample.shape

## 预测筛选出的单个未知样本

In [None]:
# Predicted class label for the single selected sample.
model.predict(test_sample)

In [None]:
# Per-class predicted probabilities for the single selected sample.
model.predict_proba(test_sample)

## 预测测试集上全部数据

In [None]:
# Hard class predictions for every test row.
y_pred = model.predict(X_test)

# Per-class probabilities, one column per class.
y_pred_proba = model.predict_proba(X_test)

# Probability of the class in column 1, sliced from the result above instead
# of running the forest over the test set a second time.
y_pred_proba_1 = y_pred_proba[:, 1]

In [None]:
# Display the predicted labels for the test set.
y_pred

In [None]:
# Display the ground-truth labels for comparison with y_pred.
y_test

## 混淆矩阵

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate ground-truth labels against the model's predictions.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
matrix

In [None]:
import itertools

def cnf_matrix_plotter(confusion_matrix, classes):
    """Draw a confusion matrix as an annotated heat map and show it.

    Args:
        confusion_matrix: 2-D array-like of counts. Rows are plotted on the
            "True Label" axis and columns on the "Predicted Label" axis.
        classes: Sequence of class names used as tick labels on both axes.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Accept any array-like (e.g. nested lists); the code below relies on
    # ndarray features (.max(), .shape, [i, j] indexing).
    cm = np.asarray(confusion_matrix)

    plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Oranges)
    plt.title("Confusion Matrix")
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get white text so the number
    # stays readable on the darker background.
    threshold = cm.max() / 2

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # x is the column index (prediction), y is the row index (truth).
        plt.text(
            j,
            i,
            cm[i, j],
            horizontalalignment="center",
            color="white" if cm[i, j] > threshold else "black",
            fontsize=25,
        )

    plt.ylabel("True Label (Fact)")
    plt.xlabel("Predicted Label (Prediction)")
    # Run the layout pass after the axis labels exist so they are taken into
    # account when the margins are computed.
    plt.tight_layout()
    plt.show()

# Class names are passed in label order: target 0 -> "Healthy", target 1 -> "Disease".
cnf_matrix_plotter(matrix, ["Healthy", "Disease"])

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1, labelled with readable class names.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=["Healthy", "Disease"],
    )
)

## ROC曲线

In [None]:
# Per-class predicted probabilities for every test row.
model.predict_proba(X_test)

In [None]:
# Keep only column 1 of the probability matrix; this is the score used for the ROC curve.
model.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import roc_curve, auc

# Continuous score per test row: the probability in column 1 of predict_proba.
y_pred_quant = model.predict_proba(X_test)[:, 1]

# False-positive rate, true-positive rate and the score threshold that
# produced each (fpr, tpr) pair.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_quant)

fpr

In [None]:
# True-positive rates returned by roc_curve.
tpr

In [None]:
# Score thresholds returned by roc_curve.
thresholds

In [None]:
# Configure the font size before any artist is created so the setting is in
# effect for the whole figure rather than only for text added afterwards.
plt.rcParams["font.size"] = 12

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], ls="--", c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title("ROC Curve")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.grid(True)

In [None]:
# Area under the curve defined by the (fpr, tpr) points.
auc(fpr, tpr)