# 概述
特征与标签分离，分别构建训练集与测试集

构建随机森林分类模型，并在训练集上训练

可视化单个决策树基模型，分析特征权重
## 导入工具包，预处理后的数据集

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Install a blanket "ignore" filter so library warnings do not clutter cell output.
warnings.filterwarnings("ignore")

# Read the heart dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("./data/process_heart.csv")

In [None]:
# Show (number of rows, number of columns) of the loaded DataFrame.
df.shape

## 划分特征列和标签列

In [None]:
# Feature matrix: every column of df except the "target" label column.
X = df.drop(columns="target")
X.shape

In [None]:
# Label vector: the "target" column of df.
y = df["target"]
y.shape

## 划分训练集和测试集

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the test feature matrix.
X_test.shape

In [None]:
# Shape of the training label vector.
y_train.shape

In [None]:
# Shape of the test label vector.
y_test.shape

## 构建随机森林模型，在训练集上训练模型

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A small forest: 10 trees, each limited to depth 5, with a fixed seed for reproducibility.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    random_state=5,
)
# Fit on the training split only; the fitted model is echoed as the cell output.
model.fit(X_train, y_train)

## 可视化随机森林的一棵决策树

In [None]:
# Number of fitted trees held by the forest.
len(model.estimators_)

In [None]:
# Take the first tree of the forest as the one to inspect and visualize below.
estimator = model.estimators_[0]
estimator

In [None]:
# Column labels of the training features, reused by the visualization cells.
feature_names = X_train.columns

# Per-sample training labels as readable text: the string form "0" becomes
# "no disease" and "1" becomes "disease"; any other value is left as is.
y_train_str = (
    y_train.astype("str")
    .replace({"0": "no disease", "1": "disease"})
    .values
)

In [None]:
import os

from sklearn.tree import export_graphviz

# export_graphviz expects ONE name per class, listed in ascending order of the
# class labels -- not one label per training sample. Labels 0 / 1 correspond to
# "no disease" / "disease", so the names are given explicitly in that order.
class_names = ["no disease", "disease"]

# The DOT file is written under ./temp; create the directory if it is missing
# so the export does not fail on a fresh checkout.
os.makedirs("./temp", exist_ok=True)

export_graphviz(
    estimator,
    out_file="./temp/tree.dot",
    # Passed as a plain list; some scikit-learn releases appear to reject
    # non-list values (e.g. a pandas Index) for this parameter.
    feature_names=list(feature_names),
    class_names=class_names,
    rounded=True,
    proportion=True,
    label="root",
    precision=2,
    filled=True,
)

In [None]:
from subprocess import call
import os

# Use the same relative ./temp directory that the DOT export writes to and the
# image display reads from, instead of a machine-specific absolute path.
path = os.path.join(".", "temp")
dot_file = os.path.join(path, "tree.dot")
png_file = os.path.join(path, "tree.png")

# Render the DOT description to a 600-dpi PNG with the Graphviz "dot" executable
# (it must be installed and on PATH).
exit_code = call(["dot", "-Tpng", dot_file, "-o", png_file, "-Gdpi=600"])

# Fail loudly instead of letting the next cell display a stale or missing image.
if exit_code != 0:
    raise RuntimeError("Graphviz 'dot' exited with status %d" % exit_code)

In [None]:
from IPython.display import Image

# Display the PNG rendered from the tree's DOT file inline in the notebook.
Image(filename="./temp/tree.png")

In [None]:
import eli5

# Show the weights eli5 reports for the single selected tree, labelled with the
# feature column names passed as a plain Python list.
column_labels = list(feature_names)
eli5.show_weights(estimator, feature_names=column_labels)

## 特征重要性分析

In [None]:
# Importance scores of the whole forest, one value per feature column.
model.feature_importances_

In [None]:
print("特征排序：")
feature_names = X_test.columns
feature_importances = model.feature_importances_
# Positions of the features ordered from highest to lowest importance
# (argsort is ascending, so the result is reversed).
indices = np.argsort(feature_importances)[::-1]

# A plain loop is used for printing: a list comprehension run only for its side
# effects builds a throwaway list of None that the notebook would also echo.
for idx in indices:
    print("feature %s - (%f)" % (feature_names[idx], feature_importances[idx]))

In [None]:
# Bar chart of feature importances, drawn from most to least important.
positions = range(len(feature_importances))
ranked_importances = feature_importances[indices]
ranked_labels = np.array(feature_names)[indices]

plt.figure(figsize=(16, 5))
plt.title("Feature Importance")
plt.bar(positions, ranked_importances, color='b')
# Label each bar with its feature name, rotated so long names do not overlap.
plt.xticks(positions, ranked_labels, color="b", rotation=90)
plt.show()