## 分类树

In [3]:
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

导入乳腺癌数据集

In [4]:
# Load the breast-cancer data set bundled with scikit-learn; the returned
# object exposes the fields listed by cancer.keys() in the next cell.
cancer = load_breast_cancer()

查看数据的结构

In [5]:
# Show which fields the dataset object provides (data, target, names, description).
cancer.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

查看特征名称 

In [6]:
# Names of the input features, in the same order as the columns of cancer['data'].
cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='<U23')

类别标签

In [7]:
# Human-readable class labels; position i is the name of target value i.
cancer.target_names

array(['malignant', 'benign'],
      dtype='<U9')

随机划分训练集、测试集。参数 random_state 是随机数生成器的种子：只要设为固定的整数（这里用 42），每次运行得到的划分结果就完全相同；若不设置，则每次划分都会不同。

In [8]:
# Split features and labels into train/test parts. The seed is pinned so the
# same rows land in each part on every run; no test_size is passed, so the
# library default proportion is used.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=42,
)

构建分类决策树模型

In [9]:
# Unconstrained tree (no depth/leaf limits are passed); random_state is pinned
# so repeated fits produce the same tree.
tree = DecisionTreeClassifier(random_state=0)

使用决策树模型拟合训练数据集

In [10]:
# Fit on the training split only; the test split stays unseen for evaluation.
# fit() returns the estimator, so the notebook displays its parameters.
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

模型得分（分类准确率）。注意对比训练集与测试集的得分：未限制深度的决策树在训练集上可以达到 1.000，而测试集得分更低，说明模型存在过拟合。

In [11]:
# score() on a classifier is mean accuracy; evaluate both splits so the gap
# between them (over-fitting) is visible side by side.
train_acc = tree.score(X_train, y_train)
test_acc = tree.score(X_test, y_test)

print('Train score:{:.3f}'.format(train_acc))
print('Test score:{:.3f}'.format(test_acc))

Train score:1.000
Test score:0.930


生成决策树可视化图

In [12]:

# Export the fitted classifier as Graphviz DOT text. With out_file=None the
# DOT source is returned as a string instead of being written to disk.
dot_data = export_graphviz(
    tree,
    out_file=None,
    # Take the labels from the dataset itself ('malignant', 'benign') rather
    # than hand-typed names: they are guaranteed to be in class-index order,
    # they match the data set's own terminology, and being ASCII they render
    # with Graphviz's default PDF fonts.
    class_names=list(cancer.target_names),
    # Plain lists are accepted by every scikit-learn release, whereas some
    # releases validate these arguments more strictly than others.
    feature_names=list(cancer.feature_names),
    impurity=False,  # hide the per-node impurity value to keep nodes compact
    filled=True,     # colour nodes by majority class
)
graph = graphviz.Source(dot_data)
# Saves the DOT source as "tree" and renders it next to it; the returned value
# is the path of the rendered file ('tree.pdf').
graph.render("tree")

'tree.pdf'

返回每个测试样本最终落入的叶子节点的索引

In [13]:
# For each test sample, the index of the tree node (a leaf) it ends up in.
tree.apply(X_test)

array([ 9, 30, 30,  9,  9, 30, 30, 24, 28,  7, 12, 30,  9, 30,  9, 30,  9,
        9,  9, 30,  9,  9, 30,  9,  9, 24,  9, 12,  9, 30,  9,  9, 16,  9,
        9,  9, 20,  9, 30,  9,  9, 30,  9,  9, 24,  9, 13, 12,  9,  9, 30,
       30,  9, 24,  9,  9,  9, 30, 30,  9,  9, 30, 30,  9,  9,  9, 30, 30,
        9,  9, 30, 30,  9, 30,  9, 24,  9, 30,  9, 28, 30,  9,  9, 30, 20,
       30, 29, 30,  9,  9,  9,  9, 30, 12,  9,  9, 30, 30,  9, 30, 30,  9,
       25, 30,  9,  9,  9, 30, 24,  9, 30,  9, 12, 30,  9, 30,  9, 12,  9,
       30, 30,  9,  9, 30,  9, 30, 30,  9,  9, 30, 25, 30,  7,  9, 18, 25,
       29,  9, 24, 30,  9, 30,  9], dtype=int32)

返回每个测试样本在树中经过的决策路径（稀疏指示矩阵：每行对应一个样本，每列对应一个节点，非零元素表示该样本经过该节点）

In [14]:
# Sparse (n_samples x n_nodes) indicator matrix of the nodes each test sample
# passes through; displayed here only as its summary repr.
tree.decision_path(X_test)

<143x31 sparse matrix of type '<class 'numpy.int32'>'
	with 817 stored elements in Compressed Sparse Row format>

特征的重要性

In [15]:
# One importance value per input feature, in the order of cancer.feature_names;
# features the tree never splits on show up as 0.
tree.feature_importances_

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.72468105,  0.        ,  0.        ,
        0.01277192,  0.        ,  0.        ,  0.        ,  0.00826156,
        0.        ,  0.        ,  0.01702539,  0.        ,  0.        ,
        0.05899273,  0.12550655,  0.00838371,  0.03452044,  0.00985664,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

## 回归树

In [18]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

创建数据集

In [24]:
# Synthetic 1-D regression problem: y = sin(x) for x sampled in [0, 5), with
# uniform noise added to every NOISE_STEP-th target.
N_SAMPLES = 80   # number of training points
NOISE_STEP = 5   # perturb every 5th target

rng = np.random.RandomState(1)  # fixed seed keeps the data set reproducible
x = np.sort(5 * rng.rand(N_SAMPLES, 1), axis=0)  # sorted column vector
y = np.sin(x).ravel()

# Size the noise vector from the slice it is added to instead of hard-coding
# its length, so changing N_SAMPLES or NOISE_STEP cannot cause a shape
# mismatch. The noise is 3 * (0.5 - u) with u drawn from rng.rand.
noisy = slice(None, None, NOISE_STEP)
y[noisy] += 3 * (0.5 - rng.rand(y[noisy].size))

训练模型

In [26]:
# Two regressors that differ only in depth limit, fitted on the same data so
# their curves can be compared: a shallow tree and a deeper one.
regr_1 = DecisionTreeRegressor(max_depth=2).fit(x, y)
regr_2 = DecisionTreeRegressor(max_depth=5).fit(x, y)
regr_2  # display the deeper model's parameters as the cell output

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [35]:
# R^2 of each tree on the very data it was fitted on. These are training
# scores, so the deeper tree's higher value does not show it generalises better.
r2_depth2 = regr_1.score(x, y)
print(r2_depth2)
regr_2.score(x, y)

0.762989979743


0.95387251084088009

使用模型进行预测

In [27]:
# Dense, evenly spaced grid over [0, 5) as a single-column 2-D array (the
# shape predict() expects), then the prediction of each tree on that grid.
x_test = np.arange(0.0, 5.0, 0.01).reshape(-1, 1)
y_1, y_2 = regr_1.predict(x_test), regr_2.predict(x_test)

绘图

In [29]:
# Overlay the noisy training points with the piecewise-constant predictions
# of both trees, using the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.scatter(x, y, s=20, edgecolor="black", c="darkorange", label="data")

# One curve per fitted tree; drawn in this order so the legend order is stable.
for preds, colour, depth_label in (
    (y_1, "cornflowerblue", "max_depth=2"),
    (y_2, "yellowgreen", "max_depth=5"),
):
    ax.plot(x_test, preds, color=colour, label=depth_label, linewidth=2)

ax.set_xlabel("data")
ax.set_ylabel("target")
ax.set_title("Decision Tree Regression")
ax.legend()
plt.show()