In [None]:
# 1.加载所需模块
import numpy as np
from sklearn import datasets
from sklearn import model_selection as ms
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt


In [None]:
# 2. Load the breast-cancer dataset and take a first look at it
data = datasets.load_breast_cancer()

# Report, in order: the feature names, the shape of the feature matrix,
# the shape of the target vector, and the distinct target labels.
for label, value in (
    ('数据的特征的名称:', data.feature_names),
    ('数据的特征信息的格式:', data.data.shape),
    ('结果信息的格式:', data.target.shape),
    ('结果信息的种类:', np.unique(data.target)),
):
    print(label, value)

In [None]:
# 3. Split the data into a training set and a test set
# test_size=0.2 holds out 20% of the samples; random_state pins the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = ms.train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# Print the shape of each of the four resulting arrays.
for label, arr in (
    ('训练数据的特征信息的格式:', X_train),
    ('训练结果信息的格式:', y_train),
    ('测试数据的特征信息的格式:', X_test),
    ('测试结果信息的格式:', y_test),
):
    print(label, arr.shape)

In [None]:
# 4. Train the model
# fit() returns the fitted estimator, so instantiation and training are
# chained into a single expression.
dtc = tree.DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Accuracy of the fitted tree on the same data it was trained on.
score = dtc.score(X_train, y_train)
print('模型在训练数据上的准确率:', score)

In [None]:
# 5. Evaluate the classifier on the held-out test data
# For a classifier, .score() is the mean accuracy of predict() against the
# true labels; here that is spelled out explicitly via accuracy_score.
score1 = metrics.accuracy_score(y_test, dtc.predict(X_test))
print('模型在测试数据上的准确率:', score1)

In [None]:
# 7. Explore how the maximum tree depth affects performance
max_depths = np.array([1, 2, 3, 5, 7, 9, 11])  # candidate depth limits
train_score, test_score = [], []

for d in max_depths:
    # Fit a fresh tree capped at depth d (same seed for every depth).
    dtc = tree.DecisionTreeClassifier(max_depth=d, random_state=42).fit(X_train, y_train)

    # Accuracy on the training data and on the test data.
    score_train = dtc.score(X_train, y_train)
    score_test = dtc.score(X_test, y_test)

    print("最大深度为", d, "的决策树在训练数据上的准确率:", score_train)
    print("最大深度为", d, "的决策树在测试数据上的准确率:", score_test)

    train_score.append(score_train)
    test_score.append(score_test)

# Plot both accuracy curves against the depth limit.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(max_depths, train_score, 'o-', linewidth=3, label='train')
ax.plot(max_depths, test_score, 's-', linewidth=3, label='test')
ax.set_xlabel('max_depth')
ax.set_ylabel('score')
ax.set_ylim(0.85, 1.1)
ax.legend()
plt.show()

In [None]:
# 8. Explore how the minimum number of samples per leaf affects performance
min_samples = np.array([2, 4, 8, 16, 32])  # candidate min_samples_leaf values
train_score = []
test_score = []

for s in min_samples:
    # Fit a fresh tree whose leaves must each hold at least s samples.
    dtc = tree.DecisionTreeClassifier(min_samples_leaf=s, random_state=42)
    dtc.fit(X_train, y_train)

    # Accuracy on the training data; scored once and reused for both the
    # recorded curve and the printed report.
    score_train = dtc.score(X_train, y_train)
    train_score.append(score_train)
    print("最小样本个数为", s, "的决策树在训练数据上的准确率:", score_train)

    # Accuracy on the test data. The printed value must be score_test:
    # reporting score_train here would show the training accuracy under
    # the test-accuracy label.
    score_test = dtc.score(X_test, y_test)
    test_score.append(score_test)
    print("最小样本个数为", s, "的决策树在测试数据上的准确率:", score_test)

# Plot both accuracy curves against min_samples_leaf.
plt.figure(figsize=(10, 6))
plt.plot(min_samples, train_score, 'o-', linewidth=3, label='train')
plt.plot(min_samples, test_score, 's-', linewidth=3, label='test')
plt.xlabel('min_samples_leaf')
plt.ylabel('score')
plt.ylim(0.9, 1)
plt.legend()
plt.show()