# P047 决策树-训练决策树分类模型

In [3]:
import numpy as np
import pandas as pd

In [7]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Seed NumPy's global generator; scikit-learn calls that are left with
# random_state=None draw from this global state.
np.random.seed(42)

# Synthetic two-class "moons" data set: 2000 points, noise level 0.25.
# make_moons hands back a (features, labels) pair, which is unpacked below.
raw_data = make_moons(noise=0.25, n_samples=2000, random_state=42)
data, target = raw_data

In [15]:
# Sanity check of the feature matrix dimensions: (n_samples, n_features).
data.shape

(2000, 2)

In [17]:
# Sanity check of the label vector: one class label per sample.
target.shape

(2000,)

In [19]:
# Hold out a test set (scikit-learn's default test_size: 25% of the samples).
# random_state is pinned so the split is identical every time this cell is
# executed, instead of depending on how much of NumPy's global random stream
# has already been consumed by earlier (re-)executions.
x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [21]:
# Baseline tree with default hyper-parameters (no depth limit).
# random_state is pinned because the tree permutes the features at every
# split; without it, ties between equally good splits are broken according to
# NumPy's global random state and the fitted tree can change between runs.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(x_train, y_train)

In [23]:
# Mean accuracy of the fitted tree on the held-out test split.
classifier.score(x_test, y_test)

0.902

# P048 决策树-使用决策树训练分类模型

In [26]:
# max_depth树的最大深度
# 如果为None，表示树的深度不限
# 直到所有的叶子节点都是纯净的，即叶子节点中所有的样本点都属于同一个类别

In [28]:
# Same model, but with the depth capped at 6 levels to limit over-fitting.
# random_state is pinned so that any score difference against other settings
# comes from the hyper-parameters, not from random tie-breaking between
# equally good splits.
classifier = DecisionTreeClassifier(max_depth=6, random_state=42)
classifier.fit(x_train, y_train)

In [30]:
# Mean accuracy of the depth-limited tree on the held-out test split.
classifier.score(x_test, y_test)

0.928

# P049 决策树-min_samples_leaf叶结点所需的最小样本数

In [33]:
# 参数：min_samples_leaf叶节点所需的最小样本数
# 整数或者浮点数，默认为1
# 它指定了每个叶子节点包含的最少样本数
# 如果为浮点数（取值在0到1之间），则表示比例：每个叶子节点包含的最少样本数为ceil(min_samples_leaf * n_samples)

In [37]:
# Depth capped at 6 and every leaf required to hold at least 6 training samples.
# random_state is pinned so that any score difference against other settings
# comes from the hyper-parameters, not from random tie-breaking between
# equally good splits.
classifier = DecisionTreeClassifier(max_depth=6, min_samples_leaf=6, random_state=42)
classifier.fit(x_train, y_train)

In [39]:
# Mean accuracy on the held-out test split with both constraints applied.
classifier.score(x_test, y_test)

0.93

# P050 决策树-网格搜索得到最优模型参数

In [44]:
# GridSearchCV的名字可以拆分为两个部分，GridSearch和CV即网格搜索和交叉验证
# 网格搜索，搜索的是参数，即在指定的参数范围内，按步长依次调整参数，利用调整的参数训练学习器，从所有的参数中找到在验证集上精度最高的参数，是一个训练和比较的过程
# 问题：搜索最优的max_depth, min_samples_leaf参数
# 搜索范围：
# max_depth: np.arange(1, 10)
# min_samples_leaf: np.arange(1, 20)

In [46]:
from sklearn.model_selection import GridSearchCV

In [48]:
# Search grid for GridSearchCV. np.arange excludes the stop value, so this
# covers depths 1..9 and leaf sizes 1..19 (9 * 19 = 171 combinations).
params = dict(
    max_depth=np.arange(1, 10),
    min_samples_leaf=np.arange(1, 20),
)

In [50]:
# Exhaustive search over `params`, ranking candidates by accuracy under
# 5-fold cross-validation. `classifier` only serves as the estimator template:
# the grid supplies max_depth and min_samples_leaf for every candidate.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=params,
    cv=5,
    scoring="accuracy",
)

In [54]:
# Run the cross-validated search on the training split only; the held-out
# test split (x_test / y_test) is not passed in here.
grid_search.fit(x_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [56]:
# Parameter combination that achieved the best mean cross-validated accuracy.
grid_search.best_params_

{'max_depth': 6, 'min_samples_leaf': 6}