## Decision Tree

In [1]:
# Build the noisy two-moons toy dataset; the fixed seed keeps it reproducible
from sklearn.datasets import make_moons

X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Hold out 20% of the samples as the test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Sanity check: dimensions of the full feature matrix (samples, features)
X.shape

(10000, 2)

In [5]:
# Peek at the label vector (numpy truncates the repr of long arrays)
y

array([1, 0, 0, ..., 1, 0, 1])

In [6]:
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Baseline decision tree with default hyperparameters and a fixed seed.
# NOTE(review): dt_clf is not referenced by any later cell shown here; the grid
# search below constructs its own DecisionTreeClassifier instead.
dt_clf = DecisionTreeClassifier(random_state=42)

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
# Search space: max_leaf_nodes in 2..99 (98 values) crossed with three values of
# min_samples_split, i.e. 294 candidate trees.
param_grid = {'max_leaf_nodes': list(range(2, 100)),
              'min_samples_split': [2, 3, 4]}

# cv is pinned to 3 folds explicitly: the GridSearchCV default changed from 3 to 5
# folds in scikit-learn 0.22, so relying on the default would make the search (and
# potentially the selected hyperparameters) depend on the installed version.
# n_jobs=-1 runs the fits on all available cores; verbose=1 prints progress.
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),
                              param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Done 340 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:    1.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None

In [10]:
# Hyperparameter combination that scored best in the cross-validated search
grid_search_cv.best_params_

{'max_leaf_nodes': 17, 'min_samples_split': 2}

In [11]:
# The tree configured with the best hyperparameters (the search ran with refit=True)
grid_search_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=17,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [12]:
# Score of the tuned tree on the held-out test set (scoring=None, so the
# estimator's own default score is used)
grid_search_cv.score(X_test, y_test)

0.8695

## Grow a forest

生成1000个训练集子集，每个子集包含随机挑选的100个实例：

In [13]:
from sklearn.model_selection import ShuffleSplit

In [14]:
# Size of the training set that the mini training sets are drawn from
X_train.shape

(8000, 2)

In [16]:
n_trees = 1000      # number of training subsets (one per tree in the forest)
n_instances = 100   # number of instances in each subset

# ShuffleSplit yields n_trees random (train, test) index pairs. Setting test_size
# to everything except n_instances leaves exactly n_instances rows on the "train"
# side of each split; only those indices are used here.
# n_splits is tied to n_trees (rather than a second hard-coded 1000) so that the
# number of subsets always matches the number of trees zipped against them later.
shuffle_split = ShuffleSplit(n_splits=n_trees,
                             test_size=len(X_train) - n_instances,
                             random_state=42)

# Each entry is an (X_mini_train, y_mini_train) pair for one tree.
mini_sets = [(X_train[mini_train_idx], y_train[mini_train_idx])
             for mini_train_idx, _ in shuffle_split.split(X_train)]

In [17]:
# Sanity check: one mini training set per tree
len(mini_sets)

1000

利用前面得到的最佳超参数值，在每个子集上训练一个决策树。在测试集上评估这1000个决策树。

In [22]:
from sklearn.metrics import accuracy_score

In [19]:
from sklearn.base import clone

# One fresh, unfitted copy of the tuned tree for every mini training set
best_tree = grid_search_cv.best_estimator_
forest = [clone(best_tree) for _ in range(n_trees)]

In [23]:
# Fit each tree on its own mini training set and record its test-set accuracy
accuracy_scores = []

for tree, mini_set in zip(forest, mini_sets):
    tree.fit(*mini_set)  # mini_set is an (X_mini_train, y_mini_train) pair
    accuracy_scores.append(accuracy_score(y_test, tree.predict(X_test)))

In [26]:
# numpy is used from this cell onward but is not imported by any earlier cell
# shown in the notebook; import it here so a fresh-kernel "Run All" does not
# fail with a NameError.
import numpy as np

# Average test accuracy of the individual trees
np.mean(accuracy_scores)

0.8054494999999999

In [27]:
# Check how the test labels are stored before allocating the prediction matrix
y_test.dtype

dtype('int64')

对每个测试实例，收集1000个决策树各自的预测，然后只保留出现次数最多的预测（即多数投票）。

In [28]:
# Uninitialised vote matrix: one row per test instance, one column per tree.
# Stored as uint8 rather than the labels' own dtype.
votes_shape = (len(X_test), n_trees)
y_test_preds = np.empty(votes_shape, dtype=np.uint8)

In [30]:
# Fill the vote matrix column by column: column k holds tree k's test predictions
for column, fitted_tree in enumerate(forest):
    tree_votes = fitted_tree.predict(X_test)
    y_test_preds[:, column] = tree_votes

In [31]:
# Sanity check: (number of test instances, number of trees)
y_test_preds.shape

(2000, 1000)

In [32]:
# Vote tally for the first test instance: distinct predicted labels and how many
# trees voted for each
np.unique(y_test_preds[0], return_counts=True)

(array([0, 1], dtype=uint8), array([ 49, 951]))

In [35]:
# Majority vote: for every test instance keep the label predicted by the most trees.
# np.unique returns labels in sorted order and argmax picks the first maximum, so a
# tie resolves to the smallest label.
y_preds = []
for votes in y_test_preds:
    classes, tallies = np.unique(votes, return_counts=True)
    winner = classes[np.argmax(tallies)]
    y_preds.append(winner)

In [36]:
# Test-set accuracy of the majority-vote predictions
accuracy_score(y_test, y_preds)

0.872