In [1]:
import numpy as np
import scipy

from sklearn.base import clone
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Generate the noisy two-moons dataset and hold out a test split.
# Both steps are seeded so that Restart & Run All reproduces the same data;
# previously only the split was seeded while the data itself changed every run.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
# Draw a random subset of the training split (without replacement) to tune on.
# A dedicated seeded Generator is used instead of the global np.random state so
# the subset is the same on every run; the size is capped so the draw cannot
# fail if the training split is ever smaller than 6000 rows.
idx = np.random.default_rng(42).choice(
    len(y_train), size=min(6000, len(y_train)), replace=False
)
X_train_sampled = X_train[idx]
y_train_sampled = y_train[idx]

# Hyper-parameter grid for the decision tree.
# NOTE(review): max_depth is capped at 5, so a tree has at most 2**5 = 32
# leaves; max_leaf_nodes values above 32 look redundant — confirm before pruning.
parameters = {
    "max_depth": list(range(1, 6)),
    "min_samples_leaf": list(range(1, 6)),
    "max_features": [1, 2],
    "max_leaf_nodes": [2**i for i in range(1, 10)],
}

# The tree gets a fixed random_state: with max_features below the number of
# features the candidate feature at each split is chosen at random, which
# would otherwise make the search results vary between runs.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    parameters,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

clf.fit(X_train_sampled, y_train_sampled)
print(f"best estimator {clf.best_estimator_}, best score {clf.best_score_}")

best estimator DecisionTreeClassifier(max_depth=2, max_features=2, max_leaf_nodes=4), best score 0.8598333333333332


In [4]:
# Re-run the same grid search on the complete training split; this replaces
# the search results obtained from the sampled subset in the previous cell.
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5], 'max_features': [1, 2],
                         'max_leaf_nodes': [2, 4, 8, 16, 32, 64, 128, 256, 512],
                         'min_samples_leaf': [1, 2, 3, 4, 5]},
             scoring='accuracy')

In [5]:
# Accuracy of the grid-searched model on the data it was fitted on.
y_train_pred = clf.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.8629333333333333

In [6]:
# Accuracy of the grid-searched model on the held-out test split.
y_test_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8592

In [7]:
# Ensemble size and the number of training rows each member sees.
n_clfs = 1000
n_samples = 100

# ShuffleSplit yields n_clfs random subsets of n_samples training indices each.
# The test part of every split (a single index) is discarded.
rs = ShuffleSplit(n_splits=n_clfs, train_size=n_samples, test_size=1, random_state=42)

# One unfitted copy of the best grid-search estimator per subset, fitted on
# that subset only (fit returns the estimator itself).
clfs = [
    clone(clf.best_estimator_).fit(X_train[subset], y_train[subset])
    for subset, _ in rs.split(X_train)
]

In [8]:
def predict(clfs, X):
    """Predict labels for ``X`` by majority vote over an ensemble.

    Parameters
    ----------
    clfs : sequence of fitted estimators
        Each element must provide ``predict(X)`` returning one numeric label
        per row of ``X``.
    X : array-like
        Samples to classify; ``len(X)`` is used as the number of samples.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(X)`` with the most frequent label
        among the ensemble members for each sample (ties are resolved by
        ``scipy.stats.mode``).

    Raises
    ------
    ValueError
        If ``clfs`` is empty, since there is nothing to vote on.
    """
    # Import the submodule explicitly: on older SciPy releases a bare
    # `import scipy` does not make `scipy.stats` available by itself.
    from scipy import stats

    if len(clfs) == 0:
        raise ValueError("predict() needs at least one classifier")

    # Column i holds the predictions of ensemble member i for every sample.
    predictions = np.zeros((len(X), len(clfs)))
    for i, clf in enumerate(clfs):
        predictions[:, i] = clf.predict(X)

    # Depending on the SciPy version, `.mode` has shape (n, 1) or (n,);
    # flatten so callers always receive a 1-D label vector.
    return np.ravel(stats.mode(predictions, axis=1).mode)

In [9]:
# Accuracy of the majority-vote ensemble on the full training split.
y_train_pred = predict(clfs=clfs, X=X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.858

In [10]:
# Accuracy of the majority-vote ensemble on the held-out test split.
y_test_pred = predict(clfs=clfs, X=X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8608