In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

In [3]:
# Synthetic two-class "moons" dataset: 10,000 noisy samples.
# The generator is seeded so that a fresh Restart & Run All rebuilds exactly
# the same data; without a seed every score further down changes per run.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [6]:
# Hold out 20% of the samples for validation; the seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [8]:
# Quick look at the training labels (NumPy abbreviates long arrays in the repr).
y_train

array([0, 1, 1, ..., 1, 1, 1])

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Base decision tree with a fixed seed; it is handed to the grid search below
# and later receives the best hyper-parameters via set_params.
tree_clf = DecisionTreeClassifier(random_state=1)


In [26]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree.
#
# 'log2' has been dropped from max_features: the moons data has only 2
# features, and for 2 features both 'sqrt' and 'log2' resolve to
# max_features=1. With the tree's fixed random_state the two settings build
# identical trees, so keeping both only refits a third of the grid twice.
# The selected best_params_ are unaffected, because on tied scores the
# earlier candidate ('sqrt') is the one that is kept.
param_grid = {
    'max_depth': np.arange(3, 10),
    'max_leaf_nodes': np.arange(2, 100),
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3],
}

# Exhaustive search: 5-fold cross-validation, scored by accuracy, using all
# available cores. This is still a large grid, so the cell is slow to run.
grid_search = GridSearchCV(tree_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

In [27]:
# Copy the winning hyper-parameters onto the stand-alone tree.
tree_clf.set_params(**grid_search.best_params_)

# A bare expression in the middle of a cell is evaluated and then thrown
# away (only the last expression is displayed), so print the parameters
# explicitly to actually show them.
print(grid_search.best_params_)

# Fit the tuned tree on the full training split.
tree_clf.fit(x_train, y_train)

In [28]:
# 10-fold cross-validated scores of the tuned tree on the training split,
# followed by their average.
tree_clf_scores = cross_val_score(
    estimator=tree_clf,
    X=x_train,
    y=y_train,
    cv=10,
)
np.mean(tree_clf_scores)

0.851375

In [29]:
from sklearn.metrics import accuracy_score

# Predict the held-out validation split with the fitted grid search object
# and report the fraction of correct labels.
y_pred = grid_search.predict(x_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.854

[CV] END criterion=entropy, max_depth=8, max_features=sqrt, max_leaf_nodes=5, min_impurity_decrease=0.3, min_samples_leaf=1; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, max_features=sqrt, max_leaf_nodes=5, min_impurity_decrease=0.3, min_samples_leaf=1; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, max_features=sqrt, max_leaf_nodes=5, min_impurity_decrease=0.3, min_samples_leaf=1; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, max_features=sqrt, max_leaf_nodes=5, min_impurity_decrease=0.3, min_samples_leaf=1; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, max_features=sqrt, max_leaf_nodes=5, min_impurity_decrease=0.3, min_samples_leaf=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, max_features=sqrt, max_leaf_nodes=5, min_impurity_decrease=0.3, min_samples_leaf=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, max_features=sqrt, max_leaf_nodes=5, min_impurity_decrease=0.3, min_samples_leaf=2; total time=

# Question 8

In [31]:
from sklearn.model_selection import ShuffleSplit

In [32]:
# Exploratory check of the ShuffleSplit API: build a 1000-split splitter and
# ask how many splits it reports.
# NOTE: `rs` is rebound to a differently configured splitter in a later cell,
# so this instance is not the one used to draw the training subsets.
rs = ShuffleSplit(n_splits=1000, random_state=1)
rs.get_n_splits(X)

1000

In [33]:
# Display the splitter's repr to inspect its settings.
rs

ShuffleSplit(n_splits=1000, random_state=1, test_size=None, train_size=None)

In [38]:
from sklearn.model_selection import ShuffleSplit

# Size of the hand-built ensemble and of each tree's training subset.
n_trees = 1000       # number of random subsets (one per tree)
n_instances = 100    # training rows drawn for each subset

# One split per tree. Everything outside the n_instances "train" rows is
# assigned to the "test" side of the split, which is not used here.
rs = ShuffleSplit(
    n_splits=n_trees,
    test_size=len(x_train) - n_instances,
    random_state=42,
)

# Collect the (features, labels) pair for each random subset.
mini_sets = []
for mini_train_index, mini_test_index in rs.split(x_train):
    X_mini_train, y_mini_train = x_train[mini_train_index], y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [40]:
from sklearn.base import clone

# Start every tree as an unfitted copy of the best estimator found by the grid search.
best_tree = grid_search.best_estimator_
forest = [clone(best_tree) for _ in range(n_trees)]

# Fit each tree on its own small subset and record how well that single
# tree does on the validation split.
accuracy_scores = []
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    y_pred = tree.fit(X_mini_train, y_mini_train).predict(x_val)
    accuracy_scores.append(accuracy_score(y_val, y_pred))

# Average validation accuracy of the individual trees.
np.mean(accuracy_scores)

0.7935975

In [42]:
# Prediction matrix: one row per tree, one column per validation sample.
Y_pred = np.empty((n_trees, len(x_val)), dtype=np.uint8)

# Fill each row with the corresponding tree's predicted labels.
for tree_index, tree in enumerate(forest):
    Y_pred[tree_index, :] = tree.predict(x_val)

In [43]:
from scipy.stats import mode

# Column-wise mode over the trees: for each validation sample, the label that
# most trees predicted, plus the number of trees that voted for it.
vote_result = mode(Y_pred, axis=0)
y_pred_majority_votes, n_votes = vote_result.mode, vote_result.count

In [45]:
# Flatten the majority-vote labels to 1-D and score the ensemble on the validation split.
accuracy_score(y_val, y_pred_majority_votes.ravel())

0.86