Question 7, Chapter 6: train and fine-tune a decision tree on the moons dataset

In [2]:
import pandas as pd
from sklearn.datasets import make_moons

In [3]:
# Generate the synthetic "moons" dataset (10,000 samples, noise=0.4).
# The fixed random_state makes the generated data reproducible.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Inspect the training features produced by the split.
X_train

array([[-0.56413534,  0.29283681],
       [-1.16033479,  0.96512577],
       [-0.06598769, -0.15191052],
       ...,
       [ 0.38876425, -0.78662881],
       [ 2.50492832,  0.21133631],
       [ 0.35428745,  0.74582457]])

In [7]:
# Inspect the held-out test features produced by the split.
X_test

array([[ 0.69945888, -0.8734481 ],
       [ 1.7764418 ,  0.13222334],
       [-1.14450821,  0.24446319],
       ...,
       [ 0.66336269,  0.79833307],
       [-0.6493245 ,  1.19920859],
       [-0.09883144,  0.40961263]])

In [8]:
# Inspect the training labels.
y_train

array([0, 0, 1, ..., 1, 1, 0])

In [9]:
# Inspect the held-out test labels.
y_test

array([1, 1, 0, ..., 0, 0, 0])

In [10]:
# Sanity check: training features and labels should have the same number of rows.
X_train.shape, y_train.shape

((8000, 2), (8000,))

In [11]:
# Sanity check: test features and labels should have the same number of rows.
X_test.shape, y_test.shape

((2000, 2), (2000,))

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [13]:
# Hyperparameter grid for the decision tree: max_leaf_nodes takes every
# integer from 2 to 49 and max_depth every integer from 2 to 39.
params = {
    "max_leaf_nodes": [*range(2, 50)],
    "max_depth": [*range(2, 40)],
}

In [15]:
# 5-fold cross-validated grid search over `params`. With refit=True the
# best configuration is retrained on all the data passed to fit() once the
# search finishes, and exposed as grid_search.best_estimator_.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=5,
    refit=True,
)

In [16]:
# Run the cross-validated search on the training split only; the test set
# is not passed in here.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                       14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                       24, 25, 26, 27, 28, 29, 30, 31, ...],
                         'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, 20, 21,
                                            22, 23, 24, 25, 26, 27, 28, 29, 30,
                                            31, ...]})

In [17]:
# Show the best tree selected by the grid search.
grid_search.best_estimator_

DecisionTreeClassifier(max_depth=7, max_leaf_nodes=23, random_state=42)

In [18]:
# No explicit fit is needed: the search was created with refit=True, so
# best_estimator_ has already been retrained on the full training set passed
# to grid_search.fit(). Just display the fitted model.
grid_search.best_estimator_

DecisionTreeClassifier(max_depth=7, max_leaf_nodes=23, random_state=42)

In [21]:
# Predict test-set labels with the tuned tree. GridSearchCV.predict delegates
# to the refit best_estimator_.
y_pred = grid_search.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score

In [22]:
# Test-set accuracy of the tuned tree. accuracy_score takes (y_true, y_pred),
# so the ground-truth labels go first.
accuracy_score(y_test, y_pred)

0.8735

Question 8, Chapter 6: grow a forest of decision trees trained on small random subsets and combine them by majority vote

In [28]:
from sklearn.model_selection import ShuffleSplit

In [42]:
# n_instances: number of training rows in each mini training set.
# mini_sets: will hold one (features, labels) pair per mini training set.
mini_sets = list()
n_instances = 100

In [43]:
# 1,000 random splits of the training set. All rows except n_instances go to
# the "test" side, leaving n_instances rows on the "train" side of each split.
split = ShuffleSplit(
    n_splits=1000,
    test_size=len(X_train) - n_instances,
    random_state=42,
)

In [45]:
# Collect one (features, labels) mini training set per shuffle split. Only the
# "train" side of each split is used; the complementary indices are ignored.
# The list is rebuilt from scratch so this cell is idempotent: re-running it
# replaces the subsets instead of appending a second batch to `mini_sets`.
mini_sets = [
    (X_train[mini_train_index], y_train[mini_train_index])
    for mini_train_index, _ in split.split(X_train)
]

In [48]:
from sklearn.base import clone

In [50]:
# Make 1,000 independent copies of the tuned tree, one per mini training set,
# so each can be fitted separately.
forest = [
    clone(grid_search.best_estimator_)
    for _copy_index in range(1000)
]

In [51]:
# Per-tree test accuracies are collected in this list.
accuracy_scores = list()

In [52]:
# Fit every tree in the forest on its own mini training set and record its
# accuracy on the full test set.
# The list is reset here so that re-running this cell does not append a
# duplicate batch of scores.
accuracy_scores = []
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    # Use a dedicated name so the Question 7 `y_pred` (tuned single tree)
    # is not silently overwritten by the last mini-tree's predictions.
    y_pred_mini = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred_mini))

In [53]:
import numpy as np

In [54]:
# Average test accuracy of the individual trees trained on the mini sets.
np.mean(accuracy_scores)

0.80175

In [56]:
# Prediction matrix: one row per tree, one column per test instance.
# The row count is taken from the forest itself rather than a hard-coded 1000,
# so np.empty never leaves uninitialised rows (forest shorter) or triggers an
# IndexError (forest longer) if the number of trees changes.
Y_pred = np.empty([len(forest), len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

In [57]:
from scipy.stats import mode

In [59]:
# Majority vote: take the mode down each column of Y_pred (i.e. across trees)
# to get the most frequent predicted label per test instance, together with
# the number of trees that voted for it.
y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [61]:
# Flatten the vote array to 1-D so it lines up with y_test.
# NOTE(review): whether mode() returns a 2-D result here depends on the SciPy
# version; flattening is harmless either way.
y_pred_majority_votes = y_pred_majority_votes.reshape(-1)

In [63]:
# Test-set accuracy of the majority-vote ensemble. accuracy_score takes
# (y_true, y_pred), so the ground-truth labels go first.
accuracy_score(y_test, y_pred_majority_votes)

0.8735