In [17]:
import pandas as pd, numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier as cart_clf
from sklearn.model_selection import train_test_split
from sklearn.tree._tree import TREE_LEAF
from sklearn.metrics import accuracy_score

data = pd.read_csv('spam.data.csv', delimiter = ' ', header = None)
X, y = data.iloc[:, :-1].values, data.iloc[:, -1:].values.flatten()

In [18]:
cart_model = cart_clf(random_state=0)

### Part b and c

In [19]:
def countLeafNodes(tree, root, count) :
    left = t.children_left[root]
    right = t.children_right[root]
    
    if left == right == TREE_LEAF :
        return count + 1
    elif left == TREE_LEAF != right:
        return countLeafNodes(t, right, count)
    elif left != TREE_LEAF == right:
        return countLeafNodes(t, left, count)
    else :
        return countLeafNodes(t, left, count) + countLeafNodes(t, right, count)
    
# prune leaves at a given depth
def pruneLeaves(tree, root, rem_depth) :
    if root == TREE_LEAF : return
    
    if rem_depth == 0 :
        tree.children_left[root] = tree.children_left[root] = TREE_LEAF
        return
    
    left = tree.children_left[root]
    right = tree.children_right[root]
    
    pruneLeaves(tree, left, rem_depth - 1)
    pruneLeaves(tree, right, rem_depth - 1)
    
def cross_validation(X, y, size ) :
    scores = []
    for i in range(size) :
        X_train, X_test, y_train, y_test = train_test_split( X, y, test_size= 1.0/size) #random selection

        cart_model.fit(X_train, y_train)

        y_pred = cart_model.predict(X_test)
        scores.append(accuracy_score(y_test,y_pred))
    np_score = np.hstack(scores)
    return 1.0 - np.mean(np_score)

In [20]:
#original
print ("original score {}".format(round((cross_validation(X, y, size = 10)),3)))
max_depth = cart_model.tree_.max_depth
print("Max depth of tree {}".format(max_depth))
min_score = 1.0
min_index = -1
for i in range(max_depth, 0, -1) :
    score = round(cross_validation(X, y, size = 10),3)
    (min_index, min_score) = (i, score) if score < min_score else (min_index, min_score)
    print("Pruning {} --> {}".format(i, score))
    pruneLeaves( tree=cart_model.tree_, root = 0, rem_depth = i )
    

print("Optimal tree by pruning last {} levels and score : {}".format (max_depth - min_index, min_score))
    


original score 0.092
Max depth of tree 34
34 --> 0.089
33 --> 0.087
32 --> 0.082
31 --> 0.089
30 --> 0.084
29 --> 0.083
28 --> 0.086
27 --> 0.083
26 --> 0.091
25 --> 0.082
24 --> 0.082
23 --> 0.085
22 --> 0.082
21 --> 0.077
20 --> 0.083
19 --> 0.09
18 --> 0.089
17 --> 0.083
16 --> 0.081
15 --> 0.081
14 --> 0.086
13 --> 0.079
12 --> 0.08
11 --> 0.09
10 --> 0.084
9 --> 0.088
8 --> 0.084
7 --> 0.082
6 --> 0.088
5 --> 0.084
4 --> 0.085
3 --> 0.08
2 --> 0.082
1 --> 0.078
Optimal tree by pruning last 13 levels and score : 0.077
