In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
import math

### 2 Decision Trees
2.1 Trees on the Banana Dataset

In [2]:

n_classes = 2
plot_colors = "bry"
plot_step = 0.02

In [3]:
import csv
train = []
for y, x1,x2 in csv.reader(open("data/banana_train.csv")):
    element = [float(y),float(x1),float(x2)]
    train.append(element)

In [4]:
test = []
for y, x1,x2 in csv.reader(open("data/banana_test.csv")):
    element = [float(y),float(x1),float(x2)]
    test.append(element)

In [5]:
np.random.seed(13)
test = np.asarray(test)
train = np.asarray(train)
np.random.shuffle(train)

In [6]:
train_y = train[:,0]
train_X = train[:,[1,2]]

In [7]:
test_y = test[:,0]
test_X = test[:,[1,2]]

In [8]:
mean = train_X.mean(axis=0)
std = train_X.std(axis=0)
train_X = (train_X - mean) / std
train_y[np.where(train_y == -1)] = 0

mean = test_X.mean(axis=0)
std = test_X.std(axis=0)
test_X = (test_X - mean) / std
test_y[np.where(test_y == -1)] = 0

In [9]:
def draw_banana_decision_tree(depth=10,show_graph=False):
    clf = DecisionTreeClassifier(max_depth=depth,criterion="entropy").fit(train_X, train_y)


    x_min, x_max = train_X[:, 0].min() - 1, train_X[:, 0].max() + 1
    y_min, y_max = train_X[:, 1].min() - 1, train_X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    if show_graph == True:
        cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

        plt.xlabel("feature 1")
        plt.ylabel("feature 2")
        plt.axis("tight")

        for i, color in zip(range(n_classes), plot_colors):
            idx = np.where(train_y == i)
            plt.scatter(train_X[idx, 0], train_X[idx, 1], c=color,
                        cmap=plt.cm.Paired)

        plt.suptitle("Decision surface of a decision tree using paired features")
        plt.legend()
        plt.show()
    
    return clf

In [77]:
draw_banana_decision_tree(depth=10)

Run your code for different depths of decision trees, from 1 through 10, and briefly describe
your observations of the decision surface visualization.

In [78]:
for i in range(1,11):
    drarw_banana_decision_tree(depth=i)

In [91]:
sum(abs(clf.predict(train_X) - train_y))/int(train_X.shape[0])

0.042857142857142858

In [103]:
errors = [[],[]]
for i in range(1,11):
    model = draw_banana_decision_tree(depth=i,show_graph=False)
    errors[0].append(sum(abs(model.predict(train_X) - train_y))/int(train_X.shape[0]))
    errors[1].append(sum(abs(model.predict(test_X) - test_y))/int(test_X.shape[0]))

In [110]:
line_train_error, = plt.plot(range(1,11),errors[0], label="train error")
line_test_error, = plt.plot(range(1,11),errors[1], label="test error")

plt.legend(handles=[line_train_error, line_test_error])
plt.show()

In [117]:
best_depth = errors[1].index(min(errors[1]))+1
print best_depth

6


In [120]:
model = DecisionTreeClassifier(max_depth=best_depth,criterion="entropy").fit(train_X, train_y)
print sum(abs(model.predict(test_X) - test_y))/int(test_X.shape[0])
model = DecisionTreeClassifier(max_depth=best_depth,criterion="gini").fit(train_X, train_y)
print sum(abs(model.predict(test_X) - test_y))/int(test_X.shape[0])

0.124444444444
0.125


In [124]:
features = ["auto","sqrt","log2"]
for feature in features:
    model = DecisionTreeClassifier(max_depth=best_depth,criterion="entropy",max_features=feature).fit(train_X, train_y)
    print sum(abs(model.predict(test_X) - test_y))/int(test_X.shape[0])

0.138333333333
0.131666666667
0.135


### 3 AdaBoost
3.1 Implementation

In [10]:
train_y[np.where(train_y == 0)] = -1
test_y[np.where(test_y == 0)] = -1

In [150]:
# initialize weight with each one equal
weight = np.ones(train_X.shape[0])

err = []
G = []
G_test = []
e_alpha = []
for i in range(0,10):
    model = DecisionTreeClassifier(max_depth=3,criterion="entropy")
    model.fit(train_X, train_y,sample_weight=weight)
    G.append(model.predict(train_X))
    G_test.append(model.predict(test_X))
    err.append(sum(abs(G[i] - train_y)*weight)/sum(weight)/2)
    e_alpha.append(((1-err[i])/err[i]))
    wrong = abs(G[i] - train_y)
    idx = np.where( wrong == 2 )
    weight[idx] = weight[idx] * e_alpha[i]


In [141]:
G_combined = np.zeros(train_X.shape[0])
for i in range(0,len(e_alpha)):
    G_combined = G_combined + G[i] * math.log(e_alpha[i])
G_combined = np.sign(G_combined)
print (sum(abs(G_combined - train_y))/int(train_X.shape[0]))/2

In [147]:
for i in range(0,len(e_alpha)):
    print (sum(abs(G[i] - train_y))/int(train_X.shape[0]))/2

0.188285714286
0.326285714286
0.314
0.390571428571
0.417142857143
0.323428571429
0.386857142857
0.351714285714
0.466
0.417714285714


In [153]:
G_t_combined = np.zeros(test_X.shape[0])
for i in range(0,len(e_alpha)):
    G_t_combined = G_t_combined + G_test[i] * math.log(e_alpha[i])
G_t_combined = np.sign(G_t_combined)
(sum(abs(G_t_combined - test_y))/int(test_X.shape[0]))/2

0.11333333333333333

In [151]:
for i in range(0,len(e_alpha)):
    print (sum(abs(G_test[i] - test_y))/int(test_X.shape[0]))/2

0.190555555556
0.321111111111
0.339444444444
0.413888888889
0.435555555556
0.323888888889
0.384444444444
0.373333333333
0.467222222222
0.418888888889


In [164]:
def draw_banana_decision_tree_with_weight(weight,depth=3,show_graph=True):
    clf = DecisionTreeClassifier(max_depth=depth,criterion="entropy").fit(train_X, train_y,sample_weight=weight)


    x_min, x_max = train_X[:, 0].min() - 1, train_X[:, 0].max() + 1
    y_min, y_max = train_X[:, 1].min() - 1, train_X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    if show_graph == True:
        cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

        plt.xlabel("feature 1")
        plt.ylabel("feature 2")
        plt.axis("tight")

        for i, color in [(-1, 'b'), (1, 'r')]:
            idx = np.where(train_y == i)
            plt.scatter(train_X[idx, 0], train_X[idx, 1], c=color,
                        cmap=plt.cm.Paired, s=weight*2)

        plt.suptitle("Decision surface of a decision tree using paired features")
        plt.legend()
        plt.show()
    
    return clf

In [165]:
# initialize weight with each one equal
weight = np.ones(train_X.shape[0])

err = []
G = []
G_test = []
e_alpha = []
for i in range(0,10):
    model = draw_banana_decision_tree_with_weight(weight=weight,depth=3,show_graph=True)
    model.fit(train_X, train_y,sample_weight=weight)
    G.append(model.predict(train_X))
    G_test.append(model.predict(test_X))
    err.append(sum(abs(G[i] - train_y)*weight)/sum(weight)/2)
    e_alpha.append(((1-err[i])/err[i]))
    wrong = abs(G[i] - train_y)
    idx = np.where( wrong == 2 )
    weight[idx] = weight[idx] * e_alpha[i]
