In [1]:
import pandas as pd
import numpy as np

In [2]:
def loadData(filename):
    """Load a whitespace-delimited data file into feature and label arrays.

    The file is read without a header row. Columns 0 and 1 become the two
    input features and column 2 becomes the label.

    Parameters
    ----------
    filename : str or path-like
        Path of the data file to read.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, 2)
        Feature matrix built from the first two columns.
    Y : numpy.ndarray, shape (n_samples,)
        Label vector built from the third column.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    # Using '\s+' also accepts columns separated by runs of blanks or tabs
    # instead of exactly one whitespace character.
    data = pd.read_csv(filename, sep=r'\s+', header=None, engine='python')
    X = np.array(data.iloc[:, [0, 1]])
    # Selecting a single column by position already yields a 1-D vector.
    Y = np.array(data.iloc[:, 2])
    return X, Y

In [3]:
# Read the training and test splits; each call returns (features, labels).
X_train, Y_train = loadData('hw3_train.dat')
X_test, Y_test = loadData('hw3_test.dat')

In [4]:
class CART:
    """One node of a binary decision tree.

    Attributes
    ----------
    theta : threshold compared against the feature selected by ``dim``.
    dim : index of the feature this node branches on.
    value : prediction stored at the node; ``None`` unless one is supplied.
    left, right : child nodes, both ``None`` until assigned by the caller.
    """

    def __init__(self, theta, dim, value=None):
        # Children start out empty and are attached after construction.
        self.left = self.right = None
        self.theta, self.dim, self.value = theta, dim, value

In [5]:
def gini(Y):
    """Return the Gini impurity of a label vector with classes +1 and -1.

    An empty vector yields 1; otherwise the result is
    ``1 - p(+1)**2 - p(-1)**2`` where ``p`` is the class fraction.
    """
    n_labels = Y.shape[0]
    if n_labels == 0:
        return 1
    frac_pos = np.sum(Y == 1) / n_labels
    frac_neg = np.sum(Y == -1) / n_labels
    return 1 - frac_pos ** 2 - frac_neg ** 2

In [6]:
def decision_stump(x, sep, s, dim):
    """Classify every row of ``x`` with a one-feature threshold rule.

    A row gets ``s`` when its value in column ``dim`` is at least ``sep``
    and ``-s`` otherwise.
    """
    at_or_above = x[:, dim] >= sep
    side = np.where(at_or_above, 1, -1)
    return s * side

In [7]:
def create_sep(X, dim):
    """List the candidate thresholds for feature column ``dim``.

    The candidates are the midpoints between consecutive sorted values of
    the column, followed by one value just above the column maximum.
    """
    ordered = np.sort(X[:, dim])
    # Midpoints of neighbouring sorted values, computed in one vector step.
    thresholds = list((ordered[:-1] + ordered[1:]) / 2)
    # Final candidate lies slightly beyond the largest observed value.
    thresholds.append(ordered[-1] + 1e-10)
    return thresholds

In [8]:
def find_best_branch_in_dimension(X, Y, dim):
    """Find the threshold on feature ``dim`` with the lowest weighted Gini.

    Every candidate from ``create_sep`` splits the samples into two groups.
    The score of a split is ``len(group) * gini(group)`` summed over both
    groups, and the first candidate reaching the minimum score wins.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
    Y : numpy.ndarray, shape (n_samples,)
    dim : int
        Feature column to search.

    Returns
    -------
    gini_min : float
        Lowest weighted impurity found.
    s_final : int
        Stump direction; always 1 (see the note in the body).
    sep : float
        Threshold achieving ``gini_min``.
    """
    # Flipping the stump direction only swaps the two groups, so the score is
    # identical for s = 1 and s = -1. The previous version looped over the
    # set {1, -1}; the second pass could never be strictly better, so it
    # doubled the work without changing the result. Only s = 1 is evaluated.
    s_final = 1
    gini_min = np.inf
    sep = None  # overwritten by the first candidate: any finite score < inf
    for threshold in create_sep(X, dim):
        yhat = decision_stump(X, threshold, s_final, dim)
        y_positive = Y[yhat == 1]
        y_negative = Y[yhat == -1]
        gini_coef = len(y_positive) * gini(y_positive) + len(y_negative) * gini(y_negative)
        # Strict '<' keeps the earliest threshold when several tie.
        if gini_coef < gini_min:
            gini_min = gini_coef
            sep = threshold
    return gini_min, s_final, sep

In [9]:
def find_best_branch(X, Y):
    """Pick the best single-feature split across all feature columns.

    The search runs ``find_best_branch_in_dimension`` on every column of
    ``X`` (previously only columns 0 and 1 were examined). On ties the lowest
    column index wins, which matches the earlier two-column behaviour.

    Returns
    -------
    tuple
        ``(gini, s, sep, dim)`` for the winning split.

    Raises
    ------
    ValueError
        If ``X`` has no feature columns.
    """
    best = None
    for dim in range(X.shape[1]):
        gini_d, s_d, sep_d = find_best_branch_in_dimension(X, Y, dim)
        # Strict '<' so an earlier column is kept when scores are equal.
        if best is None or gini_d < best[0]:
            best = (gini_d, s_d, sep_d, dim)
    if best is None:
        raise ValueError("X must have at least one feature column")
    return best

In [10]:
def stop_cond(X, Y):
    """Tell whether tree growth should stop at this node.

    Returns True when all labels equal the first label, when only one sample
    is left, or when every row of ``X`` equals the first row.
    """
    if not np.any(Y != Y[0]):
        return True
    if X.shape[0] == 1:
        return True
    return not np.any(X != X[0, :])

In [11]:
def _majority_label(Y):
    """Return the most frequent label in ``Y`` (smallest label on a tie)."""
    labels, counts = np.unique(Y, return_counts=True)
    return labels[np.argmax(counts)]


def build_cart(X, Y):
    """Recursively grow a decision tree on ``(X, Y)``.

    Growth stops when ``stop_cond`` holds or when the best split found would
    leave one side empty. A stopped node becomes a leaf holding the majority
    label of ``Y``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
    Y : numpy.ndarray, shape (n_samples,)

    Returns
    -------
    CART
        Root node of the grown (sub)tree.
    """
    if stop_cond(X, Y):
        # With pure labels or a single sample this equals Y[0]. With identical
        # inputs but mixed labels the majority is used instead of whichever
        # label happens to be first.
        return CART(None, None, _majority_label(Y))
    _, _, sep, dim = find_best_branch(X, Y)
    # Whatever the stump direction, samples with x[dim] >= sep go right and
    # the rest go left; predict() routes points the same way.
    go_right = X[:, dim] >= sep
    if go_right.all() or not go_right.any():
        # Degenerate split: one child would receive every sample, so recursing
        # would repeat the same problem forever. Stop with a leaf instead.
        return CART(None, None, _majority_label(Y))
    cart = CART(sep, dim)
    # Boolean-mask indexing already returns fresh arrays; no extra copy needed.
    cart.left = build_cart(X[~go_right], Y[~go_right])
    cart.right = build_cart(X[go_right], Y[go_right])
    return cart

In [28]:
# Grow one decision tree on the full training set.
cart = build_cart(X_train,Y_train)

In [29]:
def internal_node(node):
    """Count the internal (non-leaf) nodes in the subtree rooted at ``node``.

    A node whose ``value`` is set counts as a leaf and contributes 0. Any
    other node contributes 1 plus the counts of its existing children.
    """
    # Identity check: '!= None' delegates to the value's own comparison, which
    # is ambiguous for array-like values.
    if node.value is not None:
        return 0
    count = 1
    for child in (node.left, node.right):
        if child is not None:
            count += internal_node(child)
    return count

In [30]:
# Number of internal (branching) nodes in the tree grown above.
internal_node(cart)

10

In [31]:
def predict(x, cart):
    """Return the tree's prediction for a single sample.

    Parameters
    ----------
    x : indexable
        Feature vector of one sample.
    cart : node object with ``value``, ``dim``, ``theta``, ``left``, ``right``
        Root of the tree to evaluate.

    Returns
    -------
    The ``value`` stored in the leaf that ``x`` reaches.
    """
    node = cart
    # Walk down iteratively so very deep trees cannot hit the recursion limit.
    # 'is None' is an identity check and never invokes the value's comparison.
    while node.value is None:
        # Samples at or above the threshold go right, the rest go left.
        node = node.right if x[node.dim] >= node.theta else node.left
    return node.value

In [75]:
def test_error(X, Y, cart):
    """Return the fraction of rows in ``X`` that ``cart`` labels differently from ``Y``."""
    predictions = predict_vector(X, cart)
    n_wrong = (predictions != Y).sum()
    return n_wrong / len(Y)

In [76]:
def predict_vector(X, cart):
    """Predict every row of the 2-D array ``X`` with ``cart``.

    Returns a float vector holding one prediction per row.
    """
    n_rows, _ = X.shape
    per_row = (predict(X[idx], cart) for idx in range(n_rows))
    return np.fromiter(per_row, dtype=float, count=n_rows)

In [77]:
# Error rate of the single tree on the training data.
test_error(X_train, Y_train,cart)

0.0

In [78]:
# Error rate of the single tree on the test data.
test_error(X_test, Y_test,cart)

0.126

In [70]:
# Module-level aliases for the training set; question() reads X and Y as globals.
X = X_train
Y = Y_train

In [61]:
def bagging(X, Y, rng=None):
    """Draw a bootstrap sample of ``(X, Y)``, the same size as the input.

    Row indices are sampled uniformly with replacement, and the same indices
    are applied to both arrays so samples stay paired with their labels.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
    Y : numpy.ndarray, shape (n_samples,)
    rng : numpy.random.Generator, optional
        Source of randomness. Pass a seeded generator for reproducible draws.
        When omitted, the global ``np.random`` state is used as before.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The resampled features and labels.
    """
    row = X.shape[0]
    if rng is None:
        pos = np.random.randint(0, row, (row,))
    else:
        pos = rng.integers(0, row, size=row)
    return X[pos, :], Y[pos]

In [145]:
def random_forest(X, Y, T):
    """Grow ``T`` trees, each on its own bootstrap sample of ``(X, Y)``.

    Returns the trees as a list in the order they were built.
    """
    return [build_cart(*bagging(X, Y)) for _ in range(T)]

In [146]:
def err_fun(X, Y, cart):
    """Predict every row of ``X`` with ``cart`` and measure the error rate.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
    Y : numpy.ndarray
        True labels; the prediction array is created with the same shape.
    cart : tree root accepted by ``predict``.

    Returns
    -------
    Yhat : numpy.ndarray
        Predictions, one per row of ``X``.
    error : float
        Number of entries where ``Yhat`` differs from ``Y``, divided by the
        number of rows.
    """
    row = X.shape[0]
    Yhat = np.zeros(Y.shape)
    for i in range(row):
        # predict() takes the sample first and the tree second; the arguments
        # were previously passed in the opposite order.
        Yhat[i] = predict(X[i, :], cart)
    return Yhat, np.sum(Yhat != Y) / row

In [169]:
def question(out_iters=5, inner_iters=50):
    """Run the random-forest experiment and print averaged error rates.

    Reads the module-level ``X``, ``Y``, ``X_test`` and ``Y_test``. Each of
    the ``out_iters`` rounds grows a forest of ``inner_iters`` trees and
    combines the tree outputs by an unweighted vote. A vote sum of exactly 0
    is counted as +1.

    Parameters
    ----------
    out_iters : int, default 5
        Number of independent forests to average over.
    inner_iters : int, default 50
        Number of trees per forest.

    Prints, each averaged over ``out_iters`` and in this order:
      1. the forest's error rate on ``(X, Y)``,
      2. the forest's error rate on ``(X_test, Y_test)``,
      3. the mean error rate of the individual trees on ``(X, Y)``.
    """
    ein = 0
    error_in = 0
    error_out = 0
    m = X.shape[0]
    # Column form of the labels so they broadcast against one column per tree.
    y_column = Y.reshape((len(Y), 1))

    for _ in range(out_iters):
        carts = random_forest(X, Y, inner_iters)
        l = len(carts)
        yhat1 = np.zeros((m, l))
        yhat2 = np.zeros((len(X_test), l))
        for i, tree in enumerate(carts):
            yhat1[:, i] = predict_vector(X, tree)
            yhat2[:, i] = predict_vector(X_test, tree)
        # Mean error of the individual trees: mismatches over trees * samples.
        ein += (yhat1 != y_column).sum() / l / len(Y)
        # Majority vote across trees.
        yhat1_sum = np.where(yhat1.sum(axis=1) >= 0, 1, -1)
        yhat2_sum = np.where(yhat2.sum(axis=1) >= 0, 1, -1)
        error_in += (yhat1_sum != Y).sum() / len(Y)
        error_out += (yhat2_sum != Y_test).sum() / len(yhat2)
    print(error_in / out_iters)
    print(error_out / out_iters)
    print(ein / out_iters )

In [170]:
# Run the random-forest experiment and print its averaged error rates.
question()

0.0
0.0792
0.0526
