In [1]:
import numpy as np
from random_forest import RandomForest
from decision_tree import DecisionTree

In [2]:
# Load Data
data = np.loadtxt('SPECTF.dat', delimiter=',')
print('Dimensions:', np.shape(data))
print('First three rows: \n', data[:3,:])

Dimensions: (267, 45)
First three rows: 
 [[  1.  59.  52.  70.  67.  73.  66.  72.  61.  58.  52.  72.  71.  70.
   77.  66.  65.  67.  55.  61.  57.  68.  66.  72.  74.  63.  64.  56.
   54.  67.  54.  76.  74.  65.  67.  66.  56.  62.  56.  72.  62.  74.
   74.  64.  67.]
 [  1.  72.  62.  69.  67.  78.  82.  74.  65.  69.  63.  70.  70.  72.
   74.  70.  71.  72.  75.  66.  65.  73.  78.  74.  79.  74.  69.  69.
   70.  71.  69.  72.  70.  62.  65.  65.  71.  63.  60.  69.  73.  67.
   71.  56.  58.]
 [  1.  71.  62.  70.  64.  67.  64.  79.  65.  70.  69.  72.  71.  68.
   65.  61.  61.  73.  71.  75.  74.  80.  74.  54.  47.  53.  37.  77.
   68.  72.  59.  72.  68.  60.  60.  73.  70.  66.  65.  64.  55.  61.
   41.  51.  46.]]


In [3]:
def accuracy_score(Y_true, Y_pred):
    '''
    :param Y_true: true labels of test set
    :param Y_pred: predicted labels of test set
    :return: Computes the accuracy of a model.
    '''
    # return accuracy of th
    N = len(Y_true)
    count = 0
    for i in range(N):
        if Y_true[i] == Y_pred[i]:
            count = count+1
    return count/N * 100

def sigmoid(s):
    return 1/(1+np.exp(-s))

def normalized_gradient(X, Y, beta, l):
    X = np.array(X)
    Y = np.array(Y)
    beta = np.array(beta)
    return np.sum([-Y[i] * X[i] * (1 - sigmoid(Y[i]*(beta.T.dot(X[i])))) for i in range(Y.shape[0])], axis=0)/Y.shape[0] + l*beta/Y.shape[0]


def gradient_descent(X, Y, epsilon=1e-6, l=1, step_size=0.1, max_steps=1000):
    beta = np.zeros(X.shape[1])
    mean = np.hstack((0, np.mean(X[:,1:], axis=0)))
    sigma = np.std(X[:,1:], axis = 0) 
    std = np.hstack((1, sigma))
    lam = np.hstack((0, l/(sigma**2)))
    X_scaled = (X-mean)/std
    for _ in range(max_steps):
        grad = normalized_gradient(X_scaled, Y, beta, lam)
        beta = beta-step_size*grad
    beta[0] = beta[0]-np.sum((mean*beta)/std)
    beta[1:] = beta[1:]/std[1:]
    return beta

In [4]:
def evaluate_performance():
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-folds cross validation
    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy
    ** Note that your implementation must follow this API**
    '''

    folds = 10
    X = np.array(data[:, 1:])
    Y = np.array([data[:, 0]]).T
    n = X.shape[0]
    
    dt_accuracies = []
    rf_accuracies = []
    log_accuracies = []
    
    for trial in range(3):
        idx = np.arange(n)
        np.random.seed(13)
        np.random.shuffle(idx)

        X = X[idx]
        Y = Y[idx]

        tree_acc = []
        forest_acc = []
        log_acc = []
        for it in range(folds):
            X_test = X[it::folds, :]
            Y_test = Y[it::folds, :]
    
            X_train = [X[i] for i in range(len(X)) if i % folds != it]
            Y_train = [Y[i] for i in range(len(Y)) if i % folds != it]

            
            # Decision Tree Classifier
            classifier_dt = DecisionTree(15)
            classifier_dt.fit(X_train, Y_train)
            Y_pred = classifier_dt.predict(X_test)
            tree_acc.append(accuracy_score(Y_test, Y_pred))


            # Random Forest Classifier
            classifier_rf = RandomForest(40, 15)
            classifier_rf.fit(X_train, Y_train)
            Y_pred = classifier_rf.predict(X_test)[0]
            forest_acc.append(accuracy_score(Y_test, Y_pred))


            # Logistic Regression Classifier
            X_train = np.array(X_train)
            X_train = np.column_stack((np.ones(len(X_train)), X_train))
            beta_hat = gradient_descent(X_train, Y_train, epsilon=1e-3, l=1, step_size=0.1, max_steps=200)
            Y_pred = [1 if a >= 0 else 0 for a in X_train.dot(beta_hat)]
            log_acc.append(accuracy_score(Y_train, Y_pred))


        dt_accuracies.append(np.mean(tree_acc))
        rf_accuracies.append(np.mean(forest_acc))
        log_accuracies.append(np.mean(log_acc))

    # compute the training accuracy of the models
    meanDecisionTreeAccuracy = np.mean(dt_accuracies)
    stddevDecisionTreeAccuracy = np.std(dt_accuracies)
    
    meanRandomForestAccuracy = np.mean(rf_accuracies)
    stddevRandomForestAccuracy = np.std(rf_accuracies)
    
    meanLogisticRegressionAccuracy = np.mean(log_accuracies)
    stddevLogisticRegressionAccuracy = np.std(log_accuracies)


    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats

In [6]:
stats = evaluate_performance()
print ("Decision Tree Accuracy = ", stats[0, 0], " (", stats[0, 1], ")")
print ("Random Forest Tree Accuracy = ", stats[1, 0], " (", stats[1, 1], ")")
print ("Logistic Reg. Accuracy = ", stats[2, 0], " (", stats[2, 1], ")")

Decision Tree Accuracy =  72.8015194682  ( 0.883216411521 )
Random Forest Tree Accuracy =  80.7977207977  ( 0.483631695214 )
Logistic Reg. Accuracy =  79.4011065007  ( 0.000141164692413 )
