In [2]:
import numpy as np
import matplotlib.pyplot as plt
from random_forest import RandomForest
from decision_tree import DecisionTree
from logistic_regression import gradient_descent

In [3]:
def accuracy_score(Y_true, Y_predict):
    acc = 0.
    for y_t, y_p in zip(Y_true, Y_predict):
        #print(y_t, ' ', y_p)
        acc += (y_t == y_p)
    return acc / len(Y_true)

In [7]:
def get_decision_tree_accuracy(trainX, trainY, testX, testY):
    dtree = DecisionTree(100)
    dtree.fit(trainX, trainY)
    dtree_predicted = dtree.predict(testX)
   # print(accuracy_score(testY, dtree_predicted))
    return accuracy_score(testY, dtree_predicted)

def get_random_forest_accuracy(trainX, trainY, testX, testY):
    forest = RandomForest(10, 100)
    forest.fit(trainX, trainY)
    forest_predicted = forest.predict(testX)[0]
  #  print(accuracy_score(testY, forest_predicted))
    return accuracy_score(testY, forest_predicted)

def evaluate_performance():
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')

    X = data[:, 1:]
    Y = np.array(data[:, 0])
    n = X.shape[0]
    folds = 10

    dtree_accuracies = []
    forest_accuracies = []
    log_accuracies = []

    np.random.seed(13)

    for trial in range(4):
        idx = np.arange(n)
        np.random.shuffle(idx)
        X = X[idx]
        Y = Y[idx]

        print("trial", trial + 1)

        train_size = int((folds - 1) / (folds) * len(X))

        trainX = X[:train_size]
        testX = X[train_size:]
        trainY = Y[:train_size]
        testY = Y[train_size:]

        # train decision tree
        dtree_accuracies.append(get_decision_tree_accuracy(trainX, trainY, testX, testY))

        # train random forest
        forest_accuracies.append(get_random_forest_accuracy(trainX, trainY, testX, testY))

        # train logistic regression
        dd = np.array(trainX)
        data_train = np.column_stack((np.ones(dd.shape[0]), dd))
        label_train = [-1 if a == 0 else 1 for a in trainY]
        beta_hat = gradient_descent(data_train, label_train, epsilon=1e-3, l=1, step_size=0.1, max_steps=200)
        
        y_pred = [1 if a >= 0 else -1 for a in data_train.dot(beta_hat)]
        log_accuracies.append(accuracy_score(label_train, y_pred))

    # compute the training accuracy of the model
    mean_decision_tree_accuracy = np.mean(dtree_accuracies)
    stddev_decision_tree_accuracy = np.std(dtree_accuracies)
    mean_log_regression_accuracy = np.mean(log_accuracies)
    stddev_log_regression_accuracy = np.std(log_accuracies)
    mean_random_forest_accuracy = np.mean(forest_accuracies)
    stddev_random_forest_accuracy = np.std(forest_accuracies)

    # make certain that the return value matches the API specification
    results = np.zeros((3, 2))
    results[0, 0] = mean_decision_tree_accuracy
    results[0, 1] = stddev_decision_tree_accuracy
    results[1, 0] = mean_random_forest_accuracy
    results[1, 1] = stddev_random_forest_accuracy
    results[2, 0] = mean_log_regression_accuracy
    results[2, 1] = stddev_log_regression_accuracy
    return results


# Do not modify from HERE...
if __name__ == "__main__":
    results = evaluate_performance()
    print("Decision Tree Accuracy = ", results[0, 0], " (", results[0, 1], ")")
    print("Random Forest Tree Accuracy = ", results[1, 0], " (", results[1, 1], ")")
    print("Logistic Reg. Accuracy = ", results[2, 0], " (", results[2, 1], ")")
# ...to HERE.


trial 1
0.666666666667
0.703703703704
acc of logistic=  0.8791666666666667
trial 2
0.740740740741
0.740740740741
acc of logistic=  0.8791666666666667
trial 3
0.814814814815
0.888888888889
acc of logistic=  0.8625
trial 4
0.888888888889
0.888888888889
acc of logistic=  0.85
Decision Tree Accuracy =  0.777777777778  ( 0.0828173325 )
Random Forest Tree Accuracy =  0.805555555556  ( 0.0843558664736 )
Logistic Reg. Accuracy =  0.867708333333  ( 0.0122810688777 )
