In [1]:
'''
    TEMPLATE FOR MACHINE LEARNING HOMEWORK
    AUTHOR Eric Eaton, Vishnu Purushothaman Sreenivasan
'''

import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score

class BoostedDT:

    def __init__(self, numBoostingIters=100, maxTreeDepth=3):
        '''
        Constructor
        '''
        #TODO
        self.numBoostingIters = numBoostingIters
        self.maxTreeDepth = maxTreeDepth
        self.w = 0
        self.Pred = 0
        self.Beta = 0
        self.model = np.empty((numBoostingIters),dtype=object)
        self.K = 0    

    def fit(self, X, y):
        '''
        Trains the model
        Arguments:
            X is a n-by-d numpy array
            y is an n-dimensional numpy array
        '''
        #TODO
        self.K = np.unique(y).shape[0] #3, # of calsses
        numInst, numFeat = X.shape  
        self.w = np.ones(numInst) / numInst
        epsilon = 0
        pred = np.zeros([self.numBoostingIters,numInst])
        beta = np.zeros([self.numBoostingIters])
        
        for i in range(self.numBoostingIters):
            treeBoost = tree.DecisionTreeClassifier(criterion='entropy',
                                               max_depth=self.maxTreeDepth)
            treeBoost.fit(X,y,sample_weight=self.w)
            pred[i,:] = treeBoost.predict(X)
            wError = np.array([self.w[j] if pred[i,j] != y[j] else 0 for j in np.arange(numInst)]) # numInst x numFeat # the entries are the corresponding weight of misclassified instances, else 0.
            epsilon = wError.sum()
            beta[i] = 0.5 * (np.log((1-epsilon)/epsilon) + np.log(self.K-1))
#            self.w = np.array([self.w[j]*np.exp(-beta[i]) if pred[i,j] == y[j] else self.w[j]*np.exp(beta[i]) for j in np.arange(numInst)]) # numInst x numFeat # if predicted correctly, weight = weight x exp(-beta), else  weight = weight x exp(beta).
            self.w = np.array([self.w[j] if pred[i,j] == y[j] else self.w[j]*np.exp(beta[i]) for j in np.arange(numInst)]) # numInst x numFeat # if predicted correctly, weight = weight x 1, else  weight = weight x exp(beta).
            self.w = self.w / self.w.sum() # normalize the weights to sum to 1
#            print beta[i], accuracy_score(pred[i,:],y)
            self.model[i] = treeBoost
        self.Pred = pred
        self.Beta = beta

    def predict(self, X):
        '''
        Used the model to predict values for each instance in X
        Arguments:
            X is a n-by-d numpy array
        Returns:
            an n-dimensional numpy array of the predictions
        '''
        #TODO
        rtn = 3*np.ones([X.shape[0]])
        arg = np.zeros([self.K])
        
        for j in range(X.shape[0]):
            for guess in range(self.K):
                arg[guess] = np.array([self.Beta[i] if self.model[i].predict(X[j,:].reshape(1,-1)) == guess else 0 for i in range(self.numBoostingIters)]).sum()
            rtn[j] = np.argmax(arg) # return the prdicted class (0, 1 or 2) which has the max arg value
#            print rtn[j]
        return rtn

In [6]:
"""
======================================================
Test the boostedDT against the standard decision tree
======================================================

Author: Eric Eaton, 2014

"""
print(__doc__)

import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from numpy import loadtxt


from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

# load the data set
iris = datasets.load_iris()
X = iris.data
y = iris.target

#===============================================
filename = 'data/challengeTrainLabeled.dat'
data = loadtxt(filename, delimiter=',')
X = data[:,0:10]
y = np.array([data[:, -1]]).T
#===============================================
#
#n,d = X.shape
#nTrain = int(0.5*n)  #training on 50% of the data
#
## shuffle the data
#idx = np.arange(n)
#np.random.seed(13)
#np.random.shuffle(idx)
#X = X[idx]
#y = y[idx]
#
## split the data
#Xtrain = X[:nTrain,:]
#ytrain = y[:nTrain]
#Xtest = X[nTrain:,:]
#ytest = y[nTrain:]
#
## train the decision tree
#modelDT = DecisionTreeClassifier()
#modelDT.fit(Xtrain,ytrain)
#
### train the boosted DT
##modelBoostedDT = BoostedDT(numBoostingIters=1000, maxTreeDepth=4)
##modelBoostedDT.fit(Xtrain,ytrain)
#
## output predictions on the remaining data
#ypred_DT = modelDT.predict(Xtest)
##ypred_BoostedDT = modelBoostedDT.predict(Xtest)
#
## compute the training accuracy of the model
#accuracyDT = accuracy_score(ytest, ypred_DT)
##accuracyBoostedDT = accuracy_score(ytest, ypred_BoostedDT)
#
#print "Decision Tree Accuracy = "+str(accuracyDT)
##print "Boosted Decision Tree Accuracy = "+str(accuracyBoostedDT)
#
#numBoostingIters_range = [100]
#maxTreeDepth_range = [2,3,4]
#
##pipe = Pipeline([('bt', BoostedDT)])
##param_grid = [{'bt__numBoostingIters': numBoostingIters_range,
##               'bt__maxTreeDepth':maxTreeDepth_range}]
##gs = GridSearchCV(estimator=pipe,
##                  param_grid=param_grid,
##                  scoring='accuracy',
##                  cv=2,
##                  n_jobs=1,
##                  verbose=10)
##gs = gs.fit(Xtrain,ytrain)
##print(gs.best_score_)
##print(gs.best_params_)

numBoostingIters_range = [100, 500, 1000]
maxTreeDepth_range = [1,2,3,4,5]

y = y.reshape([y.shape[0],])

from sklearn.cross_validation import StratifiedKFold
kfold = StratifiedKFold(y=y,
                        n_folds=5)
scores = []

for numBoostingIters in numBoostingIters_range:
    for maxTreeDepth in maxTreeDepth_range:
        print ("numBoostingIters: %d, maxTreeDepth: %d" % (numBoostingIters, maxTreeDepth))
        for k, (train, test) in enumerate(kfold):    
            modelBoostedDT = BoostedDT(numBoostingIters=numBoostingIters, maxTreeDepth=maxTreeDepth)
            modelBoostedDT.fit(X[train],y[train])
            ypred_BoostedDT = modelBoostedDT.predict(X[test])
            accuracyBoostedDT = accuracy_score(y[test], ypred_BoostedDT)
            scores.append(accuracyBoostedDT)
#            print "Fold:", k+1, "Class dist:", np.bincount(y[train]), "Accuracy:", accuracyBoostedDT
#            print "Fold:", k+1, "Accuracy:", accuracyBoostedDT
#            print "Boosted Decision Tree Accuracy = "+str(accuracyBoostedDT)
        print ("CV accuracy: %.3f, +-%.3f" % (np.mean(scores), np.std(scores)))
        print ("========================")
            


Test the boostedDT against the standard decision tree

Author: Eric Eaton, 2014


numBoostingIters: 100, maxTreeDepth: 1
CV accuracy: 0.464, +-0.006
%s
numBoostingIters: 100, maxTreeDepth: 2
CV accuracy: 0.619, +-0.156
%s
numBoostingIters: 100, maxTreeDepth: 3
CV accuracy: 0.692, +-0.164
%s
numBoostingIters: 100, maxTreeDepth: 4
CV accuracy: 0.732, +-0.158
%s
numBoostingIters: 100, maxTreeDepth: 5
CV accuracy: 0.758, +-0.151
%s
numBoostingIters: 500, maxTreeDepth: 1
CV accuracy: 0.735, +-0.147
%s
numBoostingIters: 500, maxTreeDepth: 2
CV accuracy: 0.746, +-0.139
%s
numBoostingIters: 500, maxTreeDepth: 3
CV accuracy: 0.759, +-0.134
%s
numBoostingIters: 500, maxTreeDepth: 4
CV accuracy: 0.771, +-0.131
%s
numBoostingIters: 500, maxTreeDepth: 5
CV accuracy: 0.780, +-0.128
%s
numBoostingIters: 1000, maxTreeDepth: 1
CV accuracy: 0.766, +-0.130
%s
numBoostingIters: 1000, maxTreeDepth: 2
CV accuracy: 0.770, +-0.125
%s
numBoostingIters: 1000, maxTreeDepth: 3
CV accuracy: 0.776, +-0.122
%s
numB

In [9]:
"""
======================================================
Test the boostedDT against the standard decision tree
======================================================

Author: Eric Eaton, 2014

"""
print(__doc__)

import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from numpy import loadtxt


from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

# load the data set
iris = datasets.load_iris()
X = iris.data
y = iris.target

#===============================================
filename = 'data/challengeTrainLabeled.dat'
data = loadtxt(filename, delimiter=',')
X = data[:,0:10]
y = np.array([data[:, -1]]).T
#===============================================

numBoostingIters_range = [1000, 1500, 2000]
maxTreeDepth_range = [3,4,5,6,7]

y = y.reshape([y.shape[0],])

from sklearn.cross_validation import StratifiedKFold
kfold = StratifiedKFold(y=y,
                        n_folds=5)
scores = []

for numBoostingIters in numBoostingIters_range:
    scores = []
    for maxTreeDepth in maxTreeDepth_range:
        print ("numBoostingIters: %d, maxTreeDepth: %d" % (numBoostingIters, maxTreeDepth))
        scores = []
        for k, (train, test) in enumerate(kfold):    
            modelBoostedDT = BoostedDT(numBoostingIters=numBoostingIters, maxTreeDepth=maxTreeDepth)
            modelBoostedDT.fit(X[train],y[train])
            ypred_BoostedDT = modelBoostedDT.predict(X[test])
            accuracyBoostedDT = accuracy_score(y[test], ypred_BoostedDT)
            scores.append(accuracyBoostedDT)
#            print "Fold:", k+1, "Class dist:", np.bincount(y[train]), "Accuracy:", accuracyBoostedDT
#            print "Fold:", k+1, "Accuracy:", accuracyBoostedDT
#            print "Boosted Decision Tree Accuracy = "+str(accuracyBoostedDT)
        print ("CV accuracy: %.3f, +-%.3f" % (np.mean(scores), np.std(scores)))
        print ("========================")
            


Test the boostedDT against the standard decision tree

Author: Eric Eaton, 2014


numBoostingIters: 1000, maxTreeDepth: 3
CV accuracy: 0.850, +-0.010
numBoostingIters: 1000, maxTreeDepth: 4
CV accuracy: 0.862, +-0.016
numBoostingIters: 1000, maxTreeDepth: 5
CV accuracy: 0.864, +-0.014
numBoostingIters: 1000, maxTreeDepth: 6
CV accuracy: 0.869, +-0.010
numBoostingIters: 1000, maxTreeDepth: 7
CV accuracy: 0.871, +-0.008
numBoostingIters: 1500, maxTreeDepth: 3
CV accuracy: 0.849, +-0.009
numBoostingIters: 1500, maxTreeDepth: 4
CV accuracy: 0.860, +-0.017
numBoostingIters: 1500, maxTreeDepth: 5
CV accuracy: 0.866, +-0.013
numBoostingIters: 1500, maxTreeDepth: 6
CV accuracy: 0.869, +-0.010
numBoostingIters: 1500, maxTreeDepth: 7
CV accuracy: 0.871, +-0.009
numBoostingIters: 2000, maxTreeDepth: 3
CV accuracy: 0.850, +-0.007
numBoostingIters: 2000, maxTreeDepth: 4
CV accuracy: 0.860, +-0.015
numBoostingIters: 2000, maxTreeDepth: 5
CV accuracy: 0.865, +-0.014
numBoostingIters: 2000, maxTreeDe