In [29]:
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
import matplotlib.pyplot as plt
import csv

In [24]:
class randomForest(object):
    def __init__(self, n_estimators, max_depth, max_features, random_seed=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_seed = random_seed
        self.row_indices = []
        self.feature_indices = []
        self.out_of_bag = []
        self.decision_trees = [sklearn.tree.DecisionTreeClassifier(max_depth=None, criterion='entropy') for i in range(n_estimators)]

    def bootstrap(self, num_training, num_features, random_seed = None):
        np.random.seed(seed = self.random_seed)
        for i in range(self.n_estimators):
            total = set(list(range(num_training)))
            row_indices, col_indices = np.random.choice(num_training, num_training), np.random.choice(num_features, int(num_features * self.max_features), replace=False)
            total = total - set(row_indices)
            self.row_indices.append(row_indices)
            self.feature_indices.append(col_indices)
            self.out_of_bag.append(total)
    
    def fit_rforest(self, x, y):
        self.bootstrap(np.shape(x)[0], np.shape(x)[1])
        for i in range(self.n_estimators):
            bsX = x[self.row_indices[i]][:,self.feature_indices[i]]
            bsY = y[self.row_indices[i]]
            self.decision_trees[i].fit(bsX, bsY)

    def accuracy(self, X, y):
        accuracy = []
        for i in range(len(X)):
            predictions = []
            for j in range(self.n_estimators):
                if i in self.out_of_bag[j]:
                    predictions.append(self.decision_trees[j].predict(np.reshape(X[i][self.feature_indices[j]], (1,-1)))[0])
            if len(predictions) > 0:
                accuracy.append(np.sum(predictions == y[i]) / float(len(predictions)))
        return np.mean(accuracy)


In [31]:
csv_file = open('KickStarterData_nb.csv', encoding="utf-8")
csv_reader = csv.reader(csv_file, delimiter=',')

csv_list = list(csv_reader)
#remove one for the feature titles
totalN = len(csv_list) - 1
#remove 2 for the id and the y column
D = len(csv_list[1]) - 2

#remove the feature titles
csv_list.pop(0)
#split into x and y, remove id
nparray = np.array(csv_list)

#shape of X: (331675, 6)
X = nparray[0:totalN,1:]
#shape of y: (331675,)
y = nparray[:, D+1]

#split into training and testing data
train = int(totalN*0.8)
Xtrain = X[:train]
print(np.shape(Xtrain))
ytrain = y[:train]
print(np.shape(ytrain))
Xtest = X[train+1:]
print(np.shape(Xtest))
ytest = y[train+1:]
print(np.shape(ytest))

# n_estimators = 10
# max_depth = 3
# max_features = 0.6
# random_seed = 12345

# random_forest = randomForest(n_estimators, max_depth, max_features, random_seed)
# random_forest.fit_rforest(Xtrain, ytrain)
# accuracy = random_forest.accuracy(Xtest, ytest)

random_forest = sklearn.ensemble.RandomForestClassifier()
random_forest.fit(Xtrain, ytrain)
ypredicted = random_forest.predict(Xtest)

accuracy = sklearn.metrics.accuracy_score(ytest, ypredicted)
print(accuracy)

false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(ytest, ypredicted, pos_label='1')
roc_auc = sklearn.metrics.auc(false_positive_rate, true_positive_rate)
print(roc_auc)

(265340, 6)
(265340,)
(66334, 6)
(66334,)
1.0


ValueError: y_true takes value in {'0', '1'} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.