In [1]:
import numpy as np
import random
from datetime import datetime

from multiprocessing import cpu_count
import numpy as np
from rerf.RerF import fastPredict, fastPredictPost, fastRerF



In [2]:
class DecisionNode:
    
    #Initialization
        
    def __init__(self, data, max_height, height = 0):
        self.data = data
        self.terminal = False
        self.max_height = max_height
        self.height = height
    
    
    #Printing for debugging
    def dataToString(self):
        print(self.data)
        
    #Split node and create child nodes
    def split(self):
        if self.height == self.max_height or len(self.data) < 10:
            self.terminal = True
            self.prediction = max(set([row[-1] for row in self.data]), key = [row[-1] for row in self.data].count)
        else:
            (group_1, group_2, self.condition, self.feature, gi) = find_best_partition(self.data)
        
            if len(group_1) == 0 or len(group_2) == 0:
                self.terminal = True
                self.prediction = max(set([row[-1] for row in self.data]), key = [row[-1] for row in self.data].count)
            else:
                self.left = DecisionNode(group_1, self.max_height, self.height + 1)
                self.right = DecisionNode(group_2, self.max_height, self.height + 1)
                self.left.split()
                self.right.split()
    
    def predict(self, test_data):
        if self.terminal == True:
            return self.prediction
        else:     
            if isinstance(test_data[self.feature], str):
                if test_data[self.feature] == self.condition:
                    return self.left.predict(test_data)
                else:
                    return self.right.predict(test_data)
            else:
                if test_data[self.feature] <= self.condition:
                    return self.left.predict(test_data)
                else:
                    return self.right.predict(test_data)

In [3]:
#Partitions data depending on condition
def partition(data, feature, condition):
    group_1 = [];
    group_2 = [];
    
    if isinstance(data[0][feature], str):
        for row in data:
            if row[feature] == condition:
                group_1.append(row)
            else:
                group_2.append(row)
    else:
        for row in data:
            if row[feature] <= condition:
                group_1.append(row)
            else:
                group_2.append(row)
    return group_1, group_2


#calculates Gini score of a group
def gini_score(data):
    classifications = [row[-1] for row in data]
    num = len(classifications)
    if num == 0:
        return 1
    conditions = list(set(classifications))
    sum_ = 0
    for i in conditions:
        proportion = classifications.count(i)/num
        sum_ = sum_ + proportion * (1 - proportion)
    return sum_


#find the best split
def find_best_partition(data):
    best_gini = 999;
    
    found = False
    for i in range(len(data[0])-1):
        possible_conditions = list(set([row[i] for row in data]))
        for j in possible_conditions:
            (group_1, group_2) = partition(data, i, j)
            #if len(group_1) == 0 or len(group_2) == 0:
            #    continue
            
            if len(group_1) < 10 or len(group_2) < 10:
                continue
            
            
            gini = gini_score(group_1) * len(group_1)/len(data) + gini_score(group_2) * len(group_2)/len(data)
            if gini < best_gini:
                found = True
                best_gini = gini
                best_group_1 = group_1
                best_group_2 = group_2
                best_condition = j
                best_feature = i
    if found == False:
        best_condition = 1
        best_feature = 1
        best_group_1 = data
        best_group_2 = []
    return (best_group_1, best_group_2, best_condition, best_feature, best_gini)

def build_tree(data, max_depth, min_size):
    root = DecisionNode(data, max_depth)
    root.split()
    return root

In [4]:
class RFDecisionNode:
    
    #Initialization
        
    def __init__(self, data, max_height, n_features, height = 0,):
        self.data = data
        self.terminal = False
        self.max_height = max_height
        self.height = height
        self.n_features = n_features
    
    
    #Printing for debugging
    def dataToString(self):
        print(self.data)
        
    #Split node and create child nodes
    def split(self):
        if self.height == self.max_height or len(self.data) < 10:
            self.terminal = True
            self.prediction = max(set([row[-1] for row in self.data]), key = [row[-1] for row in self.data].count)
        else:
            (group_1, group_2, self.condition, self.feature, gi) = RF_find_best_partition(self.data, self.n_features)

            if len(group_1) == 0 or len(group_2) == 0:
                self.terminal = True
                self.prediction = max(set([row[-1] for row in self.data]), key = [row[-1] for row in self.data].count)
            else:
                self.left = RFDecisionNode(group_1, self.max_height, self.n_features , self.height + 1)
                self.right = RFDecisionNode(group_2, self.max_height, self.n_features, self.height + 1)
                self.left.split()
                self.right.split()

    
    def predict(self, test_data):
        if self.terminal == True:
            return self.prediction
        else:     
            if isinstance(test_data[self.feature], str):
                if test_data[self.feature] == self.condition:
                    return self.left.predict(test_data)
                else:
                    return self.right.predict(test_data)
            else:
                if test_data[self.feature] <= self.condition:
                    return self.left.predict(test_data)
                else:
                    return self.right.predict(test_data)
                

class RF:
    def __init__(self, data, max_height, n_features, n_trees, n_bagging):
        self.data = data
        self.max_height = max_height
        self.n_features = n_features
        self.n_trees = n_trees
        self.n_bagging = n_bagging
         
    def RF_build_tree(self, dat):
        root = RFDecisionNode(dat, self.max_height, self.n_features)
        root.split()
        return root
    
    def create_model(self):
        self.forest = [];
        for i in range(self.n_trees):
            chosen_input = random.sample(list(range(0, len(self.data))), self.n_bagging)
            bag = [];
            for j in chosen_input:
                bag.append(self.data[j])
            
            temp = self.RF_build_tree(bag)
            self.forest.append(temp)

    
    def predict(self, test_data):
        temp_result = [];
        for tree in self.forest:
            temp_result.append(tree.predict(test_data))
        return max(set(temp_result), key = temp_result.count)
            

In [5]:
#find the best split
def RF_find_best_partition(data, n_features):
    best_gini = 999;
    
    
    #Choose random features
    chosen_features = random.sample(list(range(0, len(data[0])-1)), n_features)
    
    found = False;
    
    for i in chosen_features:
        possible_conditions = list(set([row[i] for row in data]))
        
        for j in possible_conditions:
            (group_1, group_2) = partition(data, i, j)
            if len(group_1) < 10 or len(group_2) < 10:
                continue
            #if len(group_1) == 0 or len(group_2) == 0:
            #    continue
            
            gini = gini_score(group_1) * len(group_1)/len(data) + gini_score(group_2) * len(group_2)/len(data)
            if gini < best_gini:
                found = True;
                best_gini = gini
                best_group_1 = group_1
                best_group_2 = group_2
                best_condition = j
                best_feature = i
    if found == False:
        best_condition = 1
        best_feature = 1
        best_group_1 = data
        best_group_2 = []
    return (best_group_1, best_group_2, best_condition, best_feature, best_gini)


In [6]:
#Testing section#

In [7]:
ctg_data = np.loadtxt('ctg.csv', dtype = np.float64, delimiter = ',')

In [26]:
training_set = [];
test_set = [];

for row in ctg_data:
    if random.random() < 0.7:
        training_set.append(row)
    else:
        test_set.append(row)
        
training_set_no_label = np.array(np.delete(training_set, -1, axis = 1))
training_set_labels = np.array(training_set)[:,-1]

test_set_no_label = np.delete(test_set, -1, axis = 1)
test_set_labels = np.array(test_set)[:,-1]

In [27]:
#Training with Decision Tree
startTime = datetime.now()
dec_tree = build_tree(training_set, 5, 20)
print(datetime.now() - startTime)

0:00:12.012729


In [28]:

results = [];

In [29]:
for row in test_set_no_label:
    results.append(dec_tree.predict(row))

results = np.array(results)

count = 0
for i in range(len(results)):
    if results[i] == test_set_labels[i]:
        count += 1

accuracy = count/len(results)
print(accuracy)

0.8134556574923547


In [30]:
startTime = datetime.now()
randForest = RF(training_set, 5, 4, 50, 500)
randForest.create_model()
print(datetime.now() - startTime)

0:00:20.790052


In [31]:
results = []

for row in test_set_no_label:
    results.append(randForest.predict(row))

results = np.array(results)

count = 0
for i in range(len(results)):
    if results[i] == test_set_labels[i]:
        count += 1

accuracy = count/len(results)
print(accuracy)

0.7737003058103975


In [32]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, max_depth=5, max_features = 4)



clf.fit(training_set_no_label, training_set_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features=4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
results = []

for row in test_set_no_label:
    a = np.array(row)[np.newaxis]
    results.append(clf.predict(a))

results = np.array(results)

count = 0
for i in range(len(results)):
    if results[i] == test_set_labels[i]:
        count += 1

accuracy = count/len(results)
print(accuracy)

0.7966360856269113


In [34]:
print(training_set_labels)
training_set_labels = np.subtract(training_set_labels, 1)
print(training_set_labels)
test_set_labels = np.subtract(test_set_labels, 1)


[9. 6. 6. ... 5. 5. 1.]
[8. 5. 5. ... 4. 4. 0.]


In [35]:

forest = fastRerF(
    X=training_set_no_label, Y=training_set_labels, forestType="binnedBaseRerF", trees=500, numCores=cpu_count()
)

#forest.printParameters()

test_pred = fastPredict(test_set_no_label, forest)
temp = 1 - np.mean(test_pred != test_set_labels)
print(temp)

0.8685015290519877
