In [12]:
import pandas as pd
import numpy as np
import torch

In [3]:
#Data Input
dataset = pd.read_csv('Dataset/HeartDisease.csv')
dataset = dataset.values
dataset_train = dataset[:46000, :]
dataset_test = dataset[46000:, :]
X_train = dataset_train[:, 1:]
Y_train = dataset_train[:, 0]

In [13]:
#Node class
class Node():
    def __init__(self, index = None, threshold = None, left = None, right = None, info_gain = None, value = None):
        self.index = index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        self.value = value

#RandomForest class
class RandomForest():
    def __init__(self, min_samples, max_depth, nb_features):
        self.root = None
        self.min_samples = min_samples
        self.max_depth = max_depth
        self.pos = 0
        self.nb_features = nb_features

    #Return: Node
    def build_tree(self, dataset, pos):

        X = dataset[:, 1:]
        Y = dataset[:, 0]

        num_sample, num_feature = np.shape(X)

        if num_sample > self.min_samples and pos < self.max_depth:
            best_split = self.get_best_split(dataset, num_sample, num_feature)
            if best_split["info_gain"] > 0:
                self.pos = self.pos + 1
                left_subtree = self.build_tree(best_split["left_data"], self.pos)
                right_subtree= self.build_tree(best_split["right_data"], self.pos)

                return Node(index = best_split["index"], threshold = best_split["threshold"], left = left_subtree, right = right_subtree,info_gain = best_split["info_gain"])

        value = self.leaf_value(Y)
        return Node(value = value)

    #Return a dictionary
    def get_best_split(self, dataset, num_sample, num_feature):

        best_split = {}
        max_info_gain = -999999999999
        random_index = torch.randperm(num_feature)[:self.nb_features]
        random_index = random_index + 1
        for index in random_index:
            threshold_values = dataset[:, index]
            unique_values = np.unique(threshold_values)
            for value in unique_values:
                dataset_x, dataset_y = self.split(dataset, index, value)
                if len(dataset_x) > 0 and len(dataset_y) > 0:
                  info_gain = self.get_info_gain(dataset, dataset_x, dataset_y)
                  if info_gain > max_info_gain:
                      max_info_gain = info_gain
                      best_split["index"] = index
                      best_split["threshold"] = value
                      best_split["left_data"] = dataset_x
                      best_split["right_data"] = dataset_y
                      best_split["info_gain"] = info_gain
        return best_split

    def split(self, dataset, index, threshold):
        dataset_x = np.array([row for row in dataset if row[index] <= threshold])
        dataset_y = np.array([row for row in dataset if row[index] > threshold])
        return dataset_x, dataset_y

    def get_info_gain(self, dataset, dataset_x, dataset_y):
        weight_x = len(dataset_x)/len(dataset)
        weight_y = len(dataset_y)/len(dataset)
        return self.gini_index(dataset) - weight_x*self.gini_index(dataset_x) - weight_y*self.gini_index(dataset_y)

    def gini_index(self, dataset):
        data_label = dataset[:, 0]
        result = 0
        labels = np.unique(data_label)
        for label in labels:
            value = len(data_label[data_label == label ])/len(data_label)
            result += value**2

        return 1 - result

    def leaf_value(self, Y):
        Y = list(Y)
        return max(Y, key = Y.count)

    def fit(self, dataset):
        self.root = self.build_tree(dataset, self.pos)

    def make_predictions(self, X):
        return [self.predict(row) for row in X]

    def predict(self, X):
        cur_node = self.root
        while cur_node.value is None:
            if X[cur_node.index - 1] <= cur_node.threshold:
                cur_node = cur_node.left
            else:
                cur_node = cur_node.right
        return cur_node.value

    def print_tree(self, tree=None, indent=" "):

        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

In [14]:
#Boostraping Dataset
def bootstrap_data(dataset):
    num_sample, num_feature = np.shape(dataset)
    row_index = []
    for i in range (num_sample):
        a = np.random.randint(num_sample)
        row_index.append(a)
    new_dataset = [dataset[index, :] for index in row_index]
    new_dataset = np.stack(new_dataset)
    return new_dataset

In [18]:
#Model Training
rf_models = []
accuracy = []
for j in range(20):
    min_e = np.random.randint(3, 13)
    max_d = np.random.randint(5, 9)
    nb_features = 5
    model = RandomForest(min_e, max_d, nb_features)
    new_data = bootstrap_data(dataset_train)
    model.fit(new_data)
    rf_models.append(model)

85


In [None]:
#Model Testing
result = []
count = 0
X = dataset_test[:, 1:]
Y = dataset_test[:, 0]
for i in range(len(dataset_test)):
    for nb in range(20):
        result.append(rf_models[nb].predict(X[i]))
    prediction = max(result, key=result.count)
    if prediction == Y[i]:
        count += 1
print(int(count/len(dataset_test)*100))