In [171]:
# STEP 1 — RANDOM FOREST ALGORITHM:
# a. Split data such that information gain (decrease in entropy after splitting) is high
# b. Split the data using each condition, check the gain returned
# c. Condition with highest gain will be used to make the first split
# d. Keep splitting nodes UNTIL entropy is 0
# e. Classify unknown data point with multiple trees

# Toy training set: each row is [feature_0, feature_1, class_label].
# It holds three distinct samples, each listed twice.
practice_training_dataset = [
    [0, 3, 0],
    [1, 3, 1],
    [2, 1, 2],
    [0, 3, 0],
    [1, 3, 1],
    [2, 1, 2],
]

# Two feature values with no trailing label: a sample that still needs a class.
practice_unclassified_instance = [2, 1]

# Entropy (simplified here): the number of different classes in a node, so a pure node has a value of 1
# Information gain: Difference between count of different classes in the current node 
#                   and the count of different classes in the previous node

# Process:
# 1. Get the entropy value of the root node
# 2. Iterate through the dataset and split it into two arrays based on a condition
# 3. Get the entropy value of the following nodes, find the difference between the 
#    root node entropy and the subsequent nodes
# 4. Continue to split impure nodes (those holding more than one class) until only leaf nodes remain

import random

# Creates a random forest class
class RandomForest:
    """Grows one randomly split decision tree from a list of labelled rows.

    Every row of ``dataset`` is a sequence whose first ``number_of_features``
    entries are feature values and whose last entry is the class label.
    Node impurity (called "entropy" in this file) is simply the number of
    distinct labels in the node: a pure node scores 1, an empty node scores 0.

    Typical use: call ``set_random_features()`` to draw the first split, then
    ``create_nodes()`` to grow the tree; a fresh feature/value pair is drawn
    after every split.

    Attributes:
        dataset: rows handed to the constructor (never copied or mutated).
        number_of_features: how many leading columns may be used for splitting.
        verbose: when true, ``create_node`` prints the node list after each append.
        random_seed: seed (0-100) behind the most recent feature/value draw.
        feature_instances: ``[feature_index, unique_values]`` pairs, one per feature.
        features: feature indices that were not picked by the latest draw.
        feature: index of the feature currently selected for splitting.
        feature_to_split: value of ``feature`` that defines the split.
        nodes: ``[entropy, rows]`` pairs in creation order; the root comes first.
        current_node: rows of the node that was created most recently.
    """

    # Initializes the attributes of the RandomForest class.
    def __init__(self, dataset, number_of_features, verbose=True):
        """Store the dataset and reset all split/tree state.

        Args:
            dataset: list of rows shaped ``[feature_0, ..., label]``.
            number_of_features: number of leading columns usable for splits.
            verbose: print ``self.nodes`` each time a node is added (default
                ``True``, which matches the historical behaviour).
        """
        self.dataset = dataset
        self.number_of_features = number_of_features
        self.verbose = verbose
        self.random_seed = 0

        # Retained for backward compatibility only: this used to be a scratch
        # list for set_random_features and is no longer written to.
        self.instances = []
        self.feature_instances = []
        self.features = []

        self.feature = 0
        self.feature_to_split = 0

        self.nodes = []
        self.current_node = []

    @staticmethod
    def _unique_in_order(values):
        """Return the distinct items of ``values`` in first-seen order."""
        # List membership (not a set) so unhashable values keep working.
        seen = []
        for value in values:
            if value not in seen:
                seen.append(value)
        return seen

    # Populates self.feature_instances with arrays of unique instances of each feature in the dataset
    def set_random_features(self):
        """Randomly pick the feature and the feature value used for the next split.

        ``feature_instances`` and ``features`` are rebuilt from scratch on every
        call, so repeated calls no longer make them grow. One integer in
        ``[0, 100]`` is drawn from the module-level ``random`` generator and
        stored in ``random_seed``; the actual choices come from a private
        ``random.Random`` seeded with it, so the process-wide generator is not
        reseeded.

        Raises:
            IndexError: if ``number_of_features`` is 0 or the chosen feature has
                no values (empty dataset), because there is nothing to choose.
        """
        self.feature_instances = [
            [feature, self._unique_in_order(row[feature] for row in self.dataset)]
            for feature in range(self.number_of_features)
        ]

        self.random_seed = random.randrange(0, 101)
        draw = random.Random(self.random_seed)

        # Choose the feature directly; the old code fed a feature *value* to
        # list.pop(), which expects an *index*.
        self.feature = draw.choice(range(self.number_of_features))
        self.features = [
            feature
            for feature in range(self.number_of_features)
            if feature != self.feature
        ]
        self.feature_to_split = draw.choice(self.feature_instances[self.feature][1])

    # Evaluates entropy by calculating the amount of unique classes in the node
    # and adds node to a list of all nodes
    def create_node(self, current_node):
        """Score ``current_node`` and append ``[entropy, current_node]`` to ``self.nodes``.

        Args:
            current_node: list of rows; the last entry of each row is its label.
        """
        entropy = len(self._unique_in_order(row[-1] for row in current_node))
        self.nodes.append([entropy, current_node])
        if self.verbose:
            print(self.nodes)

    def _usable_splits(self, rows):
        """List every (feature, value) pair that leaves at least one row on each side."""
        splits = []
        for feature in range(self.number_of_features):
            values = self._unique_in_order(row[feature] for row in rows)
            # With two or more distinct values, "== value" always matches some
            # rows and misses others, so neither child can be empty.
            if len(values) > 1:
                splits.extend((feature, value) for value in values)
        return splits

    # Splits previous node data into two other nodes based on a condition
    def create_nodes(self):
        """Build the tree in ``self.nodes``, starting from the full dataset.

        The node list is rebuilt on every call. Nodes are processed in creation
        order; each node holding more than one label is split into the rows
        where ``row[feature] == feature_to_split`` and the remaining rows. The
        currently drawn feature/value is used when it actually separates the
        node; otherwise a separating pair is picked at random. A node whose
        rows all share the same feature values cannot be separated and is kept
        as an impure leaf, which guarantees termination.
        """
        # Creates the parent node
        self.nodes = []
        self.current_node = self.dataset
        self.create_node(self.current_node)

        index = 0
        # self.nodes grows while we walk it, so iterate by index.
        while index < len(self.nodes):
            entropy, rows = self.nodes[index]
            index += 1

            if entropy <= 1:
                continue  # pure (or empty) node: already a leaf

            candidates = self._usable_splits(rows)
            if not candidates:
                continue  # identical feature values, different labels: leaf

            if (self.feature, self.feature_to_split) not in candidates:
                # The drawn pair would put every row on one side and merely
                # duplicate this node, so fall back to one that separates it.
                self.feature, self.feature_to_split = random.choice(candidates)

            matching = [row for row in rows if row[self.feature] == self.feature_to_split]
            remaining = [row for row in rows if row[self.feature] != self.feature_to_split]
            for child in (matching, remaining):
                self.current_node = child
                self.create_node(child)

            # Draw a new feature/value pair for the next node to be split.
            self.set_random_features()


In [178]:
# Build the model on the practice data (2 feature columns) and draw the first split.
test_rf = RandomForest(practice_training_dataset, 2)
test_rf.set_random_features()

# Show, one per line: the unique values per feature, the drawn feature index,
# and the drawn split value.
print(
    test_rf.feature_instances,
    test_rf.feature,
    test_rf.feature_to_split,
    sep="\n",
)

# Grow the tree, then show the final list of [entropy, rows] nodes.
test_rf.create_nodes()
print(test_rf.nodes)

[[0, [0, 1, 2]], [1, [3, 1]]]
0
1
[[3, [[0, 3, 0], [1, 3, 1], [2, 1, 2], [0, 3, 0], [1, 3, 1], [2, 1, 2]]]]
[[3, [[0, 3, 0], [1, 3, 1], [2, 1, 2], [0, 3, 0], [1, 3, 1], [2, 1, 2]]], [1, [[1, 3, 1], [1, 3, 1]]]]
[[3, [[0, 3, 0], [1, 3, 1], [2, 1, 2], [0, 3, 0], [1, 3, 1], [2, 1, 2]]], [1, [[1, 3, 1], [1, 3, 1]]], [2, [[0, 3, 0], [2, 1, 2], [0, 3, 0], [2, 1, 2]]]]
[[3, [[0, 3, 0], [1, 3, 1], [2, 1, 2], [0, 3, 0], [1, 3, 1], [2, 1, 2]]], [1, [[1, 3, 1], [1, 3, 1]]], [2, [[0, 3, 0], [2, 1, 2], [0, 3, 0], [2, 1, 2]]], [1, [[0, 3, 0], [0, 3, 0]]]]
[[3, [[0, 3, 0], [1, 3, 1], [2, 1, 2], [0, 3, 0], [1, 3, 1], [2, 1, 2]]], [1, [[1, 3, 1], [1, 3, 1]]], [2, [[0, 3, 0], [2, 1, 2], [0, 3, 0], [2, 1, 2]]], [1, [[0, 3, 0], [0, 3, 0]]], [1, [[2, 1, 2], [2, 1, 2]]]]
[[3, [[0, 3, 0], [1, 3, 1], [2, 1, 2], [0, 3, 0], [1, 3, 1], [2, 1, 2]]], [1, [[1, 3, 1], [1, 3, 1]]], [2, [[0, 3, 0], [2, 1, 2], [0, 3, 0], [2, 1, 2]]], [1, [[0, 3, 0], [0, 3, 0]]], [1, [[2, 1, 2], [2, 1, 2]]]]
