In [1]:
import numpy as np

In [2]:
class Node:
    """A single node of a binary decision tree.

    Internal nodes carry the split description (``feature``, ``threshold``,
    ``left``, ``right``) and leave ``value`` as ``None``; leaf nodes carry
    only ``value``.
    """

    def __init__(
        self,
        feature=None,
        threshold=None,
        left=None,
        right=None,
        entropy=None,
        information_gain=None,
        value=None,
    ):
        # Split description: column index of the feature and the cut-off
        # at which the data is divided.
        self.feature, self.threshold = feature, threshold
        # Child nodes on either side of the split.
        self.left, self.right = left, right
        # Entropy of this node before the split, and the information gain
        # obtained by splitting it.
        self.entropy, self.information_gain = entropy, information_gain
        # Leaf value; stays None for an intermediate tree node.
        self.value = value


class DecisionTree:
    """Binary classification tree grown greedily on information gain.

    Parameters
    ----------
    min_sample : int
        A node is only considered for splitting when it holds strictly more
        than ``min_sample`` rows.
    max_depth : int
        A node is only considered for splitting when its depth (root = 0) is
        strictly less than ``max_depth``.
    criterion : str
        Impurity measure used to score splits: ``"entropy"`` or ``"gini"``.
    """

    def __init__(self, min_sample=2, max_depth=5, criterion="entropy"):
        self.min_sample = min_sample
        self.max_depth = max_depth
        self.root_node = None
        self.criterion = criterion

    # two cost functions, entropy & gini impurity, to measure the purity of each tree node
    @staticmethod
    def entropy(array):
        """Return the Shannon entropy (base 2) of the labels in ``array``.

        An empty ``array`` yields ``0``.
        """
        _, counts = np.unique(array, return_counts=True)
        n = len(array)
        total = 0
        for label_count in counts:
            pct = label_count / n
            total += -pct * np.log2(pct)
        return total

    @staticmethod
    def gini_impurity(array):
        """Return the Gini impurity of the labels in ``array``.

        An empty ``array`` yields ``1`` (nothing is subtracted from 1).
        """
        _, counts = np.unique(array, return_counts=True)
        n = len(array)
        return 1 - sum([(label_count / n) ** 2 for label_count in counts])

    def get_information_gain(self, parent, left_labels, right_labels):
        """Return the impurity reduction achieved by a split.

        The gain is the parent's impurity minus the size-weighted impurity
        of the two children, using the measure selected by ``criterion``.

        Raises
        ------
        ValueError
            If ``self.criterion`` is neither ``"entropy"`` nor ``"gini"``.
        """
        if self.criterion == 'entropy':
            impurity = self.entropy
        elif self.criterion == 'gini':
            impurity = self.gini_impurity
        else:
            # Fail loudly here instead of returning None, which would only
            # surface later as an obscure comparison error.
            raise ValueError(
                "criterion must be 'entropy' or 'gini', got %r" % (self.criterion,)
            )
        n = len(parent)
        children_impurity = (
            impurity(left_labels) * len(left_labels) / n
            + impurity(right_labels) * len(right_labels) / n
        )
        return impurity(parent) - children_impurity

    @staticmethod
    def split(X, y, column_index, threshold):
        """Partition ``X`` and ``y`` on one feature column.

        Rows whose value in ``column_index`` is ``< threshold`` go left, rows
        whose value is ``>= threshold`` go right. A row that satisfies
        neither comparison (e.g. NaN) is placed in neither side.

        Returns
        -------
        tuple
            ``(X_left, X_right, y_left, y_right)``.
        """
        column = X[:, column_index]
        left_mask = column < threshold
        right_mask = column >= threshold
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def find_best_split(self, X, y):
        """Evaluate every (column, observed value) split and keep the best.

        Candidate thresholds are the values present in each column, visited
        in row order; on ties the first candidate encountered wins.

        Returns
        -------
        dict
            Keys ``feature_index``, ``threshold``, ``left``, ``left_labels``,
            ``right``, ``right_labels`` and ``information_gain`` for the best
            split, or an empty dict when no candidate was scored.
        """
        n_row, n_col = X.shape
        best_information_gain = -np.inf
        best_split_parameters = {}
        for col in range(n_col):
            # A repeated threshold yields the same partition and the same
            # gain, so it can never beat the first occurrence; skip it.
            seen_thresholds = set()
            for row in range(n_row):
                threshold = X[row, col]
                if threshold in seen_thresholds:
                    continue
                seen_thresholds.add(threshold)
                left, right, left_labels, right_labels = self.split(
                    X, y, column_index=col, threshold=threshold
                )
                information_gain = self.get_information_gain(y, left_labels, right_labels)
                if information_gain > best_information_gain:
                    best_information_gain = information_gain
                    best_split_parameters = {
                        'feature_index': col,
                        'threshold': threshold,
                        'left': left,
                        'left_labels': left_labels,
                        'right': right,
                        'right_labels': right_labels,
                        'information_gain': information_gain,
                    }
        return best_split_parameters

    def leaf_node(self, y):
        """Return a leaf ``Node`` holding the most frequent label in ``y``.

        Ties go to the smallest label. Works for any label type that
        ``np.unique`` can sort, not only non-negative integers.
        """
        labels, counts = np.unique(y, return_counts=True)
        most_common_label = labels[np.argmax(counts)]
        return Node(value=most_common_label)

    def grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for ``(X, y)`` and return its root."""
        n_row, _ = X.shape
        # Stopping rules: too few rows or maximum depth reached.
        if n_row <= self.min_sample or depth >= self.max_depth:
            return self.leaf_node(y)
        # A pure node cannot be improved; every descendant would predict the
        # same label, so stop here instead of growing redundant children.
        if np.unique(y).size <= 1:
            return self.leaf_node(y)

        best_split = self.find_best_split(X, y)
        # No usable split: nothing was scored, or one side would be empty.
        if (
            not best_split
            or len(best_split['left']) == 0
            or len(best_split['right']) == 0
        ):
            return self.leaf_node(y)

        left_node = self.grow_tree(
            best_split['left'], best_split['left_labels'], depth=depth + 1
        )
        right_node = self.grow_tree(
            best_split['right'], best_split['right_labels'], depth=depth + 1
        )
        return Node(
            feature=best_split['feature_index'],
            threshold=best_split['threshold'],
            left=left_node,
            right=right_node,
            entropy=self.entropy(y),
            information_gain=best_split['information_gain'],
        )

    def fit(self, X, y):
        """Build the decision tree from ``X`` (2D array) and labels ``y``."""
        self.root_node = self.grow_tree(X, y)

    def predict_single_instance(self, x, tree=None):
        """Predict the label of one sample ``x`` (1D vector).

        Traversal starts at ``tree`` when given, otherwise at the fitted
        root. Returns ``None`` if a feature value compares as neither
        ``>=`` nor ``<`` the node's threshold (e.g. NaN).
        """
        node = self.root_node if tree is None else tree
        # Internal nodes have value None; walk down until a leaf is reached.
        while node.value is None:
            column_value = x[node.feature]
            if column_value >= node.threshold:
                node = node.right
            elif column_value < node.threshold:
                node = node.left
            else:
                return None
        return node.value

    def predict(self, X):
        """Predict labels for every row of ``X`` (2D array); returns a list."""
        return [self.predict_single_instance(row) for row in X]
            
            

In [3]:
class RandomForest:
    """Ensemble of decision trees, each trained on a bootstrap sample.

    Parameters
    ----------
    n_trees : int
        Number of trees to train.
    min_sample : int
        Passed to every tree as its ``min_sample`` setting.
    max_depth : int
        Passed to every tree as its ``max_depth`` setting.
    """

    def __init__(self, n_trees=25, min_sample=2, max_depth=5):
        self.n_trees = n_trees
        self.min_sample = min_sample
        self.max_depth = max_depth
        self.trees = []

    def get_sample(self, X, y):
        """Return a bootstrap sample of ``(X, y)``.

        Draws as many row indices as ``X`` has rows, with replacement, and
        applies the same indices to both arrays so rows stay paired with
        their labels.
        """
        n_row = X.shape[0]
        sample_index = np.random.choice(a=n_row, size=n_row, replace=True)
        return X[sample_index], y[sample_index]

    def fit(self, X, y):
        """Train ``n_trees`` decision trees, each on its own bootstrap sample.

        Any previously fitted trees are discarded.
        """
        self.trees = []
        for _ in range(self.n_trees):
            X_sample, y_sample = self.get_sample(X, y)
            tree = DecisionTree(min_sample=self.min_sample, max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        """Predict labels for every row of ``X`` by majority vote of the trees.

        Ties go to the smallest label. Returns a list with one label per row.

        Raises
        ------
        RuntimeError
            If called before any tree has been fitted.
        """
        if not self.trees:
            raise RuntimeError("RandomForest.predict called before fit")
        # Shape (n_trees, n_rows); transpose to iterate the votes per row.
        votes = np.array([tree.predict(X) for tree in self.trees])
        result = []
        for row_votes in votes.T:
            labels, counts = np.unique(row_votes, return_counts=True)
            result.append(labels[np.argmax(counts)])
        return result
        
        
    
    

In [15]:
# Scratch check: sampling without replacement cannot draw more items (size=6)
# than the population holds (a=5), so this call raises the ValueError shown below.
np.random.choice(a=5, size=6, replace=False)


ValueError: Cannot take a larger sample than population when 'replace=False'