Implementation of Decision Trees and Random Forest Algorithms

In [1]:
from collections import Counter

import numpy as np


Custom Functions 

In [2]:
# some custom functions to help with data handling

def load_csv(data_file_path, class_index=-1):
    """Load a headerless, all-numeric CSV file into NumPy arrays.

    Args:
        data_file_path: Path of the CSV file to read.
        class_index: Column that holds the class label. ``-1`` selects the
            last column and ``0`` the first column; any other value turns
            off the feature/label separation.

    Returns:
        ``(features, classes)`` when ``class_index`` is ``-1`` or ``0``,
        otherwise the complete 2-D array of parsed values.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the handle even when reading fails.
    with open(data_file_path, 'r') as handle:
        stripped_lines = [line.strip() for line in handle]

    # Blank and whitespace-only lines (e.g. a trailing newline) carry no data.
    out = np.array(
        [[float(field) for field in line.split(',')]
         for line in stripped_lines if line]
    )

    if class_index == -1:
        return out[:, :-1], out[:, -1]

    if class_index == 0:
        return out[:, 1:], out[:, 0]

    return out
    

def __info__(x):
    """Normalise a labelled dataset and expose its width and last column.

    Args:
        x: Either a 2-D ``numpy.ndarray`` or a list/tuple of rows. The last
            column is returned separately as ``y``.

    Returns:
        A tuple ``(data, n_columns, y)`` where ``data`` is the dataset as an
        array (the very same object when ``x`` already is an ndarray),
        ``n_columns`` is ``data.shape[1]`` -- note that this count includes
        the last column -- and ``y`` is ``data[:, -1]``.

    Raises:
        TypeError: If ``x`` is not a list, tuple or ndarray.
    """
    if isinstance(x, np.ndarray):
        data = x
    elif isinstance(x, (list, tuple)):
        # One row per top-level element; the remaining axes are flattened.
        data = np.array(x).reshape(len(x), -1)
    else:
        # Previously unsupported inputs fell through and returned None,
        # which only failed later when the caller unpacked the result.
        raise TypeError(
            "expected a list, tuple or numpy.ndarray, got "
            + type(x).__name__
        )

    return data, data.shape[1], data[:, -1]
    
def label(x):
    """Return the most frequent value in ``x``.

    Ties are resolved by ``collections.Counter.most_common``, i.e. in favour
    of the value that was encountered first.

    Args:
        x: Iterable of hashable values (for example a column of class labels).

    Returns:
        The value that occurs most often.

    Raises:
        IndexError: If ``x`` is empty.
    """
    frequencies = Counter(x)

    # most_common(1) yields a single (value, count) pair.
    majority_value, _ = frequencies.most_common(1)[0]

    return majority_value



def generate_k_folds(dataset, k):
    """Partition ``dataset`` into ``k`` contiguous folds.

    Rows keep their original order. When the number of rows is not a multiple
    of ``k`` the leading folds receive one extra row each, so no row is
    dropped. Folds that would be empty (``k`` larger than the number of rows)
    are omitted.

    Args:
        dataset: Array-like whose first axis enumerates the samples.
        k: Number of folds requested; must be positive.

    Returns:
        A list of folds, each a tuple of the rows belonging to that fold.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive number of folds")

    data = np.asarray(dataset)

    folds = []

    # array_split tolerates uneven division, unlike a fixed fold size.
    for chunk in np.array_split(data, int(k)):

        if chunk.shape[0] == 0:
            continue

        folds.append(tuple(chunk))

    return folds

In [3]:
# compute gini impurity and gini gain

def gini_impurity(class_vector):
    """Compute ``1 - sum(share ** 2)`` over the distinct values of a vector.

    Args:
        class_vector: Sequence or array of labels.

    Returns:
        One minus the summed squared shares, where each share is the number
        of entries equal to a distinct value divided by ``len(class_vector)``.
    """
    distinct_values = np.unique(class_vector)

    n_total = len(class_vector)

    values = np.array(class_vector)

    # Squared share of every distinct value.
    squared_shares = [
        ((values == value).sum() / n_total) ** 2
        for value in distinct_values
    ]

    return 1 - np.sum(squared_shares)


def gini_gain(previous_classes, current_classes):
    """Return the reduction in Gini impurity achieved by a split.

    The gain is the impurity of the parent minus the size-weighted mean
    impurity of the partitions, so a positive value means the split made the
    labels purer.

    Args:
        previous_classes: Label vector before the split.
        current_classes: Either a sequence of label vectors (one per
            partition) or a single flat label vector, which is treated as
            one partition.

    Returns:
        ``gini(previous) - sum(len(p) / total * gini(p))`` over the non-empty
        partitions ``p``; ``0.0`` when the partitions hold no samples.
    """
    previous_gini = gini_impurity(previous_classes)

    # A flat vector (scalar elements) is a single partition.
    if len(current_classes) > 0 and np.ndim(current_classes[0]) == 0:
        partitions = [current_classes]
    else:
        partitions = [part for part in current_classes if len(part) > 0]

    total = sum(len(part) for part in partitions)

    if total == 0:
        return 0.0

    # Each partition contributes in proportion to the samples it holds.
    remaining_gini = sum(
        len(part) / total * gini_impurity(part) for part in partitions
    )

    return previous_gini - remaining_gini

In [4]:
# splitting data

def split(x, i, t):
    """Divide the rows of ``x`` by comparing column ``i`` against ``t``.

    Args:
        x: Iterable of indexable rows.
        i: Index of the column to compare.
        t: Threshold value.

    Returns:
        ``(left, right)`` arrays: ``left`` collects the rows whose ``i``-th
        entry is ``<= t`` and ``right`` the rows whose entry is ``> t``.
    """
    at_or_below = []
    above = []

    for row in x:
        if row[i] <= t:
            at_or_below.append(row)
        elif row[i] > t:
            above.append(row)

    return np.array(at_or_below), np.array(above)

In [5]:
# finding best split

def best_split(x):
    """Find the feature column and threshold with the largest Gini gain.

    Every distinct value of every feature column is tried as a threshold
    (rows with ``value <= threshold`` go left). The last column of ``x`` is
    the class label and is never used as a split candidate.

    Args:
        x: Labelled dataset accepted by ``__info__``; last column is the
            class label.

    Returns:
        ``(best_col, best_col_val, best_gain)``. ``best_gain`` is ``0`` (with
        column and value ``0``) when no split reduces the impurity.
    """
    data, _, labels = __info__(x)

    n_rows = len(data)

    # __info__ counts the label column too, so derive the feature count here.
    n_feature_cols = data.shape[1] - 1

    parent_impurity = gini_impurity(labels)

    best_gain = 0

    best_col = 0

    best_col_val = 0

    for col in range(n_feature_cols):

        for col_val in np.unique(data[:, col]):

            t, f = split(data, col, col_val)

            # A split that leaves one side empty separates nothing.
            if len(t) == 0 or len(f) == 0:
                continue

            # Impurity of the two children, weighted by how many rows each
            # one holds; only the label column of each side is measured.
            children_impurity = (
                len(t) * gini_impurity(t[:, -1])
                + len(f) * gini_impurity(f[:, -1])
            ) / n_rows

            gain = parent_impurity - children_impurity

            if gain > best_gain:

                best_col = col

                best_gain = gain

                best_col_val = col_val

    return best_col, best_col_val, best_gain



In [6]:
# build decision tree

def build_decision_tree(dat=None):
    """Recursively build a decision tree from a labelled dataset.

    A node becomes a leaf carrying the majority label when ``best_split``
    reports zero gain; otherwise the data is divided at the best threshold
    and both halves are built recursively.

    Args:
        dat: Labelled dataset whose last column is the class label. When
            omitted, a module-level variable named ``dat`` is used so the
            zero-argument call form keeps working.

    Returns:
        The root ``DecisionNode`` of the constructed tree.

    Raises:
        NameError: If ``dat`` is omitted and no module-level ``dat`` exists.
    """
    if dat is None:
        try:
            dat = globals()["dat"]
        except KeyError:
            raise NameError("name 'dat' is not defined") from None

    column, threshold, gain = best_split(dat)

    if gain == 0:

        # Nothing left to gain from splitting: predict the majority class.
        _, _, labels = __info__(dat)

        return DecisionNode(None, None, None, class_label=label(labels))

    below, above = split(dat, column, threshold)

    left = build_decision_tree(below)

    right = build_decision_tree(above)

    # column/threshold are locals of this call, so every node's test closes
    # over its own pair. True routes a sample to the left child.
    func = lambda feature: feature[column] <= threshold

    return DecisionNode(left, right, func, None)

The Algorithms

In [7]:
# Decision Node

class DecisionNode:
    """A node of a binary decision tree.

    A node is a leaf when ``class_label`` is not ``None``; otherwise it
    routes samples to one of its two children using ``decision_function``.
    """

    def __init__(self, left, right, decision_function, class_label=None):
        """Create a node.

        Args:
            left: Child visited when ``decision_function`` returns a truthy
                value (unused for leaves).
            right: Child visited when ``decision_function`` returns a falsy
                value (unused for leaves).
            decision_function: Callable taking a feature vector and returning
                a truth value (unused for leaves).
            class_label: Label returned by a leaf; ``None`` for inner nodes.
        """
        # Keep the children that were passed in; decide() descends into them.
        self.left = left
        self.right = right
        self.decision_function = decision_function
        self.class_label = class_label

    def decide(self, feature):
        """Classify one feature vector by walking down from this node.

        Args:
            feature: The feature vector handed to each decision function.

        Returns:
            The ``class_label`` of the leaf that is reached.
        """
        # ``is not None`` so that falsy labels such as 0 still mark a leaf.
        if self.class_label is not None:
            return self.class_label

        if self.decision_function(feature):
            return self.left.decide(feature)

        return self.right.decide(feature)


In [8]:
# Decision Tree  

class DecisionTree:
    """Decision tree classifier grown greedily on Gini gain."""

    def __init__(self, depth_limit=22):
        """Create an unfitted tree.

        Args:
            depth_limit: Maximum number of splits on any root-to-leaf path.
        """
        self.root = None
        self.depth_limit = depth_limit

    def fit(self, features, classes):
        """Build the tree from training data.

        Args:
            features: Array-like with one row of feature values per sample.
            classes: Array-like with one class label per sample.
        """
        self.root = self.__build_tree__(features, classes)

    def __build_tree__(self, features, classes, depth=0):
        """Recursively build the subtree for the given samples.

        Args:
            features: Feature rows of the samples reaching this node.
            classes: Class labels of those samples.
            depth: Depth of the node being built (the root is at 0).

        Returns:
            The ``DecisionNode`` at the root of the built subtree.
        """
        classes = np.asarray(classes).reshape(-1, 1)

        n_rows = classes.shape[0]

        features = np.asarray(features).reshape(n_rows, -1)

        # best_split/split operate on rows whose last column is the label.
        x = np.concatenate((features, classes), axis=1)

        column, threshold, gain = best_split(x)

        # Stop when no split helps or the depth budget is exhausted.
        if gain == 0 or depth >= self.depth_limit:

            return DecisionNode(None, None, None, class_label=label(x[:, -1]))

        below, above = split(x, column, threshold)

        left = self.__build_tree__(below[:, :-1], below[:, -1], depth + 1)

        right = self.__build_tree__(above[:, :-1], above[:, -1], depth + 1)

        # column/threshold are locals of this call, so each node's test
        # closes over its own pair. True routes a sample to the left child.
        func = lambda feature: feature[column] <= threshold

        return DecisionNode(left, right, func, None)

    def classify(self, features):
        """Predict a class label for every row of ``features``.

        Args:
            features: Iterable of feature vectors.

        Returns:
            A list with one predicted label per row, in input order.

        Raises:
            ValueError: If the tree has not been fitted yet.
        """
        if self.root is None:
            raise ValueError("fit must be called before classify")

        return [self.root.decide(row) for row in features]


In [9]:
# Random Forest

class RandomForest:
    """Random forest classifier: bagged decision trees with majority voting."""

    def __init__(self, num_trees=200, depth_limit=5, example_subsample_rate=.1,
                 attr_subsample_rate=.3):
        """Create an unfitted forest.

        Args:
            num_trees: Number of trees to grow.
            depth_limit: Depth limit handed to every ``DecisionTree``.
            example_subsample_rate: Fraction of the training rows drawn
                (with replacement) for each tree.
            attr_subsample_rate: Fraction of the feature columns drawn
                (without replacement) for each tree.
        """
        self.trees = []
        # attr_indices[i] holds the feature columns tree i was trained on.
        self.attr_indices = []
        self.num_trees = num_trees
        self.depth_limit = depth_limit
        self.example_subsample_rate = example_subsample_rate
        self.attr_subsample_rate = attr_subsample_rate

    def fit(self, features, classes):
        """Grow ``num_trees`` trees, each on a random sample of rows/columns.

        Args:
            features: Array-like with one row of feature values per sample.
            classes: Array-like with one class label per sample.

        Raises:
            ValueError: If the training set is empty.
        """
        classes = np.asarray(classes).reshape(-1)

        n_samples = classes.shape[0]

        if n_samples == 0:
            raise ValueError("cannot fit a forest on an empty training set")

        features = np.asarray(features).reshape(n_samples, -1)

        n_attrs = features.shape[1]

        # Always keep at least one row and one column per tree, and never
        # ask for more distinct columns than exist.
        sample_size = max(1, int(n_samples * self.example_subsample_rate))
        attr_size = min(n_attrs, max(1, int(n_attrs * self.attr_subsample_rate)))

        self.trees = []
        self.attr_indices = []

        for _ in range(self.num_trees):

            rows = np.random.choice(n_samples, size=sample_size, replace=True)

            cols = np.sort(np.random.choice(n_attrs, size=attr_size, replace=False))

            tree = DecisionTree(depth_limit=self.depth_limit)

            tree.fit(features[rows][:, cols], classes[rows])

            self.trees.append(tree)
            self.attr_indices.append(cols)

    def classify(self, features):
        """Predict a label per row by majority vote across the trees.

        Args:
            features: Array-like of feature rows with the same column layout
                as the training data.

        Returns:
            A list with one label per row; empty when the forest holds no
            trees.
        """
        features = np.atleast_2d(np.asarray(features))

        # Every tree only sees the columns it was trained on.
        per_tree = [
            tree.classify(features[:, cols])
            for tree, cols in zip(self.trees, self.attr_indices)
        ]

        # zip(*per_tree) regroups the predictions sample by sample.
        return [
            Counter(sample_votes).most_common(1)[0][0]
            for sample_votes in zip(*per_tree)
        ]