In [124]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [125]:
# Load the Iris measurements into a DataFrame and append the class label as
# the last column, named 'target'.
iris = load_iris()
data = pd.DataFrame(data = iris.data, columns = iris.feature_names)
data['target'] = pd.Series(iris.target)

# Features are every column except the trailing 'target'. Slicing columns with
# [:, :-1] keeps the rows in their original order, so X stays aligned with y
# and the label itself is never handed to the model as a feature.
X = data.iloc[:, :-1].values
# Labels as a column vector (n_samples, 1) so they can be concatenated with X.
y = data.iloc[:, -1].values.reshape(-1, 1)

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)


In [126]:
class Node:
    """One node of a binary decision tree.

    The constructor simply stores its arguments; every field defaults to
    ``None``. Two groups of fields exist:

    * decision-node fields: ``feature_index`` and ``threshold`` describe the
      test applied at this node, ``left`` / ``right`` hold the child nodes,
      and ``info_gain`` records the gain of the split;
    * leaf-node field: ``value`` holds the prediction stored at a leaf.
    """

    def __init__(self, feature_index = None, threshold = None,
                 left = None, right = None, info_gain = None, value = None):
        # Fields describing a decision (internal) node.
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        self.info_gain = info_gain

        # Field describing a leaf node.
        self.value = value

In [127]:
class DecisionTree():
    """Binary decision-tree classifier grown greedily by impurity reduction.

    Training data is handled internally as one 2-D array whose last column is
    the class label and whose other columns are numeric features. Each
    decision node routes rows with ``feature <= threshold`` to its left child
    and rows with ``feature > threshold`` to its right child.
    """

    # Impurity measures understood by information_gain().
    _CRITERIA = ('gini', 'entropy')

    def __init__(self, min_samples_split = 2, max_depth = 2, criterion = 'gini'):
        """Configure the tree; nothing is built until fit() is called.

        Args:
            min_samples_split: a node holding fewer rows than this is turned
                into a leaf instead of being split.
            max_depth: deepest level at which a node may still be split. The
                root is level 0, so leaves can sit at level ``max_depth + 1``.
            criterion: impurity measure used to score candidate splits,
                either 'gini' (default) or 'entropy'.

        Raises:
            ValueError: if ``criterion`` is not a supported measure.
        """
        if criterion not in self._CRITERIA:
            raise ValueError(
                "criterion must be one of %r, got %r" % (self._CRITERIA, criterion))

        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.criterion = criterion

    def build_tree(self, dataset, cur_depth):
        """Recursively grow the (sub)tree for ``dataset``.

        Args:
            dataset: 2-D array, features in all but the last column and the
                label in the last column.
            cur_depth: level of the node being built (root is 0).

        Returns:
            A Node: a decision node when a split with positive gain exists
            and the stopping rules allow it, otherwise a leaf.
        """
        y = dataset[:, -1]
        n_samples = np.shape(dataset)[0]

        if n_samples >= self.min_samples_split and cur_depth <= self.max_depth:
            best_split = self.get_best_split(dataset)

            # get_best_split() returns {} when no threshold separates the
            # rows (e.g. identical feature rows with mixed labels); treat
            # that as "no gain" so the node becomes a leaf.
            if best_split.get('info_gain', 0) > 0:
                left_sub_tree = self.build_tree(best_split['dataset_left'], cur_depth + 1)
                right_sub_tree = self.build_tree(best_split['dataset_right'], cur_depth + 1)
                return Node(best_split['feature_index'], best_split['threshold'],
                            left_sub_tree, right_sub_tree, best_split['info_gain'])

        return Node(value = self.calculate_leaf_value(y))

    def get_best_split(self, dataset):
        """Exhaustively search every (feature, threshold) pair for the best split.

        Candidate thresholds are the distinct values of each feature column.
        Splits that leave one side empty are ignored.

        Returns:
            dict with keys 'threshold', 'feature_index', 'info_gain',
            'dataset_left' and 'dataset_right' for the highest-gain split, or
            an empty dict when no candidate produces two non-empty sides.
        """
        y = dataset[:, -1]
        n_features = np.shape(dataset)[1] - 1

        best_split = {}
        max_info_gain = -float('inf')
        for feature_index in range(n_features):
            for threshold in np.unique(dataset[:, feature_index]):
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                if len(dataset_left) == 0 or len(dataset_right) == 0:
                    continue

                cur_info_gain = self.information_gain(
                    y, dataset_left[:, -1], dataset_right[:, -1], self.criterion)

                # Strict '>' keeps the first candidate found among ties.
                if cur_info_gain > max_info_gain:
                    max_info_gain = cur_info_gain
                    best_split = {
                        'threshold': threshold,
                        'feature_index': feature_index,
                        'info_gain': cur_info_gain,
                        'dataset_left': dataset_left,
                        'dataset_right': dataset_right,
                    }
        return best_split

    def split(self, dataset, feature_index, threshold):
        """Partition rows on one feature.

        Returns:
            (dataset_left, dataset_right): rows with feature <= threshold and
            rows with feature > threshold, each a new 2-D array.
        """
        column = dataset[:, feature_index]
        # Two explicit masks (rather than one mask and its negation) so that
        # rows whose value compares False both ways, such as NaN, land in
        # neither half.
        return dataset[column <= threshold], dataset[column > threshold]

    def information_gain(self, y, y_left, y_right, mode = 'entropy'):
        """Impurity of the parent minus the size-weighted impurity of the children.

        Args:
            y: labels of the parent node.
            y_left, y_right: labels of the two children.
            mode: 'entropy' or 'gini'.

        Raises:
            ValueError: if ``mode`` is not a supported measure.
        """
        if mode == 'entropy':
            impurity = self.entropy
        elif mode == 'gini':
            impurity = self.gini
        else:
            raise ValueError(
                "mode must be one of %r, got %r" % (self._CRITERIA, mode))

        weight_left = len(y_left) / len(y)
        weight_right = len(y_right) / len(y)
        return impurity(y) - (weight_left * impurity(y_left)
                              + weight_right * impurity(y_right))

    def gini(self, y):
        """Gini impurity: 1 - sum(p_label ** 2) over the distinct labels in y."""
        _, counts = np.unique(y, return_counts = True)
        p_labels = counts / len(y)
        return float(1 - np.sum(p_labels ** 2))

    def entropy(self, y):
        """Shannon entropy in bits: -sum(p_label * log2(p_label)) over labels in y."""
        _, counts = np.unique(y, return_counts = True)
        p_labels = counts / len(y)
        # '+ 0.0' turns a possible -0.0 (pure node) into 0.0.
        return float(-np.sum(p_labels * np.log2(p_labels))) + 0.0

    def calculate_leaf_value(self, Y):
        """Return the most frequent label in Y (first one encountered on ties)."""
        Y = list(Y)
        return max(Y, key = Y.count)

    def fit(self, X, y):
        """Build the tree from training data and store it in ``self.root``.

        Args:
            X: 2-D array-like of features, one row per sample.
            y: labels, either a column vector (n_samples, 1) or a flat
                sequence of length n_samples.
        """
        X = np.asarray(X)
        # Accept 1-D labels as well as column vectors.
        y = np.asarray(y).reshape(-1, 1)
        dataset = np.concatenate((X, y), axis = 1)
        self.root = self.build_tree(dataset, 0)

    def make_predict(self, x, node):
        """Walk one sample ``x`` down the tree from ``node`` and return the leaf value."""
        # A node with a stored value is a leaf. 'is not None' (not '!= None')
        # so the check is an identity test, independent of how the stored
        # value implements comparison.
        if node.value is not None:
            return node.value

        if x[node.feature_index] <= node.threshold:
            return self.make_predict(x, node.left)
        return self.make_predict(x, node.right)

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``; returns a 1-D array."""
        return np.array([self.make_predict(x, self.root) for x in X_test])
        

In [131]:

# Train a tree on the training split.
classifier = DecisionTree(max_depth = 6)
classifier.fit(X_train, y_train)

# predict() returns a flat array; reshape it to a column so it compares
# element-wise against y_test.
y_pred = classifier.predict(X_test).reshape(-1, 1)

# Accuracy as the percentage of test samples whose prediction matches the label.
n_correct = (y_test == y_pred).sum()
accuracy = n_correct / len(y_test) * 100
print(round(accuracy, 2))


100.0
