In [7]:
# Basic imports

import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import random
from IPython.core.display import display,Image
from string import Template
import IPython.display
import warnings

In [8]:
class Node():
    
    def __init__(self, feature_index, threshold, left, right, info_gain, value):
        
        # values for decision nodes
        self.feature_index = feature_index # index of a feature for which will be asked threshold question
        self.threshold = threshold # example: is value greater than threashold (Yes\No)
        self.left = left # left child of the particular node
        self.right = right # right child of the particular node
        self.info_gain = info_gain # information gain is the reduction in entropy or surprise 
        # by transforming a dataset and is often used in training decision trees.
        # information gain is calculated by comparing the entropy of the dataset before and after a transformation.
        
        # value for leaf node
        self.value = value

In [9]:
class DecesionTree():
    
    def __init__(self, min_samples_split, max_depth):
        self.root = None # init the root
        
        # stopping conditions | node won't be splited if 
        self.min_samples_split = min_samples_split # number of samples less than min_samples_split 
        self.max_depth = max_depth # current_depth greater than max_depth
    
    # recursive function to build the tree
    def build_tree(self, dataset, curr_depth):
        # spliting dataset on X and y and defining num of samples and features
        X, y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)
        
        # check stopping conditions
        if self.min_samples_split <= num_samples and curr_depth <= self.max_depth:
            # find the best split for current node
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if the information gain equals to 0 -> current node is a leaf
            if best_split['info_gain'] > 0:
                # continue recursion for right and left children
                left_node = self.build_tree(best_split['left_dataset'], curr_depth + 1)
                right_node = self.build_tree(best_split['right_dataset'], curr_depth + 1)
                
                # return decision (current) node 
                return Node(
                    best_split['feature_index'],
                    best_split['threshold'],
                    left_node,
                    right_node,
                    best_split['info_gain']
                )
            
        # return leaf (current) node
        leaf_value = self.get_leaf_value(y)
        return Node(value=leaf_value)
    
    # function to get best split for every node in the tree
    def get_best_split(self, dataset, num_samples, num_features):
        # init best_split values and current maximum info value
        best_split = {}
        max_info_gain = -float('inf')
        
        # for each feature get threshold values (unique values of feature) and for each threshold 
        # do the split and check if it is better than current info_gain value
        for feature_index in range(num_features):
            feature = dataset[:, feature_index]
            possible_thresholds = np.unique(feature)
            for threshold in possible_thresholds:
                # get split for current threshold
                left_dataset, right_dataset = self.split(dataset, feature_index, threshold)
                # check if the datasets are not empty
                if len(left_dataset) > 0 and len(right_dataset) > 0:
                    y, left_y, right_y = dataset[:, -1], left_dataset[:, -1], right_dataset[:, -1]
                    # compute information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y, 'gini')
                    # update info_gain and other values if needed
                    if curr_info_gain > max_info_gain:
                        max_info_gain = curr_info_gain
                        best_split['feature_index'] = feature_index
                        best_split['threshold'] = threshold
                        best_split['left_dataset'] = left_dataset
                        best_split['right_dataset'] = right_dataset
                        best_split['info_gain'] = curr_info_gain
                        
        # return best split values
        return best_split
    
    # function to split dataset
    def split(self, dataset, feature_index, threshold):
        left_dataset = np.array([row for row in dataset if row[feature_index] <= threshold])
        right_dataset = np.array([row for row in dataset if row[feature_index] > threshold])
        
        return left_dataset, right_dataset
    
    # function to calculate information gain with entropy\gini index
    def information_gain(self, parent, left_child, right_child, mode='entropy'):
        left_weight = len(left_child) / len(parent)
        right_weight = len(right_child) / len(parent)
        
        # compute gain using gini\entropy index
        if mode == 'gini':
            gain = self.gini_index(parent) - (self.gini_index(left_child) * left_weight
                                              + self.gini_index(right_child) * right_weight)
        else:
            gain = self.entropy(parent) - (self.entropy(left_child) * left_weight
                                              + self.entropy(right_child) * right_weight)
            
    # function to compute gini index
    def gini_index(self, y):
        class_labes = np.unique(y)
        gini = 0
        # for each class take it's samples and divide by all samples, then square it
        for cls in class_labels:
            p_cls = len(y[cls == y]) / len(y)
            gini += p_cls ** 2
        return 1 - gini
    
    # function to compute entropy
    def entropy(self, y):
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[cls == y]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy
    
    # function to get value for leaves
    def get_leaf_value(self, y):
        y = list(y)
        # gets maximum count for all the classes in y
        return max(y, key=y.count)
    
    # function to print tree
    def print_tree(self, tree=None, indent=" "):
        # indent is a space from the left side | indent = " " * (current_depth + 1)
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)
        else:
            # recursion, firstly - print left subtree, then right
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
        
    # function to train the tree
    def fit(self, X, y):
        dataset = np.concatinate(X, y, axis=1)
        self.root = self.build_tree(dataset)
        
    # function to predict results
    def predict(self, X):
        predictions = [self.make_predictions(row) for row in X]
        return predictions
    
    # recursive function predicts result for a single value
    def make_predictions(self, row, node):
        # check if node is a leaf node -> return value
        if node.value != None:
            return node.value
        feature_val = row[node.feature_index]
        if feature_val <= node.threshold:
            return self.make_prediction(row, node.left_node)
        else:
            return self.make_prediction(row, node.right_node)

In [11]:
# test Decision tree

col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
data = pd.read_csv("iris.csv", skiprows=1, header=None, names=col_names)

X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

classifier = DecisionTreeClassifier(min_samples_split=3, max_depth=3)
classifier.fit(X_train,Y_train)
classifier.print_tree()

FileNotFoundError: [Errno 2] No such file or directory: 'iris.csv'