# Decision Tree classifier


### Dataset
#### We used Iris dataset for this task. It contains 150 samples of 3 different species of Iris flowers (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. 

In [551]:
# scikit-learn package
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [552]:
iris = load_iris()
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

### Example of usage

In [553]:
X, y = iris.data, iris.target
X, X_test, y, y_test = train_test_split(X, y, test_size= 0.30)

In [554]:
class Node:
    
    def __init__(self, X, y, gini, feature_index, left, right, threshold):
        self.X = X
        self.y = y
        self.gini = gini
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right


In [555]:
class MyDecisionTreeClassifier:
    
    def __init__(self, max_depth):
        self.max_depth = max_depth
    
    def gini(self, groups, classes):
        '''
        A Gini score gives an idea of how good a split is by how mixed the
        classes are in the two groups created by the split.
        
        A perfect separation results in a Gini score of 0,
        whereas the worst case split that results in 50/50
        classes in each group result in a Gini score of 0.5
        (for a 2 class problem).
        '''
        dictionary_groups = {}
        for element in groups:
            if element not in dictionary_groups:
                dictionary_groups[element] = 0
            dictionary_groups[element] += 1
        # print(dictionary_groups)
        gini = 1 - sum([(dictionary_groups[group]/len(groups))**2 for group in dictionary_groups])
        return gini

    def divide(self, X, y,feature, question):
        """
        
        
        """
        true_X, true_y, false_X, false_y = [], [], [], []
        for index, value in enumerate(X):
            if value[feature] >= question:
                true_X.append(value)
                true_y.append(y[index])
            else:
                false_X.append(value)
                false_y.append(y[index])
        return true_X, true_y, false_X, false_y


    def test_divide(self, X, y,feature, question):
        true_list, false_list = [], []
        for index, value in enumerate(X):
            if value[feature] >= question:
                true_list.append(y[index])
            else:
                false_list.append(y[index])
        return true_list, false_list

    def split_data(self, X, y) -> tuple[int, int]:
        impurity = self.gini(y, 0)
        questions = set()
        for statement in X:
            for feature in range(len(X[0])):
                true_list, false_list = self.test_divide(X, y, feature, statement[feature])
                average_impurity = (len(true_list)/len(X)) * self.gini(true_list, 0) + (len(false_list)/len(X)) * self.gini(false_list, 0)
                gain_impurity = impurity - average_impurity
                questions.add((feature, statement[feature], gain_impurity))
        return sorted(questions, key=lambda x: x[-1])[-1]
    
    def build_tree(self, X, y, depth = 0):
        best_split = self.split_data(X, y)
        if best_split[-1] == 0 or depth >= self.max_depth:
            return Node(X, y, best_split[-1], best_split[0], None, None, None)
        else:
            true_X, true_y, false_X, false_y = self.divide(X, y, best_split[0], best_split[1])
            true_tree = self.build_tree(true_X, true_y, depth + 1)
            false_tree = self.build_tree(false_X, false_y, depth + 1)
            return Node(X, y, best_split[-1], best_split[0], true_tree, false_tree, best_split[-2])
    
    def fit(self, X, y):
        self.tree = self.build_tree(X, y)
        
    def evaluate_probability(self, y_test):
        probabilities = {}
        for element in y_test:
            if element not in probabilities:
                probabilities[element] = 0
            probabilities[element] += 1
        return sorted(probabilities.items(), key=lambda item: item[1])[0][0]

    def predict(self, X_test):
        predictions = []
        for sample in X_test:
            current_node = self.tree
            while True:
                if current_node.left is None:
                    prediction = self.evaluate_probability(current_node.y)
                    predictions.append(prediction)
                    break
                elif sample[current_node.feature_index] >= current_node.threshold:
                    current_node = current_node.left
                else:
                    current_node = current_node.right
        return predictions
    
    def evaluate(self, X_test, y_test):
        predictions = self.predict(X_test)
        correct_predictions = sum([1 for i in range(len(y_test)) if y_test[i] == predictions[i]])
        return correct_predictions / len(y_test)

### Class functions
1) fit - function, which builds tree (build_tree) and sets result into class variable tree
2) build_tree - recursive function which builds composition of Node objects using test_divide and gini function.
3) test_divide - function which tests how good divide and returns information gain index.
4) gini - function which returns gini index.
6) divide - function which divides tree for given feature
7) evaluate_probability - function which returns most common type. 
8) predict - function which returns predictions for given list of features.
9) evaluate - function, which returns how acurate are given predictions


In [556]:
clf = MyDecisionTreeClassifier(10)
clf.fit(X_test, y_test)
print(clf.evaluate(X, y))

0.9333333333333333
