# Importing Necessary Libraries

In [110]:
# Dataframes
import numpy as np
import pandas as pd
from copy import deepcopy

# Plots
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8,8)
plt.style.use('ggplot')

# Scikit Learn
from sklearn.cross_validation  import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initializing Parameters

In [111]:
# Path of the folder
path = 'C:/Users/prash/Downloads/ML ALGORITHMS/'

# Number of clusters required
max_depth = 3

# Importing and Cleaning Dataset

In [112]:
# Import Iris Dataset
iris_dataset = pd.read_csv(path + 'DATASETS/' + 'Iris.csv')
# Removing Index Column
iris_dataset = iris_dataset.iloc[:,1:]

# Input Dataframe
X = iris_dataset.iloc[:,:-1]
X = np.array(X)

# Encode the Output labels
Y = iris_dataset.iloc[:,-1]
for i in range(len(Y.unique())):
    Y = Y.replace(Y.unique()[i],i)
Y = np.array(Y)

# Divide into train and test datasets
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)

# Implementation From Scratch

In [113]:
def Gini_Impurity(num_samples_per_class,n_classes,output_size):
    """Compute Gini impurity of a non-empty node.
    Gini impurity is defined as Σ p(1-p) over all classes, with p the frequency of a
    class within the node. Since Σ p = 1, this is equivalent to 1 - Σ p^2.
    """    
    gini_impurity = 1.0 - sum(((n / output_size) ** 2) for n in num_samples_per_class)
    
    return gini_impurity

class Node:
    """A decision tree node."""

    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None
        
        
def best_split_criteria(X, Y, n_classes):
    """Find the best split for a node.
    "Best" means that the average impurity of the two children, weighted by their
    population, is the smallest possible. Additionally it must be less than the
    impurity of the current node.
    To find the best split, we loop through all the features, and consider all the
    midpoints between adjacent training samples as possible thresholds. We compute
    the Gini impurity of the split generated by that particular feature/threshold
    pair, and return the pair with smallest impurity.
    Returns:
        best_idx: Index of the feature for best split, or None if no split is found.
        best_thr: Threshold to use for the split, or None if no split is found.
    """
    # Need at least two elements to split a node.
    m = Y.size
    if m <= 1:
        return None, None

    # Count of each class in the current node.
    num_parent = [np.sum(Y == c) for c in range(n_classes)]

    # Gini of current node.
    best_gini = Gini_Impurity(num_parent,n_classes,m)
    best_idx, best_thr = None, None

    # Loop through all features.
    for idx in range(n_features):
        # Sort data along selected feature.
        thresholds, classes = zip(*sorted(zip(X[:, idx], Y)))

        # We could actually split the node according to each feature/threshold pair
        # and count the resulting population for each class in the children, but
        # instead we compute them in an iterative fashion, making this for loop linear rather than quadratic.
        num_left = [0] * n_classes
        num_right = num_parent.copy()
        # possible split positions
        for i in range(1, m):  
            c = classes[i - 1]
            
            num_left[c] += 1
            num_right[c] -= 1
            
            gini_left = Gini_Impurity(num_left,n_classes,i)
            gini_right = Gini_Impurity(num_right,n_classes,(m-i))

            # The Gini impurity of a split is the weighted average of the Gini impurity of the children
            gini = (i * gini_left + (m - i) * gini_right) / m

            # The following condition is to make sure we don't try to split two
            # points with identical values for that feature, as it is impossible
            # (both have to end up on the same side of a split).
            if thresholds[i] == thresholds[i - 1]:
                continue

            if gini < best_gini:
                best_gini = gini
                best_idx = idx
                # midpoint
                best_thr = (thresholds[i] + thresholds[i - 1]) / 2  

    return best_idx, best_thr


def grow_tree(X, Y, n_classes,max_depth,depth = 0):

    num_samples_per_class = [np.sum(Y == i) for i in range(n_classes)]
    predicted_class = np.argmax(num_samples_per_class)

    node = Node(gini = Gini_Impurity(num_samples_per_class,n_classes,Y.size),
                num_samples = Y.size,
                num_samples_per_class = num_samples_per_class,
                predicted_class = predicted_class)
    
    # Split recursively until maximum depth is reached.
    if depth < max_depth:
        best_idx, best_threshold = best_split_criteria(X, Y, n_classes)
        if best_idx is not None:
            indices_left = X[:, best_idx] < best_threshold
            
            X_left, Y_left = X[indices_left], Y[indices_left]
            X_right, Y_right = X[~indices_left], Y[~indices_left]
            
            node.feature_index = best_idx
            node.threshold = best_threshold
            
            node.left = grow_tree(X_left, Y_left, n_classes,max_depth,depth = depth+1)
            node.right = grow_tree(X_right, Y_right, n_classes,max_depth,depth = depth+1)
            
    return node

In [114]:
n_classes = len(set(Y))  # classes are assumed to go from 0 to n-1
n_features = X.shape[1]


# Training 
tree_model = grow_tree(X, Y, n_classes,max_depth,depth = 0)

# Testing
Y_pred = []
for test in X_test:
    node = tree_model
    while node.left:
        if test[node.feature_index] < node.threshold:
            node = node.left
        else:
            node = node.right
    Y_pred.append(node.predicted_class)

    
# Compute test set accuracy  
acc = accuracy_score(Y_pred, Y_test)
print("Test set accuracy: {:.2f}".format(acc))

Test set accuracy: 1.00


# Scikit Learn Implementation

In [115]:
dt = DecisionTreeClassifier(max_depth = max_depth)

# Fit dt to the training set
dt.fit(X_train, Y_train)

# Predict test set labels
Y_pred = dt.predict(X_test)

# Compute test set accuracy  
acc = accuracy_score(Y_pred, Y_test)
print("Test set accuracy: {:.2f}".format(acc))

Test set accuracy: 0.97


### References:
https://towardsdatascience.com/decision-tree-from-scratch-in-python-46e99dfea775 <br>
https://medium.com/@mathanrajsharma/fundamentals-of-classification-and-regression-trees-cart-e9af0b152503