# Decision Tree Implementation

In [173]:
from sklearn import datasets

In [174]:
df = datasets.load_iris()
X = df.data
Y = df.target

In [175]:
def makeLabelled(column):
    second_limit = column.mean()
    first_limit = 0.5*second_limit
    third_limit = 1.5*second_limit
    for i in range(0,len(column)):
        if(column[i] < first_limit):
            column[i] = 0
        elif (column[i] < second_limit):
            column[i] = 1
        elif (column[i] < third_limit):
            column[i] = 2
        else:
            column[i] = 3
    return column

In [176]:
for i in range(0,X.shape[-1]):
    X[:,i] = makeLabelled(X[:,i])

In [177]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,random_state = 0)

In [178]:
from math import log
import numpy as np

In [179]:
def find_unique_values(x, col):
    return set([row[col] for row in x])

In [180]:
def find_entropy(y):
    dictionary = {}
    entropy = 0
    s = set(y)
    for e in s:
        dictionary[e] = (y == e).sum()
    for key in dictionary:
        prob = dictionary[key]/len(y)
        if(prob != 0):
            entropy += (-1)*prob*log(prob,2)
    return entropy

In [181]:
def find_info_gain(x,y,f):
    parent_entropy = find_entropy(y)
    s = find_unique_values(x,f)
    result = {}
    dictionary = {}
    for e in s:
        rows = (x[:,f] == e)
        dictionary[e] = rows.sum()
        result[e] = find_entropy(y[rows])
    weighted_child_entropy = 0
    for key in result:
        weighted_child_entropy += (dictionary[key]/len(x))*(result[key])
    return parent_entropy - weighted_child_entropy

In [182]:
def base_case_pure_node(y):
    print("Base Case for pure node:-")
    print("Count of",y[0],"=",len(y))
    print("Current Entropy is = 0.0")
    print("Leaf Node Reached")
    print()

In [183]:
def base_case_no_features_left(y):
    print("Base case when no features are present:-")
    s = set(y)
    for e in s:
        print("Count of ",e," =",(y == e).sum())
    print("Current Entropy is ",find_entropy(y))
    print()

In [184]:
def find_best_split(x,y,features):
    max_gain = -1000
    final_feature = -1
    for f in features:
        info_gain = find_info_gain(x,y,f)
        if(info_gain > max_gain):
            max_gain = info_gain
            final_feature = f
    return max_gain, final_feature

In [185]:
def print_current_node(y,max_gain,final_feature):
    print("Normal Node:-")
    s = set(y)
    for e in s:
        print("Count of",e,"=",(y == e).sum())
    print("Current Entropy is ",find_entropy(y))
    print("Splitting on feature ",final_feature," with information gain ",max_gain)
    print()

In [186]:
def partition(x,y,features,final_feature):
    s = set(x[:,final_feature])
    features.remove(final_feature)
    for e in s:
        arr = (x[:,final_feature] == e)
        build_DT(x[arr], y[arr], features)

In [187]:
def build_DT(x, y, features, level):
    #base case for pure node
    if (len(set(y)) == 1):
        base_case_pure_node(y)
        return
    #base case when all features are used in splitting
    if(len(features) == 0):
        base_case_no_features_left(y)
        return
    #Decision to make on which feature to split on
    max_gain, final_feature = find_best_split(x,y,features)
    # Printing current node
    print_current_node(y,max_gain,final_feature)
    #Splitting the data on the basis of maximum gain
    partition(x,y,features,final_feature)

In [192]:
training_data = np.array([
    [0, 3, 0],
    [1, 3, 0],
    [2, 1, 1],
    [2, 1, 1],
    [1, 3, 2],
])
x = np.array(training_data[:,:2])
y = np.array(training_data[:,-1])
features = [0,1]

In [190]:
features = [0,1,2,3]

In [191]:
build_DT(X,Y,features,0)

Normal Node:-
Count of 0 = 50
Count of 1 = 50
Count of 2 = 50
Current Entropy is  1.58496250072
Splitting on feature  3  with information gain  1.35656522266

Base Case for pure node:-
Count of 0 = 49
Current Entropy is = 0.0
Leaf Node Reached

Normal Node:-
Count of 0 = 1
Count of 1 = 10
Current Entropy is  0.439496986922
Splitting on feature  1  with information gain  0.439496986922

Base Case for pure node:-
Count of 1 = 10
Current Entropy is = 0.0
Leaf Node Reached

Base Case for pure node:-
Count of 0 = 1
Current Entropy is = 0.0
Leaf Node Reached

Normal Node:-
Count of 1 = 39
Count of 2 = 5
Current Entropy is  0.510787822954
Splitting on feature  2  with information gain  0.0776949537301

Base Case for pure node:-
Count of 1 = 1
Current Entropy is = 0.0
Leaf Node Reached

Normal Node:-
Count of 1 = 38
Count of 2 = 4
Current Entropy is  0.453716339187
Splitting on feature  0  with information gain  0.00399336146638

Base case when no features are present:-
Count of  1  = 14
Count