# Decision Trees Project using iris data set

#Decision Trees are a non-parametric supervised learning method used for both classification and regression tasks. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.

Importing datasets from sklearn

In [1]:
from sklearn import datasets
import pandas as pd

Loading Iris data_sets

In [2]:
iris=datasets.load_iris()

A Data frame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns.

In [3]:
df=pd.DataFrame(iris.data)   #Loding data in DATAFrame
df.columns=["sl", "sw", 'pl', 'pw']     # defining label of  Featured coloums in above iris data sets
x=list(df.columns)      
for i in x:
    print(i)

sl
sw
pl
pw


Loading iris.target into a pandas Data Frame 

In [4]:

y=pd.DataFrame(iris.target)
y.columns=["target"]
print(y.head())

   target
0       0
1       0
2       0
3       0
4       0


Function to convert a continuous data into labelled data

In [5]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):                   #Function to convert a continuous data into labelled data
    if (val < boundaries[0]):                   #There are 4 lables  - a, b, c, d
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

Convert all columns to labelled data

In [6]:
#Convert all columns to labelled data

df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')



Concatinating the Target Data of iris dataset with the df(X train dataframe)

In [7]:
df["target"]=iris.target
df.head()

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled,target
0,5.1,3.5,1.4,0.2,b,c,a,a,0
1,4.9,3.0,1.4,0.2,a,b,a,a,0
2,4.7,3.2,1.3,0.2,a,c,a,a,0
3,4.6,3.1,1.5,0.2,a,c,a,a,0
4,5.0,3.6,1.4,0.2,a,c,a,a,0


Deleting all continous Data columns from the df (Data Frame)

In [8]:
del df["sl"]
del df["sw"]
del df["pl"]
del df["pw"]
df.head()

Unnamed: 0,sl_labeled,sw_labeled,pl_labeled,pw_labeled,target
0,b,c,a,a,0
1,a,b,a,a,0
2,a,c,a,a,0
3,a,c,a,a,0
4,a,c,a,a,0


Making a copy of df (dataFrame)

In [9]:
df_new=df.copy()

importing Math for the further calculations

In [10]:
import math

Entropy : A decision tree is built top-down from a root node and involves partitioning the data into subsets that contain instances with similar values (homogeneous). ID3 algorithm uses entropy to calculate the homogeneity of a sample.

In [11]:
# Funtion for calculating  the Entropy of the a particular Node in the Tree

def Entropy(df_new,feature):
    entropy=0
    Total_val=df_new.target.count()
    
    for i in set(df_new["target"]):
        i_group_counts=df_new[df_new["target"]==i].target.count()
        if i_group_counts!=0:
            entropy+=-((i_group_counts/Total_val)*(math.log(i_group_counts/Total_val,2)))
            
    return entropy

In [12]:
Entropy(df_new,'pw_labeled')

1.584962500721156

Gain ratio:In decision tree learning, Information gain ratio is a ratio of information gain to the split information

In [13]:
def Gain_ratio(df_new,feature):
    orig_en=Entropy(df_new,feature)
    info_gain=0
    split_info=0
    ol=df.target.count()
    xn=0
    
    for i in set(df_new[feature]):
        op=df_new[df_new[feature]==i]
        lp=op.target.count()
        split_info+=-(lp/ol)*math.log(lp/ol,2)
        xn+=(lp/ol)*Entropy(op,i)
    if split_info==0:
        return 0
    else:
         return (orig_en-xn)/split_info

In [14]:
def build_tree(df_new, unused_features,level):
    if len(set(df_new["target"]))==1:
        print("level:",level)
        for i in set(df_new.target):
            print("count of ",str(i) ,"=",df_new[df_new['target']==i].target.count())
        print("Current Entropy is = ",0.0)   
        print("Reached leaf Node")
        print()
        return 
    if len(unused_features)==0:
        print("level:",level)
        for i in set(df_new.target):
            print("count of ",str(i) ,"=",df_new[df_new['target']==i].target.count())
        print("Current Entropy is = ",0.0)   
        print("Reached leaf Node")
        print()
        return 
    best_feature = ""
    maxgain=-1000
    for f in unused_features:
        possible_values = set(df_new[f])
        for val in possible_values:         # loop over possible values : val
            sd=df_new[df_new[f]==val]     # find subset of df & y with f == val
            gain=Gain_ratio(df_new,f)
            if gain>maxgain:
                maxgain=gain
                best_feature=f
                
        # find number of mistakes in this subset 
        # if we predict the most common y as the output
        # find sum of all these mistakes
        # update best feature so that that particular feature
        # makes least number of mistakes
    
    # here you should know the best feature
    # print it out
    
    print("level=",level)
    
    for i in set(df_new.target):
        print("count of ",str(i) ,"=",df_new[df_new['target']==i].target.count())
    print("Current Entropy is ",Entropy(df_new,best_feature))
    print("splitting on feature ", best_feature," with gain ratio ",maxgain)
    print()
    
    unused_features.remove(best_feature)
    
    for i in set(df_new[best_feature]):
        sp=df_new.loc[df_new[best_feature]==i]
        print("No. of ",i,df_new[df_new[best_feature]==i].target.count())
        build_tree(sp,unused_features,level+1)
        print() 


In [15]:
build_tree(df_new,list(df_new.columns[:-1]),0)

level= 0
count of  0 = 50
count of  1 = 50
count of  2 = 50
Current Entropy is  1.584962500721156
splitting on feature  pw_labeled  with gain ratio  0.699638203622209

No. of  d 34
level: 1
count of  2 = 34
Current Entropy is =  0.0
Reached leaf Node


No. of  a 50
level: 1
count of  0 = 50
Current Entropy is =  0.0
Reached leaf Node


No. of  b 10
level: 1
count of  1 = 10
Current Entropy is =  0.0
Reached leaf Node


No. of  c 56
level= 1
count of  1 = 40
count of  2 = 16
Current Entropy is  0.863120568566631
splitting on feature  pl_labeled  with gain ratio  0.8228359169584532

No. of  d 8
level: 2
count of  2 = 8
Current Entropy is =  0.0
Reached leaf Node


No. of  b 1
level: 2
count of  1 = 1
Current Entropy is =  0.0
Reached leaf Node


No. of  c 47
level= 2
count of  1 = 39
count of  2 = 8
Current Entropy is  0.6581912658132185
splitting on feature  sl_labeled  with gain ratio  0.5480381479602677

No. of  d 2
level: 3
count of  1 = 2
Current Entropy is =  0.0
Reached leaf Node


In [16]:
def bestfeature(df_new, unused_features):
    if len(set(df_new["target"]))==1:
        return -1
    if len(unused_features=0):
        return -1
    best_feature = ""
    maxgain=-1000
    for f in unused_features:
        possible_values = set(df_new[f])
        for val in possible_values:
            sd=df_new[df_new[f]==val]     
            gain=Gain_ratio(df_new,f)
            if gain>maxgain:
                maxgain=gain
                best_feature=f
    unused_features.remove(best_feature)
    for i in set(df_new[best_feature]):
        sp=df_new.loc[df_new[best_feature]==i]
#         print("No. of ",i,df_new[df_new[best_feature]==i].target.count())
        bestfeature(sp,unused_features)
#         print() 

In [17]:
class TreeNode:
    def __init__(self,df):
        self.dataframe=df
        self.splits=[]
        self.counts={}
        self.entropy=0
        self.gain_ratio=0
        self.bestfeature=None
        self.unused_features=None
    def Bestfeature_cls(self,df_new,unused_features):
        if len(set(self.dataframe["target"]))==1:
            return "",0
        if len(unused_features)==0:
            return "",0
        best_feature = ""
        maxgain=-1000
        for f in unused_features:
            possible_values = set(self.dataframe[f])
            for val in possible_values:
                sd=self.dataframe[self.dataframe[f]==val]     
                gain=self.Gain_ratio_cls(f)
                if gain>maxgain:
                    maxgain=gain
                    best_feature=f
        return best_feature,maxgain
    def Entropy_cls(self,df_new,feature):
        self.entropy=0
        Total_val=df_new.target.count()

        for i in set(df_new["target"]):
            i_group_counts=df_new[df_new["target"]==i].target.count()
            if i_group_counts!=0:
                self.entropy+=-((i_group_counts/Total_val)*(math.log(i_group_counts/Total_val,2)))

        return self.entropy
    def Gain_ratio_cls(self,feature):
        orig_en=Entropy(self.dataframe,feature)
        info_gain=0
        split_info=0
        ol=self.dataframe.target.count()
        xn=0
        for i in set(self.dataframe[feature]):
            op=self.dataframe[self.dataframe[feature]==i]
            lp=op.target.count()
            split_info+=-(lp/ol)*math.log(lp/ol,2)
            xn+=(lp/ol)*(self.Entropy_cls(op,i))
        if split_info==0:
            return 0
        else:
             return (orig_en-xn)/split_info


In [18]:
def makeTree(df_data,unused_featuress):
    root=TreeNode(df_data)
    root.bestfeature,root.gain_ratio=root.Bestfeature_cls(df_data,unused_featuress)
    if root.gain_ratio==0:
        for cls in set(root.dataframe['target']):
            root.counts[cls]=(root.dataframe[root.dataframe['target']==cls]).target.count()
        return root
    root.unused_features=unused_featuress
    root.entropy=root.Entropy_cls(df_data,root.bestfeature)
    
    for cls in set(root.dataframe['target']):
        root.counts[cls]=(root.dataframe[root.dataframe['target']==cls]).target.count()
        
    unused_featuress.remove(root.bestfeature)
    
    for group in set(root.dataframe[root.bestfeature]):
        groupdata=root.dataframe[root.dataframe[root.bestfeature]==group]
        split=makeTree(groupdata,unused_featuress)
        root.splits.append(split)
    return root
            

In [19]:
root=makeTree(df_new,list(df_new.columns[:-1]))
def print_Tree(root,level):
    if root==None or len(set(root.dataframe["target"]))==1:
        print("level:",level)
        for i in root.counts:
            print("count ",i,'=',root.counts[i])
        print("Current Entropy is = ",0.0)   
        print("Reached leaf Node")
        print()
        return 
    if root.unused_features is None:
        print("level:",level)
        for i in root.counts:
            print("count ",i,'=',root.counts[i])
        print("Current Entropy is = ",0.0)   
        print("Reached leaf Node")
        print()
        return 
    print('level',level)
    for i in root.counts:
        print("count ",i,'=',root.counts[i])
        
    print('Current Entropy is :',root.entropy)
    print("splitting on feature ", root.bestfeature," with gain ratio ",root.gain_ratio)
    print()
    for split in root.splits:
        print_Tree(split,level+1)

In [20]:
print_Tree(root,0)

level 0
count  0 = 50
count  1 = 50
count  2 = 50
Current Entropy is : 1.584962500721156
splitting on feature  pw_labeled  with gain ratio  0.699638203622209

level: 1
count  2 = 34
Current Entropy is =  0.0
Reached leaf Node

level: 1
count  0 = 50
Current Entropy is =  0.0
Reached leaf Node

level: 1
count  1 = 10
Current Entropy is =  0.0
Reached leaf Node

level 1
count  1 = 40
count  2 = 16
Current Entropy is : 0.863120568566631
splitting on feature  pl_labeled  with gain ratio  0.4334099495621066

level: 2
count  2 = 8
Current Entropy is =  0.0
Reached leaf Node

level: 2
count  1 = 1
Current Entropy is =  0.0
Reached leaf Node

level 2
count  1 = 39
count  2 = 8
Current Entropy is : 0.6581912658132185
splitting on feature  sl_labeled  with gain ratio  0.12674503775809332

level: 3
count  1 = 2
Current Entropy is =  0.0
Reached leaf Node

level: 3
count  2 = 1
Current Entropy is =  0.0
Reached leaf Node

level: 3
count  1 = 14
Current Entropy is =  0.0
Reached leaf Node

level 3


In [21]:
print(list(df_new.columns[:-1]))

['sl_labeled', 'sw_labeled', 'pl_labeled', 'pw_labeled']
