In [1]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log
import sys

In [2]:
# prepare dataset
age = 'youth,youth,middleAged,senior,senior,senior,middleAged,youth,youth,senior,youth,middleAged,middleAged,senior'.split(',')
income = 'high,high,high,medium,low,low,low,medium,low,medium,medium,medium,high,medium'.split(',')
student = 'no,no,no,no,yes,yes,yes,no,yes,yes,yes,no,yes,no'.split(',')
creditRating = 'fair,excellent,fair,fair,fair,excellent,excellent,fair,fair,fair,excellent,excellent,fair,excellent'.split(',')
buysComputer = 'no,no,yes,yes,yes,no,yes,no,yes,yes,yes,yes,yes,no'.split(',')

dataset = {'age':age,'income':income,'student':student,'creditRating':creditRating,'buysComputer':buysComputer}
df = pd.DataFrame(dataset,columns=['age','income','student','creditRating','buysComputer'])
print(df)
attName = ['age', 'income', 'student', 'creditRating']
className = 'buysComputer'

           age  income student creditRating buysComputer
0        youth    high      no         fair           no
1        youth    high      no    excellent           no
2   middleAged    high      no         fair          yes
3       senior  medium      no         fair          yes
4       senior     low     yes         fair          yes
5       senior     low     yes    excellent           no
6   middleAged     low     yes    excellent          yes
7        youth  medium      no         fair           no
8        youth     low     yes         fair          yes
9       senior  medium     yes         fair          yes
10       youth  medium     yes    excellent          yes
11  middleAged  medium      no    excellent          yes
12  middleAged    high     yes         fair          yes
13      senior  medium      no    excellent           no


In [3]:
# Calculate Entropy of dataset
def findEntropy(df):
    #print('Calculating entropy..........')
    #print(f'df (findEntropy): \n{df}')
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name
    entropy = 0
    values = df[Class].unique()

    for value in values:
        fraction = df[Class].value_counts()[value] / len(df[Class])
        entropy += -fraction * np.log2(fraction)
    return entropy

In [4]:
# Calculate Entropy by attribute
def findEntropyAttribute(df,attribute):
    #print('Calculating entropy by attribute..........')  
    #print(f'df (findEntropy): \n{df}')
    #print('attribute (findEntropyAttribute): ', attribute) 
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name

    target_variables = df[Class].unique() #This gives all 'Yes' and 'No'
    variables = df[attribute].unique()    #This gives different features in that attribute (like 'Hot','Cold' in Temperature)

    entropy2 = 0
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute] == variable][df[Class] == target_variable])
            den = len(df[attribute][df[attribute] == variable])
            fraction = num / (den+eps)
            entropy += -fraction * log(fraction+eps)
        fraction2 = den / len(df)
        entropy2 += -fraction2 * entropy

    return abs(entropy2)

In [5]:
# Calculate information gain and return the best splitting node (feature)
def infoGain(df):
    #print('Calculate information gain ..........')
    #print(f'df (findEntropy): \n{df}')
    IG = []
    for key in df.keys()[:-1]:
        IG.append(findEntropy(df) - findEntropyAttribute(df,key))

    return df.keys()[:-1][np.argmax(IG)]

In [6]:
def giniImpurity2(valueCounts):
    #print('Calculating gini impurity ..........')
    #print(f'df (findEntropy): \\n{df}')
    #print('valueCounts.keys(): ', valueCounts.keys())
    n = valueCounts.sum()
    p_sum = 0
    for key in valueCounts.keys():
        p_sum = p_sum  +  (valueCounts[key] / n ) * (valueCounts[key] / n )
        gini = 1 - p_sum

    return gini

In [7]:
# Calculating  gini impurity for the attiributes
def giniSplitAtt2(df, attName):
    #print('Calculating gini impurity by attribute..........')
    #print(f'df (giniSplitAtt2): \\n{df}')
    #print('attName (giniSplitAtt2): ', attName)
    attValues = df[attName].value_counts()
    gini_A = 0
    for key in attValues.keys():
        dfKey = df[className][df[attName] == key].value_counts()
        numOfKey = attValues[key]
        n = df.shape[0]
        gini_A = gini_A + (( numOfKey / n) * giniImpurity2(dfKey))

    return gini_A

In [8]:
def giniIndex2(df, attributeNames):
    #print('Calculate gini index2 ..........')
    #print(f'df (findEntropy): \\n{df}')
    #print('attributeNames (giniIndex2): ', attributeNames)
    giniAttribute = {}
    minValue = sys.maxsize
    for key in attributeNames:
        #print('=====================================> key (giniIndex2): ', key)
        giniAttribute[key] = giniSplitAtt2(df, key)
        if giniAttribute[key] < minValue:
            minValue = giniAttribute[key]
            selectedAttribute = key
        #print(f'Gini for {key} is {giniAttribute[key]:.3f}')
    minValue = min(giniAttribute.values())
    #selectedAttribute = min(giniAttribute.keys())
    #print('^^^^^^^^^^^^^^^^ minValue (giniIndex2): ', minValue)
    #print('^^^^^^^^^^^^^^^^ giniIndex2 methods is returning (giniIndex2):', selectedAttribute)
    return selectedAttribute

In [9]:
def getSubtable(df, node, value):
  return df[df[node] == value].reset_index(drop=True)

In [10]:
print(findEntropy(df))

0.9402859586706311


In [11]:
print(findEntropyAttribute(df,'age'))

0.6935361388961914


In [12]:
print(findEntropyAttribute(df,'income'))

0.9110633930116756


In [13]:
print(findEntropyAttribute(df,'student'))

0.7884504573082889


In [14]:
print(findEntropyAttribute(df,'creditRating'))

0.892158928262361


In [15]:
print(findEntropyAttribute(df,'buysComputer'))

3.2034265038149176e-16


In [16]:
print(infoGain(df))

age


In [None]:
def buildTree(df,model,tree=None): 
    # print('@@@@@@@@@@@@@@@@@ Building a classification tree................................')
    # print(f'DataFrame: \n{df}')
    # print('tree (buildTree): ', tree)
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name
    # print('Class (buildTree): ', Class)
    #Here we build our decision tree

    #Get attribute with maximum information gain
    #print('model (buildTree): infoGain')

    if model == 'infoGain':
        #print('Calling infoGain(df)')
        node = infoGain(df)
    else:
        #print('Calling giniIndex2')
        node = giniIndex2(df, attName)
       
    # print('node (buildTree): ', node)

    #Get distinct value of that attribute e.g Salary is node and Low,Med and High are values
    attValueBT = np.unique(df[node])
    # print('attValue (buildTree): ', attValueBT)
    #Create an empty dictionary to create tree    
    if tree is None:                    
        tree = {}
        tree[node] = {}
    
    #We make loop to construct a tree by calling this function recursively. 
    #In this we check if the subset is pure and stops if it is pure. 

    for value in attValueBT:
        # print('value (buildTree): ', value)
        subtable = getSubtable(df,node,value)

        clValue,counts = np.unique(subtable[className],return_counts=True)

        if len(counts) == 1: # Checking purity of subset
            # print('##########################################********** Recursive call 1 **********')
            # print('node (buildTree): ', node)
            # print('value (buildTree): ', tree)
            tree[node][value] = clValue[0]                                                    
        else:
            # print('##########################################********** Recursive call 2 **********')
            # print('node (buildTree): ', node)
            # print('value (buildTree): ', tree)
            # print(f'subtable (buildTree): \n{subtable}')
            tree[node][value] = buildTree(subtable, model) # Calling the function recursively 
    # print('############################################ tree to return (buildTree) ----->: ', tree)           
    return tree

In [None]:
import pprint
className = 'buysComputer'
#className = 'creditRating'
print('Target Class: ', className)
model = 'gini'
#model = 'infoGain'
t=buildTree(df, model)
pprint.pprint(t)

In [None]:
# Calculating  gini impurity for the attiributes
def gini_split_a(attribute_name):
    attribute_values = df[attribute_name].value_counts()
    gini_A = 0 
    # print('class_name: ', className)
    # print('attribute_values: ', attribute_values)
    for key in attribute_values.keys():
        df_k = df[className][df[attribute_name] == key].value_counts()
        n_k = attribute_values[key]
        n = df.shape[0]
        gini_A = gini_A + (( n_k / n) * giniImpurity2(df_k))
    return gini_A

attribute_names =  ['age', 'income', 'student', 'creditRating']
gini_attiribute ={}
for key in attribute_names:
    gini_attiribute[key] = gini_split_a(key)
    print(f'Gini for {key} is {gini_attiribute[key]:.3f}')

In [None]:
# Compute Gini gain values to find the best split
# An attribute has maximum Gini gain is selected for splitting.

min_value = min(gini_attiribute.values())
print('The minimum value of Gini Impurity : {0:.3} '.format(min_value))
print('The maximum value of Gini Gain     : {0:.3} '.format(1-min_value))

selected_attribute = min(gini_attiribute.keys())
print('The selected attiribute is: ', selected_attribute)