In [9]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log
from pprint import pprint

In [10]:
# Load the dataset without its 'day' column (chained, so no in-place mutation), then display it.
df = pd.read_csv('play_tennis.csv').drop(columns='day')
df

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [41]:
def calculateEntropy(df):
    """Return the Shannon entropy, in bits, of the class column of ``df``.

    The class (label) column is taken to be the last column of ``df``.
    The logarithm is base 2 so that the result is in the same unit as the
    per-attribute entropy it is compared against when computing information
    gain.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column holds the class labels.

    Returns
    -------
    float
        Entropy of the label distribution; 0 for an empty or single-class frame.
    """
    label = df.columns[-1] # class column is the last one
    total = len(df[label])
    entropy = 0

    # accumulate -p*log2(p) over every distinct class value
    for value in df[label].unique():
        probability = len(df[df[label] == value])/total
        if probability == 0:
            # e.g. a NaN label never equals itself; 0*log2(0) is taken as 0
            continue
        entropy += -probability*np.log2(probability)

    return entropy

In [50]:
def calculateAttributeEntropy(df, attribute):
    """Return the weighted entropy of the class column after splitting on ``attribute``.

    For every distinct value of ``attribute`` the entropy of the class column
    (the last column of ``df``) is computed over the matching rows, and those
    entropies are averaged with weights equal to each value's share of the rows.
    The module-level ``eps`` is added to the denominator and to the logarithm
    argument, and the module-level ``log`` (base-2 logarithm) is used.
    """
    label = df.columns[-1]
    classes = df[label].unique()  # distinct class values
    total_rows = len(df)
    weighted = 0

    for level in df[attribute].unique():
        level_mask = df[attribute] == level
        level_count = len(df[level_mask])  # rows having this attribute value

        level_entropy = 0
        for cls in classes:
            hits = len(df[level_mask & (df[label] == cls)])  # rows with this value AND this class
            share = hits/(level_count+eps)  # eps guards against a zero denominator
            level_entropy += -share*log(share+eps)  # eps keeps log away from zero

        weight = level_count/total_rows
        weighted += -weight*level_entropy

    # the accumulator carries an extra minus sign; abs() removes it
    return abs(weighted)

In [51]:
def maxEntropy(df):
    """Return the name of the attribute column with the highest information gain.

    Information gain of an attribute is the entropy of the class column
    (``calculateEntropy``) minus the weighted entropy after splitting on that
    attribute (``calculateAttributeEntropy``). Every column except the last
    is treated as a candidate attribute. On ties the first column wins,
    following ``np.argmax``.

    Raises
    ------
    ValueError
        If ``df`` has no attribute columns besides the class column.
    """
    attributes = df.columns[:-1]
    if len(attributes) == 0:
        raise ValueError("maxEntropy needs at least one attribute column besides the class column")

    # the parent entropy does not depend on the attribute, so compute it once
    parent_entropy = calculateEntropy(df)
    informationGain = [
        parent_entropy - calculateAttributeEntropy(df, attribute)
        for attribute in attributes
    ]

    return attributes[np.argmax(informationGain)]

In [47]:
def getSubTree(df, node, value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``, re-indexed from 0."""
    matches = df[node] == value  # boolean mask of the rows on this branch
    branch = df.loc[matches]
    return branch.reset_index(drop=True)

In [52]:
def decisionTree(df, tree=None):
    """Recursively build an ID3-style decision tree as nested dictionaries.

    The attribute returned by ``maxEntropy`` becomes the node. For each of its
    values the matching rows are extracted with ``getSubTree``; if those rows
    all share one class the branch becomes a leaf holding that class,
    otherwise the branch holds the tree built from those rows.

    Only the class column (the last column of ``df``) is inspected to decide
    whether a branch is a leaf. The attribute just used is removed before
    recursing, so each level has one attribute fewer and the recursion always
    terminates. When rows still disagree on the class but no attribute is
    left, the most frequent class is used as the leaf.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; last column is the class, all others are attributes.
    tree : dict, optional
        Existing dictionary to add the node to. A new one is created when
        omitted.

    Returns
    -------
    dict
        ``{attribute: {value: leaf_class_or_subtree, ...}}``.
    """
    label = df.columns[-1]
    node = maxEntropy(df)
    values = np.unique(df[node]) # distinct values of the chosen attribute

    if tree is None:
        tree = {}
    tree.setdefault(node, {})

    for value in values:
        subtree = getSubTree(df, node, value)
        classes, counts = np.unique(subtree[label], return_counts=True)

        if len(classes) == 1:
            # every row on this branch has the same class: leaf
            tree[node][value] = classes[0]
            continue

        # the used attribute is constant within the branch, so it cannot split further
        remaining = subtree.drop(columns=node)
        if remaining.shape[1] == 1:
            # only the class column is left: fall back to the majority class
            tree[node][value] = classes[np.argmax(counts)]
        else:
            tree[node][value] = decisionTree(remaining)

    return tree

In [53]:
# Build the decision tree from the loaded DataFrame and pretty-print the nested dictionary.
t = decisionTree(df)
pprint(t)

{'outlook': {'Overcast': 'Yes',
             'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'humidity': {'High': 'No', 'Normal': 'Yes'}}}}
