In [1]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log
from pprint import pprint

In [2]:
df = pd.read_csv('play_tennis.csv')
df.drop('day', axis=1, inplace=True)
df

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,100,High,Weak,No
1,Sunny,90,High,Strong,No
2,Overcast,80,High,Weak,Yes
3,Rain,70,High,Weak,Yes
4,Rain,60,Normal,Weak,Yes
5,Rain,50,Normal,Strong,No
6,Overcast,40,Normal,Strong,Yes
7,Sunny,70,High,Weak,No
8,Sunny,60,Normal,Weak,Yes
9,Rain,70,Normal,Weak,Yes


In [3]:
def calculateEntropy(df):
    
    label = df.columns[-1]
    entropy = 0
    values = df[label].unique()
    for value in values:
        probability = len(df[df[label] == value])/len(df[label])
        entropy += -probability*log(probability)
        
    return entropy

In [4]:
def calculateAttributeEntropy(df, attribute):
    
    label = df.columns[-1]
    target_variables = df[label].unique()
    values = df[attribute].unique()
    entropy2 = 0
    
    for value in values:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[(df[attribute] == value) & (df[label] == target_variable)])
            den = len(df[df[attribute] == value])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
            
        fraction2 = den/len(df)
        entropy2 += -fraction2*entropy
        
    return abs(entropy2)

In [5]:
def calculateAttributeEntropyCont(df, attribute):
    
    label = df.columns[-1]
    target_variables = df[label].unique()
    values = []
    entropy2 = 0
    
    temp_vals = sorted(list(set(df[attribute])))
    split_points = []
    for i in range(0, len(temp_vals)-1):
        values.append((temp_vals[i+1] + temp_vals[i])/2)
        
    for value in values:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[(df[attribute] < value) & (df[label] == target_variable)])
            den = len(df[df[attribute] < value])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
            
        fraction2 = den/len(df)
        entropy2 += -fraction2*entropy
        
    return abs(entropy2)

In [6]:
def maxEntropy(df):
    informationGain = []
    for attribute in df.columns[:-1]:
        if attribute == 'temp':
            ig = calculateEntropy(df)-calculateAttributeEntropyCont(df, attribute)
        else:
            ig = calculateEntropy(df)-calculateAttributeEntropy(df, attribute)
            
        informationGain.append(ig)
        
    return df.columns[:-1][np.argmax(informationGain)]

In [7]:
def getSubTree(df, node, value):
    return df[df[node] == value].reset_index(drop=True)

In [8]:
def decisionTree(df, tree = None): 
    
    node = maxEntropy(df)    
    values = np.unique(df[node])
    
    # creating dictionary
    if tree == None:
        tree = {}
        tree[node] = {}

    for value in values:        
        subtree = getSubTree(df, node, value)
        for cols in subtree.columns:
            clValue, counts = np.unique(subtree[cols], return_counts=True)

            if len(counts)==1:
                tree[node][value] = clValue[0]
            else:        
                tree[node][value] = decisionTree(subtree)
                   
    return tree

In [9]:
t = decisionTree(df)
pprint(t)

{'temp': {40: 'Yes',
          50: 'No',
          60: 'Yes',
          70: {'outlook': {'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}},
                           'Sunny': 'No'}},
          75: 'Yes',
          80: 'Yes',
          90: {'outlook': {'Overcast': 'Yes', 'Sunny': 'No'}},
          100: 'No'}}
