## Finding the root node in decision tree classification based on Entropy

In [23]:
import numpy as np
import pandas as pd

In [24]:
# Load the PlayTennis dataset from a CSV file (path relative to the notebook's working directory).
df = pd.read_csv("data/PlayTennis.csv")
# Bare last expression: the notebook renders the whole frame as the cell output.
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [46]:
# Entropy of the target column "play" over the whole dataset:
#   H = -sum(p * log2(p)) over every label present in the column.
n_samples = len(df)

# Relative frequency of each label. value_counts() orders its result by
# frequency, not by label, so the probabilities are looked up by name instead
# of being unpacked positionally (which would mislabel them whenever "no" is
# the majority class and fail outright unless there are exactly two labels).
class_probs = df["play"].value_counts() / n_samples
p_yes = float(class_probs.get("yes", 0.0))
p_no = float(class_probs.get("no", 0.0))

# Keep only labels that actually occur so log2(0) can never inject a NaN
# (zero counts can show up when the column has a categorical dtype).
observed_probs = class_probs[class_probs > 0]

# "+ 0.0" normalises the -0.0 produced by a single-class column to 0.0.
total_entropy = float(-(observed_probs * np.log2(observed_probs)).sum()) + 0.0
print(f"total entropy is {total_entropy:.3f}")

total entropy is 0.940


In [47]:
# Information gain of every feature with respect to the target column "play".
# For each feature: gain = total_entropy - (weighted entropy of "play" within
# each distinct value of the feature).
# info_gain maps column name -> (weighted entropy after the split, gain).
info_gain = {}
for column in df.columns[:-1]:  # every column except the last one ("play", the target)
    sum_entropy = 0
    for value in df[column].unique():
        # Target labels of the rows where this feature takes the current value.
        labels = df.loc[df[column] == value, "play"]
        n = len(labels)
        n_yes = int((labels == "yes").sum())
        n_no = int((labels == "no").sum())
        p_yes = n_yes / n
        p_no = n_no / n
        if (p_no < 1e-9) or (p_yes < 1e-9):
            # Pure subset: entropy is 0, and skipping the formula avoids log2(0).
            entropy = 0
        else:
            entropy = -p_yes * np.log2(p_yes) - p_no * np.log2(p_no)
        # Weight the subset's entropy by the share of samples it covers.
        sum_entropy += entropy * (n / n_samples)
    gain = total_entropy - sum_entropy
    info_gain[column] = (sum_entropy, gain)

for k, v in info_gain.items():
    print(f"{k:10}: info: {v[0]:.3f}  |  gain: {v[1]:.3f}")

# Iterating a dict yields its keys, so the key function must look the gain up
# in info_gain; indexing the key itself (x[1]) would compare the second
# character of each column name instead of the gains.
root_node = max(info_gain, key=lambda col: info_gain[col][1])
max_gain = info_gain[root_node][1]
print(f"\nOur root node is {root_node!r} with max gain {max_gain:.3f}")

outlook   : info: 0.694  |  gain: 0.247
temp      : info: 0.911  |  gain: 0.029
humidity  : info: 0.788  |  gain: 0.152
windy     : info: 0.892  |  gain: 0.048

Our root node is 'outlook' with max gain 0.247
