In [142]:

import pandas as pd
import numpy as np
from graphviz import Digraph

In [143]:
# URL of the nominal weather ("play golf") dataset CSV that pd.read_csv loads below.
src = "https://saedsayad.com/datasets/weather_nominal.csv"

In [144]:
# Read the CSV at `src` into a DataFrame; `src` is an HTTPS URL, so this needs network access.
golf_data = pd.read_csv(src)

In [145]:
# Show the loaded table to inspect its columns and values.
golf_data

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


In [146]:
from math import log

def entropy(*probs):
  """Calculate the Shannon entropy, in bits, of a discrete distribution.

  Args:
    *probs: Non-negative weights of the outcomes. They may be raw counts
      (e.g. ``entropy(9, 5)``) or probabilities; they are normalised by
      their sum before use.

  Returns:
    The entropy ``-sum(p_i * log2(p_i))`` of the normalised weights, or 0
    when no weights are given or when they sum to zero.

  Raises:
    ValueError: If any weight is negative.
  """
  if any(p < 0 for p in probs):
    raise ValueError("entropy() weights must be non-negative")
  total = sum(probs)
  if total == 0:
    # No observations at all: there is nothing to be uncertain about.
    return 0
  # A zero-weight outcome contributes nothing (p * log p -> 0 as p -> 0), so
  # skip it instead of letting log(0) fail and wipe out the whole result.
  return sum(-p / total * log(p / total, 2) for p in probs if p > 0)

# Sanity checks: an even split gives 1 bit, a 2:1 split gives about 0.918 bits.
sanity_cases = [(100, 100), (2, 1), (2, 2)]
print(*(entropy(*counts) for counts in sanity_cases))

1.0 0.9182958340544896 1.0


In [147]:
# Count how often each target class occurs; these counts are the inputs to the root entropy H(T).
golf_data['Play golf'].value_counts()

Yes    9
No     5
Name: Play golf, dtype: int64

In [148]:
# H(T): entropy of the target before any split, from the class counts (Yes = 9, No = 5).
entropy(9, 5)

0.9402859586706309

In [149]:
# Target class counts within each Outlook value (long format).
golf_data.groupby('Outlook')['Play golf'].value_counts()


Outlook   Play golf
Overcast  Yes          4
Rainy     No           3
          Yes          2
Sunny     Yes          3
          No           2
Name: Play golf, dtype: int64

In [150]:
# The same counts as a contingency table: one row per Outlook value, one column per class.
pd.crosstab(golf_data['Outlook'], golf_data['Play golf'])

Play golf,No,Yes
Outlook,Unnamed: 1_level_1,Unnamed: 2_level_1
Overcast,0,4
Rainy,3,2
Sunny,2,3


In [151]:
# Per-branch entropies in crosstab order (Overcast, Rainy, Sunny); Overcast is pure (0 No / 4 Yes), hence 0.
0, entropy(3, 2), entropy(2, 3)

(0, 0.9709505944546686, 0.9709505944546686)

In [152]:
# Branch sizes; divided by the 14 rows they give the weights P(value) used in the weighted entropy.
golf_data['Outlook'].value_counts()

Rainy       5
Sunny       5
Overcast    4
Name: Outlook, dtype: int64

In [153]:
# H(T, outlook) = P(sunny)*H(sunny) + P(overcast)*H(overcast) + P(rainy)*H(rainy)
sunny_term = 5/14 * entropy(2, 3)
overcast_term = 0  # the overcast branch is pure (0 No / 4 Yes), so it adds no entropy
rainy_term = 5/14 * entropy(3, 2)
H = sunny_term + overcast_term + rainy_term
H

0.6935361388961918

In [154]:
# G(outlook) = H(T) - H(T, outlook)
# Use the values computed above rather than hand-rounded constants, so the
# gain keeps its full precision; round only for display.
G = entropy(9, 5) - H
round(G, 2)

0.25

Start building a tree with the feature with the largest information gain: outlook

A branch with entropy 0 is a leaf node: overcast

Other branches must be split using other features

In [155]:
# Empty directed graph that the following cells grow, edge by edge, into the decision tree.
tree = Digraph()

In [156]:
# Root split: one branch per Outlook value.
for outlook_value in ("sunny", "overcast", "rainy"):
  tree.edge("outlook", outlook_value)

# Overcast is a pure branch (entropy 0), so it ends directly in a leaf.
tree.edge("overcast", "yes")


In [157]:
# Print the tree's DOT source (presumably to paste into a web-based Graphviz renderer).
print(tree)

digraph {
	outlook -> sunny
	outlook -> overcast
	outlook -> rainy
	overcast -> yes
}


Next branch

In [158]:
# Rows in the Sunny branch; it still mixes Yes and No, so it needs a further split.
golf_data.loc[golf_data['Outlook'] == "Sunny"]

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play golf
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
9,Sunny,Mild,Normal,False,Yes
13,Sunny,Mild,High,True,No


In general, one should calculate the information gain of each remaining feature on this subset.
In this case it is clear that we can take windy: it separates the subset perfectly (not windy → Yes, windy → No).

In [159]:
# Sunny branch: split on windy, whose two values lead straight to leaves.
sunny_edges = [
  ("sunny", "windy"),
  ("windy", "false"),
  ("windy", "true"),
  ("false", "yes"),
  ("true", "no"),
]
for parent, child in sunny_edges:
  tree.edge(parent, child)
print(tree)

digraph {
	outlook -> sunny
	outlook -> overcast
	outlook -> rainy
	overcast -> yes
	sunny -> windy
	windy -> false
	windy -> true
	false -> yes
	true -> no
}


In [160]:
# Rows in the Rainy branch; it still mixes Yes and No, so it needs a further split.
golf_data.loc[golf_data['Outlook'] == "Rainy"]

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
10,Rainy,Mild,Normal,True,Yes


In [161]:
# Rainy branch: split on humidity (in this subset High -> No and Normal -> Yes).
rainy_edges = [
  ("rainy", "humidity"),
  ("humidity", "high"),
  ("humidity", "normal"),
  ("normal", "yes"),
  ("high", "no"),
]
for parent, child in rainy_edges:
  tree.edge(parent, child)

print(tree)

digraph {
	outlook -> sunny
	outlook -> overcast
	outlook -> rainy
	overcast -> yes
	sunny -> windy
	windy -> false
	windy -> true
	false -> yes
	true -> no
	rainy -> humidity
	humidity -> high
	humidity -> normal
	normal -> yes
	high -> no
}
