In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [62]:
def entropy_of(p, base=None):
    """Return the Shannon entropy of a discrete probability distribution.

    Parameters
    ----------
    p : iterable of float
        Probabilities of each outcome.
    base : float, optional
        Logarithm base. ``None`` (the default) uses the natural logarithm,
        giving the result in nats; pass ``2`` for bits.

    Returns
    -------
    float
        ``-sum(p_i * log(p_i))`` over the outcomes with non-zero probability.
    """
    probs = np.asarray(list(p), dtype=float)
    # By convention 0 * log(0) contributes 0; evaluating it directly yields nan.
    nonzero = probs[probs != 0]
    total = np.sum(-nonzero * np.log(nonzero))
    if base is not None:
        # Change of base: log_b(x) = ln(x) / ln(b).
        total = total / np.log(base)
    # Adding 0.0 turns a possible -0.0 (e.g. for p == [1.0]) into plain 0.0.
    return total + 0.0

In [3]:
def probabilities_for(items):
    """Return the relative frequency of each distinct value in ``items``.

    The frequencies are listed in the order in which the distinct values are
    first encountered; the values themselves are not returned.
    """
    tally = Counter(items)
    fractions = []
    for occurrences in tally.values():
        fractions.append(occurrences / len(items))
    return fractions

In [4]:
# Load the tab-separated data set; the file has no header row, so pandas
# assigns integer column labels.
df = pd.read_csv(
    'data/tennis.tsv',
    sep='\t',
    header=None,
)

In [5]:
# Name the columns; the last one ('PlayTennis') is the target label used below.
df.columns = ['Day', 'Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis']

In [6]:
# Class balance of the target, with labels encoded as No -> 0 and Yes -> 1.
df['PlayTennis'].map({'No': 0, 'Yes': 1}).value_counts()

1    9
0    5
Name: PlayTennis, dtype: int64

In [7]:
# Empirical probability of each PlayTennis label.
probabilities_for(df['PlayTennis'])

[0.35714285714285715, 0.6428571428571429]

In [19]:
# Entropy of PlayTennis restricted to the rows where Wind == 'Strong'.
# NOTE(review): the saved output (1.0) is the base-2 value, but entropy_of as
# defined in this notebook uses the natural log (~0.693 here) -- re-run to refresh.
entropy_of(probabilities_for(df.loc[df['Wind'] == 'Strong', 'PlayTennis']))

1.0

In [9]:
# Scratch check: .items() yields (label, count) pairs lazily, so the cell only
# shows the iterator's repr; wrap it in list() to see the pairs.
df['PlayTennis'].value_counts().items()

<zip at 0x7f35b2f16d70>

In [10]:
# Empirical probability of each Wind value.
probabilities_for(df['Wind'])

[0.5714285714285714, 0.42857142857142855]

In [20]:
# Entropy H(D) of the target column over the whole data set.
# NOTE(review): the saved output (0.9403) is in bits, but entropy_of as defined
# in this notebook uses the natural log (~0.6518 nats) -- re-run to refresh.
entropy_of(probabilities_for(df['PlayTennis']))

0.9402859586706311

In [None]:
# IG(D, Wind) = H(D) - [H(D | Wind = Weak) * P(Wind = Weak) + H(D | Wind = Strong) * P(Wind = Strong)]

In [29]:
# Row counts per Wind value; dividing by the number of rows gives P(Wind = v).
df['Wind'].value_counts()

Weak      8
Strong    6
Name: Wind, dtype: int64

In [11]:
def information_gain_for(data, col, out_col):
    """Return the information gain obtained by splitting ``data`` on ``col``.

    Computes the entropy of ``out_col`` over all rows and subtracts the
    weighted average entropy of ``out_col`` within each group of rows sharing
    a value of ``col``. Each group's weight is its share of the rows.

    Parameters
    ----------
    data : pandas.DataFrame-like
        Table supporting ``data[col]`` and ``data.loc[mask, out_col]``.
    col : str
        Name of the attribute column to split on.
    out_col : str
        Name of the target column.

    Returns
    -------
    float
        The gain, in the logarithm units used by ``entropy_of``.
    """
    n_rows = len(data)
    baseline = entropy_of(probabilities_for(data[out_col]))

    conditional = 0
    for value, count in Counter(data[col]).items():
        # Target labels of the rows where the attribute equals this value.
        subset = data.loc[data[col] == value, out_col]
        weight = count / n_rows
        conditional += weight * entropy_of(probabilities_for(subset))

    return baseline - conditional

In [12]:
# Information gain of splitting on Wind with respect to PlayTennis.
# NOTE(review): the saved output (0.0481) is the base-2 value; the loop cell
# further down reports 0.0334 for Wind, the natural-log value -- re-run to refresh.
information_gain_for(df, 'Wind', 'PlayTennis')

0.04812703040826949

In [64]:
# Report the information gain of every column except the last (the target).
# NOTE(review): 'Day' looks like a per-row identifier -- its gain equals the
# full target entropy -- so it is probably not a meaningful split candidate.
for col in df.columns[:-1]:
    gain = information_gain_for(df, col, "PlayTennis")
    print(f'{col} has information gain of {gain}')

Day has information gain of 0.6517565611726531
Outlook has information gain of 0.17103394188032706
Temperature has information gain of 0.02025553899523236
Humidity has information gain of 0.10524434967821283
Wind has information gain of 0.033359115436214726


In [40]:
from scipy.stats import entropy

In [63]:
# Entropy of the Outlook attribute's own value distribution (not of the target).
entropy_of(probabilities_for(df['Outlook']))

1.0933747175566468

In [65]:
# Empirical probability of each Outlook value.
probabilities_for(df['Outlook'])

[0.35714285714285715, 0.2857142857142857, 0.35714285714285715]