In [1]:
import pandas as pd
import math

In [2]:
# Load the golf data set and discard the 'Day' column in a single chain,
# then preview the first rows of the resulting frame.
df = pd.read_csv('golf.csv').drop(columns='Day')
df.head()

Unnamed: 0,Temperature,Outlook,Humidity,Windy,Golf
0,hot,sunny,high,False,no
1,hot,sunny,high,True,no
2,hot,overcast,high,False,yes
3,cool,rain,normal,False,yes
4,cool,overcast,normal,True,yes


In [3]:
def countData(df,column):
    """Count the entries of a label column and how many of them are 'yes'.

    Parameters:
        df: DataFrame holding the data.
        column: Name of the column to inspect.

    Returns:
        A tuple (countAll, countAllYes): the number of non-null entries in
        ``df[column]`` and the number of entries equal to the string 'yes'.
    """
    labels = df[column]
    isYes = labels == 'yes'
    # Series.count() ignores missing values; summing the boolean mask counts
    # the matching rows.
    return labels.count(), isYes.sum()

# Calculate Information Gain

In [4]:
# a => Number of all objects; p => Number of true objects
def entropy (a,p):
    """Return the binary entropy (in bits) of a set with two classes.

    Parameters:
        a: Number of all objects in the set.
        p: Number of objects in the set that belong to the "true" class.

    Returns:
        The entropy -(p/a)*log2(p/a) - (q/a)*log2(q/a) with q = a - p.
        A pure set (p == 0 or p == a, which includes the empty set
        a == p == 0) has entropy 0.0.
    """
    # By convention 0*log(0) is 0, so a set containing only one class has no
    # entropy. Without this guard math.log would be called with 0 and raise
    # a ValueError when p == 0.
    if p == 0 or a == p:
        return 0.0
    q = a - p
    return (-(p/a))*math.log((p/a),2)-(q/a)*math.log((q/a),2)

In [5]:
def informationGain (df, column, attributes, label):
    """Return the information gain obtained by splitting ``df`` on ``column``.

    The gain is the entropy of the label over the whole frame minus the
    weighted entropies of the label within each subset selected by one of
    the feature values.

    Parameters:
        df: DataFrame holding the data.
        column: Name of the feature column to split on.
        attributes: Iterable of feature values to consider. Pass ``None`` to
            use the distinct non-null values found in ``df[column]``.
        label: Name of the label column (evaluated via ``countData``).

    Returns:
        The information gain as a float; 0.0 when ``df`` has no labelled rows.
    """
    total, totalYes = countData(df, label)
    # Nothing to split: avoid dividing by a zero total below.
    if total == 0:
        return 0.0

    if attributes is None:
        attributes = df[column].dropna().unique()

    gain = entropy(total, totalYes)
    for a in attributes:
        # Filter the rows having this feature value and count their labels.
        subsetCount, subsetYes = countData(df[df[column] == a], label)
        # A value without matching rows has weight 0 and contributes nothing.
        if subsetCount == 0:
            continue
        gain = gain-(subsetCount/total)*entropy(subsetCount, subsetYes)
    return gain

In [6]:
# Print the information gain of every candidate feature with respect to the
# 'Golf' label. The possible values of each feature are listed by hand here;
# it would be ideal if they were also determined automatically from the data.
print("Gain(Outlook): ", informationGain(df, 'Outlook', ['rain', 'sunny', 'overcast'], 'Golf'))
print("Gain(Temperature): ", informationGain(df, 'Temperature', ['hot', 'cool', 'mild'], 'Golf'))
print("Gain(Humidity): ", informationGain(df, 'Humidity', ['high', 'normal'], 'Golf'))
print("Gain(Windy): ", informationGain(df, 'Windy', [True, False], 'Golf'))

Gain(Outlook):  0.2467498197744391
Gain(Temperature):  0.029222565658954647
Gain(Humidity):  0.15183550136234136
Gain(Windy):  0.04812703040826927


##### Choose the feature with the highest information gain => Outlook 

# Calculate Gini Impurity

In [7]:
def impurity (a, p):
    """Return the Gini impurity of a set with two classes.

    Parameters:
        a: Number of all objects in the set.
        p: Number of objects in the set that belong to the "true" class.

    Returns:
        (p/a)*(1-(p/a)) + (q/a)*(1-(q/a)) with q = a - p, or 0.0 for an
        empty set (a == 0).
    """
    # An empty set has nothing to be impure about; returning early also
    # avoids the division by zero (or NaN with numpy integers) below.
    if a == 0:
        return 0.0
    q = a - p
    return (p/a)*(1-(p/a))+(q/a)*(1-(q/a))

In [8]:
def giniGain (df, column, attributes, label):
    """Return the Gini gain obtained by splitting ``df`` on ``column``.

    The gain is the Gini impurity of the label over the whole frame minus
    the weighted impurities of the label within each subset selected by one
    of the feature values.

    Parameters:
        df: DataFrame holding the data.
        column: Name of the feature column to split on.
        attributes: Iterable of feature values to consider. Pass ``None`` to
            use the distinct non-null values found in ``df[column]``.
        label: Name of the label column (evaluated via ``countData``).

    Returns:
        The Gini gain as a float; 0.0 when ``df`` has no labelled rows.
    """
    total, totalYes = countData(df, label)
    # Nothing to split: avoid dividing by a zero total below.
    if total == 0:
        return 0.0

    if attributes is None:
        attributes = df[column].dropna().unique()

    gain = impurity(total, totalYes)
    for a in attributes:
        # Filter the rows having this feature value and count their labels.
        subsetCount, subsetYes = countData(df[df[column] == a], label)
        # A value without matching rows has weight 0; skipping it keeps an
        # undefined impurity of the empty subset from turning the gain
        # into NaN.
        if subsetCount == 0:
            continue
        gain = gain-(subsetCount/total)*impurity(subsetCount, subsetYes)
    return gain

In [9]:
# Print the Gini gain of every candidate feature with respect to the 'Golf'
# label. The possible values of each feature are listed by hand here; it
# would be ideal if they were also determined automatically from the data.
print("Gain(Outlook): ", giniGain(df, 'Outlook', ['rain', 'sunny', 'overcast'], 'Golf'))
print("Gain(Temperature): ", giniGain(df, 'Temperature', ['hot', 'cool', 'mild'], 'Golf'))
print("Gain(Humidity): ", giniGain(df, 'Humidity', ['high', 'normal'], 'Golf'))
print("Gain(Windy): ", giniGain(df, 'Windy', [True, False], 'Golf'))

Gain(Outlook):  0.11632653061224485
Gain(Temperature):  0.018707482993197244
Gain(Humidity):  0.09183673469387749
Gain(Windy):  0.030612244897959162


##### Choose the feature with the highest Gini gain => Outlook 