In [1]:
import numpy as np
import pandas as pd

In [2]:
data = {
    "outlook" : ['sunny','sunny','overcast','rainy','rainy','rainy','overcast','sunny','sunny','rainy','sunny','overcast','overcast','rainy'],
    "temp" : ['hot','hot','hot','mild','cool','cool','cool','mild','cool','mild','mild','mild','hot','mild'],
    'humidity' : ['high','high','high','high','normal','normal','normal','high','normal','normal','normal','high','normal','high'],
    'windy' : ['false','true','false','false','false','true','true','false','false','false','true','true','false','true'],
    'play' : ['no','no','yes','yes','yes','no','yes','no','yes','yes','yes','yes','yes','no']
}

In [3]:
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [5]:
pd.value_counts(df['play'])

yes    9
no     5
Name: play, dtype: int64

In [6]:
values = pd.unique(df['play'])
entropy = 0
n = len(df)
for value in values:
    x = df['play'].value_counts()[value] / n
    entropy+= -x * np.log2(x)    


In [7]:
entropy

0.9402859586706311

In [8]:
df['outlook'].unique()

array(['sunny', 'overcast', 'rainy'], dtype=object)

In [9]:
df['outlook'].value_counts()

sunny       5
rainy       5
overcast    4
Name: outlook, dtype: int64

In [10]:
df.groupby('outlook')['play'].value_counts()

outlook   play
overcast  yes     4
rainy     yes     3
          no      2
sunny     no      3
          yes     2
Name: play, dtype: int64

In [11]:
df[df['outlook'] == 'overcast']

Unnamed: 0,outlook,temp,humidity,windy,play
2,overcast,hot,high,False,yes
6,overcast,cool,normal,True,yes
11,overcast,mild,high,True,yes
12,overcast,hot,normal,False,yes


In [12]:
df[df['outlook'] == 'rainy'][df['play'] == 'no']['outlook']

  df[df['outlook'] == 'rainy'][df['play'] == 'no']['outlook']


5     rainy
13    rainy
Name: outlook, dtype: object

In [13]:
len(df[df['outlook'] == 'rainy'][df['play'] == 'no']['outlook'])

  len(df[df['outlook'] == 'rainy'][df['play'] == 'no']['outlook'])


2

In [14]:
def calc_avg(df , feature):
    variables = pd.unique(df[feature])
    target_values = pd.unique(df['play'])
    avg = 0
    for var in variables:
        entropy = 0
        for target in target_values:
            x = len(df[df[feature] == var][df['play'] == target][feature])
            y = len(df[df[feature] == var][feature])
            e = x / (y + np.finfo(float).eps)
            entropy += -e * np.log2(e+np.finfo(float).eps)
        attr  = y / len(df)
        avg += attr * entropy
    return avg
        
    

In [15]:
calc_avg(df, 'outlook')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.6935361388961914

In [16]:
calc_avg(df, 'temp')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.9110633930116756

In [17]:
calc_avg(df,'humidity')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.7884504573082889

In [18]:
calc_avg(df,'windy')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.892158928262361

In [19]:
gain_outlook = entropy - calc_avg(df,'outlook')

  x = len(df[df[feature] == var][df['play'] == target][feature])


In [20]:
gain_outlook

0.24674981977443977

In [21]:
gain_temp = entropy - calc_avg(df,'temp')

  x = len(df[df[feature] == var][df['play'] == target][feature])


In [22]:
gain_temp

0.029222565658955535

In [23]:
gain_hum = entropy - calc_avg(df,'humidity')

  x = len(df[df[feature] == var][df['play'] == target][feature])


In [24]:
gain_hum

0.15183550136234225

In [25]:
gain_wind = entropy - calc_avg(df,'windy')

  x = len(df[df[feature] == var][df['play'] == target][feature])


In [26]:
gain_wind

0.048127030408270155

In [31]:
entropy_attrs = {}
for i in range(len(df.columns) - 1):
    entropy_attrs[df.columns[i]] = calc_avg(df, df.columns[i])

  x = len(df[df[feature] == var][df['play'] == target][feature])


In [33]:
entropy_attrs


{'outlook': 0.6935361388961914,
 'temp': 0.9110633930116756,
 'humidity': 0.7884504573082889,
 'windy': 0.892158928262361}

In [34]:
gain = {}
for key in entropy_attrs:
    gain[key] = entropy - entropy_attrs[key]

In [35]:
gain

{'outlook': 0.24674981977443977,
 'temp': 0.029222565658955535,
 'humidity': 0.15183550136234225,
 'windy': 0.048127030408270155}

In [41]:
max(gain)

'windy'

In [None]:
def calc_avg(df , feature):
    variables = pd.unique(df[feature])
    target_values = pd.unique(df['play'])
    avg = 0
    for var in variables:
        entropy = 0
        for target in target_values:
            x = len(df[df[feature] == var][df['play'] == target][feature])
            y = len(df[df[feature] == var][feature])
            e = x / (y + np.finfo(float).eps)
            entropy += -e * np.log2(e+np.finfo(float).eps)
        attr  = y / len(df)
        avg += attr * entropy
    return avg
 
    
def calc_entropy():
    values = pd.unique(df['play'])
    entropy = 0
    n = len(df)
    for value in values:
        x = df['play'].value_counts()[value] / n
        entropy+= -x * np.log2(x)    
    return entropy



       

In [42]:
max(gain , key = gain.get)

'outlook'

In [45]:
node = 'outlook'
val = 'overcast'

df[df[node] == val].reset_index(drop=True)

Unnamed: 0,outlook,temp,humidity,windy,play
0,overcast,hot,high,False,yes
1,overcast,cool,normal,True,yes
2,overcast,mild,high,True,yes
3,overcast,hot,normal,False,yes
