In [2]:
import numpy as np
import pandas as pd

In [3]:
sample_data = pd.read_csv('sample_data.csv')
sample_data.shape

(1000, 4)

In [4]:
sample_data.head()

Unnamed: 0,Gender,Age_less_35,JobRole,Attrition
0,Male,True,Laboratory Technician,0
1,Male,False,Sales Executive,1
2,Male,True,Sales Representative,1
3,Female,False,Healthcare Representative,0
4,Male,True,Sales Executive,0


In [6]:
sample_data['Attrition'].value_counts(normalize=True)

0    0.831
1    0.169
Name: Attrition, dtype: float64

In [8]:
-(0.831*np.log2(0.831) + 0.169*np.log2(0.169))

0.6554120818442417

In [19]:
def entropy(y):
    '''
    y should be pandas series.
    '''
    
    p = y.value_counts(normalize=True)    
    ent = np.sum(p*np.log2(p + 1e-6))
    return - ent

In [48]:
def gini(y):
    '''
    y should be pandas series.
    '''
    
    p = y.value_counts(normalize=True)    
    gini = 1- np.sum(p**2)
    return gini

In [20]:
entropy(sample_data['Attrition'])

0.6554091964592962

In [21]:
sample_data['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [28]:
entropy(sample_data['Attrition'][sample_data['Gender'] =='Male'])

0.6568511990625896

In [29]:
def weighted_entropy(feature, y):
    
    '''
    feature -> gender/age<35 yes or no
    y -> series 
    '''
    
    categories = feature.unique()
    
    weighted_ent = 0
    
    for category in categories:
        y_category = y[feature == category]
        
        entropy_category = entropy(y_category)
        
        weighted_ent += (len(y_category)/len(y))* entropy_category
        
    return weighted_ent

In [31]:
weighted_entropy(sample_data['Gender'], sample_data['Attrition'])

0.6554058987688518

In [32]:
weighted_entropy(sample_data['Age_less_35'], sample_data['Attrition'])

0.6339461315402246

In [33]:
def information_gain(feature, y):
    parent = entropy(y)
    child_entropy = weighted_entropy(feature, y)

    ig = parent - child_entropy
    
    return ig

In [34]:
information_gain(sample_data['Gender'], sample_data['Attrition'])

3.2976904443815513e-06

In [35]:
information_gain(sample_data['Age_less_35'], sample_data['Attrition'])

0.021463064919071573

In [43]:
for feature in sample_data.columns[:-1]:
    print(f'Information Gain for feature {feature} is :\
    {information_gain(sample_data[feature],sample_data.Attrition).round(5)}')
    
    

Information Gain for feature Gender is :    0.0
Information Gain for feature Age_less_35 is :    0.02146
Information Gain for feature JobRole is :    0.0543


In [45]:
1- (0.5**2 + 0.5**2)

0.5

In [46]:
1- (0**2 + 1**2)

0

In [47]:
1- (0.75**2 + 0.25**2)

0.375