In [74]:
import pandas as pd
import numpy as np

In [75]:
# Load the training table from data.csv (path is relative to the working directory)
# and print every row so the full data set is visible in the notebook.
df = pd.read_csv('data.csv')
print(df)

        age  income student credit_rating buys_computer
0      <=30    high      no          fair            no
1      <=30    high      no     excellent            no
2   31...40    high      no          fair           yes
3       >40  medium      no          fair           yes
4       >40     low     yes          fair           yes
5       >40     low     yes     excellent            no
6   31...40     low     yes     excellent           yes
7      <=30  medium      no          fair            no
8      <=30     low     yes          fair           yes
9       >40  medium     yes          fair           yes
10     <=30  medium     yes     excellent           yes
11  31...40  medium      no     excellent           yes
12  31...40    high     yes          fair           yes
13      >40  medium      no     excellent            no


In [76]:
def entropy(df: pd.DataFrame):
    """Return the base-2 Shannon entropy of the values found in ``df``.

    Each distinct value is counted with ``np.unique`` and the counts are
    turned into relative frequencies ``p``; the result is ``sum(-p * log2(p))``.

    Args:
        df: The values to measure. The annotation says ``DataFrame``, but this
            notebook calls the function with a single column (a Series);
            ``np.unique`` flattens whatever it receives.

    Returns:
        The entropy in bits as a NumPy float (``0.0`` for empty input).
    """
    _, frequencies = np.unique(df, return_counts=True)
    probabilities = frequencies / np.sum(frequencies)
    return np.sum(-probabilities * np.log2(probabilities))

# Entropy of the buys_computer column over the whole data set.
entropy(df['buys_computer'])

0.9402859586706311

In [77]:
def information_gain(df: pd.DataFrame, split_attribute_column, target_column):
    """Return the information gain from splitting ``df`` on one attribute.

    The gain is the entropy of ``target_column`` over all rows minus the
    weighted average entropy of ``target_column`` inside each partition
    induced by the distinct values of ``split_attribute_column``.

    Args:
        df: Table holding both the split attribute and the target.
        split_attribute_column: Name of the column to split on.
        target_column: Name of the column whose entropy is measured.

    Returns:
        The information gain in bits as a NumPy float.

    Raises:
        KeyError: If either column name is not present in ``df``.
    """
    total_entropy = entropy(df[target_column])
    values, counts = np.unique(df[split_attribute_column], return_counts=True)
    total_count = np.sum(counts)

    weighted_entropies = []
    for value, count in zip(values, counts):
        # Select the partition with a boolean mask. The former
        # ``df.where(mask).dropna()`` also discarded rows that had a missing
        # value in *any* other column, so the partition no longer matched the
        # ``count`` used for its weight and the gain came out wrong.
        partition_targets = df.loc[df[split_attribute_column] == value, target_column]
        weighted_entropies.append((count / total_count) * entropy(partition_targets))

    return total_entropy - np.sum(weighted_entropies)

# Information gain obtained by splitting on 'age' with buys_computer as the target.
information_gain(df, 'age', 'buys_computer')

0.24674981977443933

In [78]:
def get_best_feature(df, target_column):
    """Return the feature with the highest information gain and that gain.

    Every column of ``df`` except ``target_column`` is treated as a candidate
    feature. Ties are resolved in favour of the first column in ``df`` order
    (``np.argmax`` returns the first maximum).

    Args:
        df: Table holding the candidate features and the target.
        target_column: Name of the column to predict.

    Returns:
        A ``(feature_name, information_gain)`` tuple.

    Raises:
        ValueError: If ``df`` has no column other than ``target_column``.
    """
    # Exclude the target by name rather than by position: slicing off the last
    # column silently scored the target as a feature (and skipped a real one)
    # whenever the target was not the final column.
    features = [column for column in df.columns if column != target_column]
    if not features:
        raise ValueError(f"no feature columns besides target {target_column!r}")

    information_gains = [information_gain(df, feature, target_column) for feature in features]
    highest_gain_index = np.argmax(information_gains)
    return features[highest_gain_index], information_gains[highest_gain_index]

# Attribute with the highest information gain for buys_computer, with its gain.
get_best_feature(df, 'buys_computer')

('age', 0.24674981977443933)

In [79]:
# Split the table on the highest-gain attribute and print each partition.
best_feature, best_feature_score = get_best_feature(df, 'buys_computer')
print(df[best_feature])
for value in np.unique(df[best_feature]):
    # Boolean indexing keeps every matching row as-is; where().dropna() would
    # also drop rows that have a missing value in any other column.
    print(df[df[best_feature] == value])

0        <=30
1        <=30
2     31...40
3         >40
4         >40
5         >40
6     31...40
7        <=30
8        <=30
9         >40
10       <=30
11    31...40
12    31...40
13        >40
Name: age, dtype: object
        age  income student credit_rating buys_computer
2   31...40    high      no          fair           yes
6   31...40     low     yes     excellent           yes
11  31...40  medium      no     excellent           yes
12  31...40    high     yes          fair           yes
     age  income student credit_rating buys_computer
0   <=30    high      no          fair            no
1   <=30    high      no     excellent            no
7   <=30  medium      no          fair            no
8   <=30     low     yes          fair           yes
10  <=30  medium     yes     excellent           yes
    age  income student credit_rating buys_computer
3   >40  medium      no          fair           yes
4   >40     low     yes          fair           yes
5   >40     low     yes  