In [1]:
import numpy as np
import pandas as pd

In [26]:
# Classic "Play Tennis" toy dataset: one tuple per observed day, with the
# fields in the same order as `weather_columns`. "Play" is the class label.
weather_columns = ('Outlook', 'Temperature', 'Humidity', 'Wind', 'Play')
weather_rows = [
    ('Sunny',    'Hot',  'High',   'Weak',   'No'),
    ('Sunny',    'Hot',  'High',   'Strong', 'No'),
    ('Overcast', 'Hot',  'High',   'Weak',   'Yes'),
    ('Rain',     'Mild', 'High',   'Weak',   'Yes'),
    ('Rain',     'Cool', 'Normal', 'Weak',   'Yes'),
    ('Rain',     'Cool', 'Normal', 'Strong', 'No'),
    ('Overcast', 'Cool', 'Normal', 'Strong', 'Yes'),
    ('Sunny',    'Mild', 'High',   'Weak',   'No'),
    ('Sunny',    'Cool', 'Normal', 'Weak',   'Yes'),
    ('Rain',     'Mild', 'Normal', 'Weak',   'Yes'),
    ('Sunny',    'Mild', 'Normal', 'Strong', 'Yes'),
    ('Overcast', 'Mild', 'High',   'Strong', 'Yes'),
    ('Overcast', 'Hot',  'Normal', 'Weak',   'Yes'),
    ('Rain',     'Mild', 'High',   'Strong', 'No'),
]

# Transpose the row-wise records into a column-name -> list-of-values mapping.
data = {
    column: list(values)
    for column, values in zip(weather_columns, zip(*weather_rows))
}

df = pd.DataFrame(data)



In [27]:
def entropy(target_col):
    """Return the Shannon entropy (in bits) of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Class labels; anything ``np.unique`` accepts.

    Returns
    -------
    numpy.float64
        ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. An empty input yields ``0.0``.
    """
    _, class_counts = np.unique(target_col, return_counts=True)
    # Relative frequency of each distinct label.
    probabilities = class_counts / class_counts.sum()
    return np.sum(-probabilities * np.log2(probabilities))

In [28]:
def info_gain(data, split_attribute_name, target_attribute_name):
    """Return the information gain from splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    of the target entropy inside each partition induced by the distinct
    values of ``split_attribute_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the split column and the target column.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_attribute_name : str
        Column holding the class labels.

    Returns
    -------
    numpy.float64
        ``entropy(target) - sum(weight_v * entropy(target | split == v))``.
    """
    split_column = data[split_attribute_name]
    target_column = data[target_attribute_name]

    total_entropy = entropy(target_column)
    vals, counts = np.unique(split_column, return_counts=True)
    weights = counts / counts.sum()

    # Select each partition with a boolean mask on the target column only.
    # Going through ``data.where(...).dropna()`` would also discard rows that
    # have a missing value in any unrelated column, so the per-partition
    # entropies would no longer match the weights computed from ``counts``.
    weighted_entropy = np.sum([
        weight * entropy(target_column[split_column == val])
        for val, weight in zip(vals, weights)
    ])
    return total_entropy - weighted_entropy

In [29]:
def build_tree(data, originaldata, features, target_attribute_name="Play", parent_node_class=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows belonging to the current node.
    originaldata : pandas.DataFrame
        Fallback rows used to pick a label when ``data`` is empty.
    features : sequence of str
        Candidate column names still available for splitting.
    target_attribute_name : str, default "Play"
        Column holding the class labels.
    parent_node_class : object, default None
        Majority label of the parent node; returned when no features remain.

    Returns
    -------
    dict or label
        Either a leaf label, or ``{feature: {value: subtree_or_label, ...}}``
        for the feature with the highest information gain.
    """
    # The empty case has to be tested first: an empty target column has zero
    # distinct values, so a purity check placed ahead of it would match and
    # then fail with IndexError when indexing the first unique label.
    if len(data) == 0:
        if len(originaldata) == 0:
            return parent_node_class
        fallback_vals, fallback_counts = np.unique(
            originaldata[target_attribute_name], return_counts=True
        )
        return fallback_vals[np.argmax(fallback_counts)]

    unique_vals, counts = np.unique(data[target_attribute_name], return_counts=True)

    # Pure node: every row carries the same label.
    if len(unique_vals) <= 1:
        return unique_vals[0]

    # Nothing left to split on: fall back to the label handed down by the caller.
    if len(features) == 0:
        return parent_node_class

    # Majority label of this node, passed down to the children.
    parent_node_class = unique_vals[np.argmax(counts)]

    item_values = [info_gain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(item_values)]

    tree = {best_feature: {}}
    remaining_features = [feature for feature in features if feature != best_feature]

    for value in np.unique(data[best_feature]):
        # Boolean-mask selection keeps rows with missing values in other
        # columns; ``where(...).dropna()`` would silently discard them.
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = build_tree(
            sub_data, data, remaining_features, target_attribute_name, parent_node_class
        )

    return tree

In [30]:
# Every column except the last one (the "Play" label) is a candidate split attribute.
features = df.columns[:-1]
tree = build_tree(
    data=df,
    originaldata=df,
    features=features,
    target_attribute_name="Play",
)
# Banner and tree are emitted on separate lines, exactly as two print calls would.
print("\n===== HASIL FINAL TREE =====\n", tree, sep="\n")


===== HASIL FINAL TREE =====

{'Outlook': {'Overcast': 'Yes', 'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}}, 'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}
